aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexis Engelke <engelke@in.tum.de>2025-12-22 18:37:58 +0000
committerAlexis Engelke <engelke@in.tum.de>2025-12-22 18:37:58 +0000
commitc60e63f3a30f356904e4085b631c1e2ed059b7a0 (patch)
tree6fc55ac669c0f075243853d09d36536af799b349
parent5889bee0925c32e508b7817e36e379c79cefba2f (diff)
parentc3678c4165b554a2908dd7571c6373dc8142587d (diff)
downloadllvm-users/aengelke/spr/main.clang-invoke-pass-plugin-precodegencallback.zip
llvm-users/aengelke/spr/main.clang-invoke-pass-plugin-precodegencallback.tar.gz
llvm-users/aengelke/spr/main.clang-invoke-pass-plugin-precodegencallback.tar.bz2
[𝘀𝗽𝗿] changes introduced through rebaseusers/aengelke/spr/main.clang-invoke-pass-plugin-precodegencallback
Created using spr 1.3.5-bogner [skip ci]
-rw-r--r--clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h9
-rw-r--r--clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h36
-rw-r--r--clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h5
-rw-r--r--clang/include/clang/Basic/DiagnosticFrontendKinds.td4
-rw-r--r--clang/include/clang/Sema/AnalysisBasedWarnings.h8
-rw-r--r--clang/lib/Analysis/LifetimeSafety/CMakeLists.txt1
-rw-r--r--clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp15
-rw-r--r--clang/lib/Analysis/LifetimeSafety/LifetimeStats.cpp38
-rw-r--r--clang/lib/Analysis/LifetimeSafety/Origins.cpp38
-rw-r--r--clang/lib/CodeGen/BackendUtil.cpp2
-rw-r--r--clang/lib/CodeGen/CMakeLists.txt1
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.cpp4
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp23
-rw-r--r--clang/lib/Frontend/CMakeLists.txt2
-rw-r--r--clang/lib/Frontend/CompilerInstance.cpp2
-rw-r--r--clang/lib/Sema/AnalysisBasedWarnings.cpp4
-rw-r--r--clang/test/CodeGen/AArch64/fmv-unreachable-version.c89
-rw-r--r--clang/test/Sema/warn-lifetime-safety-missing-origin-stats.cpp5
-rw-r--r--clang/tools/clang-linker-wrapper/CMakeLists.txt2
-rw-r--r--clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp2
-rw-r--r--flang/docs/DebugGeneration.md4
-rw-r--r--flang/lib/Frontend/CMakeLists.txt1
-rw-r--r--flang/lib/Frontend/FrontendActions.cpp2
-rw-r--r--libc/cmake/modules/LLVMLibCCompileOptionRules.cmake8
-rw-r--r--libc/config/config.json6
-rw-r--r--libc/docs/configure.rst3
-rw-r--r--libc/shared/libc_common.h5
-rw-r--r--libc/src/__support/FPUtil/FEnvImpl.h47
-rw-r--r--libc/src/fenv/feclearexcept.cpp2
-rw-r--r--libc/src/fenv/fedisableexcept.cpp2
-rw-r--r--libc/src/fenv/feenableexcept.cpp2
-rw-r--r--libc/src/fenv/fegetenv.cpp2
-rw-r--r--libc/src/fenv/fegetexcept.cpp2
-rw-r--r--libc/src/fenv/fegetexceptflag.cpp2
-rw-r--r--libc/src/fenv/fegetround.cpp2
-rw-r--r--libc/src/fenv/feholdexcept.cpp2
-rw-r--r--libc/src/fenv/feraiseexcept.cpp2
-rw-r--r--libc/src/fenv/fesetenv.cpp2
-rw-r--r--libc/src/fenv/fesetexcept.cpp2
-rw-r--r--libc/src/fenv/fesetexceptflag.cpp2
-rw-r--r--libc/src/fenv/fesetround.cpp2
-rw-r--r--libc/src/fenv/fetestexcept.cpp2
-rw-r--r--libc/src/fenv/fetestexceptflag.cpp2
-rw-r--r--libc/src/fenv/feupdateenv.cpp2
-rw-r--r--libc/test/UnitTest/FEnvSafeTest.cpp4
-rw-r--r--libc/test/UnitTest/FPExceptMatcher.cpp2
-rw-r--r--libc/test/UnitTest/FPMatcher.h2
-rw-r--r--libc/test/UnitTest/RoundingModeUtils.cpp2
-rw-r--r--libc/test/src/fenv/enabled_exceptions_test.cpp2
-rw-r--r--libc/test/src/fenv/exception_flags_test.cpp2
-rw-r--r--libc/test/src/fenv/exception_status_test.cpp2
-rw-r--r--libc/test/src/fenv/feclearexcept_test.cpp2
-rw-r--r--libc/test/src/fenv/feholdexcept_test.cpp2
-rw-r--r--libc/test/src/fenv/feupdateenv_test.cpp2
-rw-r--r--libc/test/src/fenv/getenv_and_setenv_test.cpp2
-rw-r--r--libc/test/src/math/RIntTest.h2
-rw-r--r--libc/test/src/math/RoundToIntegerTest.h2
-rw-r--r--libc/test/src/math/smoke/CanonicalizeTest.h2
-rw-r--r--libc/test/src/math/smoke/FModTest.h2
-rw-r--r--libc/test/src/math/smoke/NearbyIntTest.h2
-rw-r--r--libc/test/src/math/smoke/NextAfterTest.h2
-rw-r--r--libc/test/src/math/smoke/NextTowardTest.h2
-rw-r--r--libc/test/src/math/smoke/RIntTest.h2
-rw-r--r--libc/test/src/math/smoke/RoundToIntegerTest.h2
-rw-r--r--lld/ELF/Arch/AArch64.cpp32
-rw-r--r--lld/ELF/InputSection.cpp2
-rw-r--r--lld/ELF/Relocations.cpp15
-rw-r--r--lld/ELF/Relocations.h1
-rw-r--r--lld/ELF/SyntheticSections.h2
-rw-r--r--lld/ELF/Writer.cpp3
-rw-r--r--lldb/include/lldb/Target/Process.h3
-rw-r--r--lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp31
-rw-r--r--lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h4
-rw-r--r--lldb/source/Target/Process.cpp58
-rw-r--r--lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py1
-rw-r--r--lldb/unittests/Target/MemoryTest.cpp73
-rw-r--r--llvm/docs/LangRef.rst17
-rw-r--r--llvm/examples/Bye/Bye.cpp2
-rw-r--r--llvm/examples/IRTransforms/SimplifyCFG.cpp2
-rw-r--r--llvm/include/llvm/AsmParser/AsmParserContext.h21
-rw-r--r--llvm/include/llvm/AsmParser/FileLoc.h5
-rw-r--r--llvm/include/llvm/CodeGen/ISDOpcodes.h12
-rw-r--r--llvm/include/llvm/IR/Instruction.h6
-rw-r--r--llvm/include/llvm/IR/IntrinsicInst.h6
-rw-r--r--llvm/include/llvm/Plugins/PassPlugin.h (renamed from llvm/include/llvm/Extensions/PassPlugin.h)6
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td4
-rw-r--r--llvm/lib/AsmParser/AsmParserContext.cpp42
-rw-r--r--llvm/lib/CMakeLists.txt1
-rw-r--r--llvm/lib/CodeGen/GlobalMerge.cpp16
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp6
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp9
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp24
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp3
-rw-r--r--llvm/lib/CodeGen/StackProtector.cpp5
-rw-r--r--llvm/lib/Extensions/CMakeLists.txt1
-rw-r--r--llvm/lib/Extensions/Extensions.cpp2
-rw-r--r--llvm/lib/IR/Instruction.cpp7
-rw-r--r--llvm/lib/LTO/CMakeLists.txt1
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp2
-rw-r--r--llvm/lib/Plugins/CMakeLists.txt6
-rw-r--r--llvm/lib/Plugins/PassPlugin.cpp (renamed from llvm/lib/Extensions/PassPlugin.cpp)2
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp3
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td68
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp143
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp26
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp28
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp80
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp46
-rw-r--r--llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/sched-movprfx.ll3
-rw-r--r--llvm/test/CodeGen/AArch64/stack-protector-metadata.ll55
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll14
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll79
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll319
-rw-r--r--llvm/test/CodeGen/AArch64/sve-llrint.ll387
-rw-r--r--llvm/test/CodeGen/AArch64/sve-lrint.ll387
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll17
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll60
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll77
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll75
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll720
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll95
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll362
-rw-r--r--llvm/test/CodeGen/AMDGPU/rsq.f64.ll7729
-rw-r--r--llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi-def.mir7
-rw-r--r--llvm/test/CodeGen/X86/fma-fneg-combine-3.ll100
-rw-r--r--llvm/test/CodeGen/X86/pr172046.ll43
-rw-r--r--llvm/test/CodeGen/X86/sshl_sat.ll2
-rw-r--r--llvm/test/CodeGen/X86/ushl_sat.ll2
-rw-r--r--llvm/test/Transforms/GlobalMerge/global-merge-comdat.ll7
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll11
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll22
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll35
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll3
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll15
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll19
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll12
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll24
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll44
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll128
-rw-r--r--llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll11
-rw-r--r--llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll138
-rw-r--r--llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll72
-rw-r--r--llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll40
-rw-r--r--llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll37
-rw-r--r--llvm/tools/bugpoint/CMakeLists.txt1
-rw-r--r--llvm/tools/bugpoint/bugpoint.cpp2
-rw-r--r--llvm/tools/llc/CMakeLists.txt2
-rw-r--r--llvm/tools/llc/llc.cpp2
-rw-r--r--llvm/tools/llvm-lto2/CMakeLists.txt2
-rw-r--r--llvm/tools/llvm-lto2/llvm-lto2.cpp2
-rw-r--r--llvm/tools/opt/CMakeLists.txt1
-rw-r--r--llvm/tools/opt/NewPMDriver.cpp2
-rw-r--r--llvm/tools/opt/optdriver.cpp2
-rw-r--r--llvm/unittests/Analysis/CMakeLists.txt2
-rw-r--r--llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp2
-rw-r--r--llvm/unittests/Analysis/InlineOrderPlugin/InlineOrderPlugin.cpp2
-rw-r--r--llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp2
-rw-r--r--llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp2
-rw-r--r--llvm/unittests/AsmParser/AsmParserTest.cpp6
-rw-r--r--llvm/unittests/Passes/Plugins/CMakeLists.txt2
-rw-r--r--llvm/unittests/Passes/Plugins/DoublerPlugin/DoublerPlugin.cpp2
-rw-r--r--llvm/unittests/Passes/Plugins/PluginsTest.cpp2
-rw-r--r--llvm/unittests/Passes/Plugins/TestPlugin/TestPlugin.cpp2
-rw-r--r--llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/test/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/tools/clang-linker-wrapper/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Extensions/BUILD.gn5
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/bugpoint/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llvm-lto2/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/opt/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Passes/Plugins/BUILD.gn1
-rw-r--r--mlir/include/mlir/Analysis/DataFlowFramework.h8
-rw-r--r--mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp23
-rw-r--r--mlir/test/Dialect/MemRef/canonicalize.mlir41
-rw-r--r--mlir/test/Integration/GPU/CUDA/dump-ptx.mlir4
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir8
-rw-r--r--polly/lib/CMakeLists.txt2
-rw-r--r--polly/lib/Plugin/Polly.cpp2
-rw-r--r--polly/lib/Support/RegisterPasses.cpp2
198 files changed, 8007 insertions, 4679 deletions
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
index 53caaa4..9c91355 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h
@@ -21,8 +21,10 @@
#define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H
#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
#include "clang/Analysis/AnalysisDeclContext.h"
namespace clang::lifetimes {
@@ -62,9 +64,14 @@ public:
/// The main entry point for the analysis.
void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC,
- LifetimeSafetyReporter *Reporter);
+ LifetimeSafetyReporter *Reporter,
+ LifetimeSafetyStats &Stats, bool CollectStats);
namespace internal {
+
+void collectLifetimeStats(AnalysisDeclContext &AC, OriginManager &OM,
+ LifetimeSafetyStats &Stats);
+
/// An object to hold the factories for immutable collections, ensuring
/// that all created states share the same underlying memory management.
struct LifetimeFactory {
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h
new file mode 100644
index 0000000..0fae030
--- /dev/null
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h
@@ -0,0 +1,36 @@
+//===- LifetimeStats.h - Lifetime Safety Statistics -------------*- C++-* -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the data structures and utility function for collection of
+// statistics related to Lifetime Safety analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_LIFETIMESTATS_H
+#define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_LIFETIMESTATS_H
+
+#include "clang/AST/TypeBase.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace clang::lifetimes {
+/// A structure to hold the statistics related to LifetimeAnalysis.
+/// These are accumulated across all analyzed functions and printed
+/// when -print-stats is enabled.
+struct LifetimeSafetyStats {
+ /// A map from `StmtClassName` to their missing origin counts.
+ llvm::StringMap<unsigned> ExprStmtClassToMissingOriginCount;
+ /// A map from `QualType` to their missing origin counts.
+ llvm::DenseMap<const clang::Type *, unsigned> ExprTypeToMissingOriginCount;
+};
+
+/// Utility function to print missing origin stats.
+void printStats(const LifetimeSafetyStats &Stats);
+} // namespace clang::lifetimes
+
+#endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_LIFETIMESTATS_H
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
index d624618..690faae 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
@@ -17,7 +17,9 @@
#include "clang/AST/Decl.h"
#include "clang/AST/Expr.h"
#include "clang/AST/TypeBase.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
+#include "llvm/Support/raw_ostream.h"
namespace clang::lifetimes::internal {
@@ -150,6 +152,9 @@ public:
void dump(OriginID OID, llvm::raw_ostream &OS) const;
+ /// Collects statistics about expressions that lack associated origins.
+ void collectMissingOrigins(Stmt &FunctionBody, LifetimeSafetyStats &LSStats);
+
private:
OriginID getNextOriginID() { return NextOriginID++; }
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 18c46bf..66c7f92 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -398,6 +398,10 @@ def err_invalid_llvm_ir : Error<"invalid LLVM IR input: %0">;
def err_os_unsupport_riscv_fmv : Error<
"function multiversioning is currently only supported on Linux">;
+def warn_unreachable_version
+ : Warning<"function version '%0' is unreachable; ignoring version">,
+ InGroup<FunctionMultiVersioning>;
+
def warn_hlsl_langstd_minimal :
Warning<"support for HLSL language version %0 is incomplete, "
"recommend using %1 instead">,
diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h
index 20a2030..0ed61e5 100644
--- a/clang/include/clang/Sema/AnalysisBasedWarnings.h
+++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h
@@ -14,9 +14,8 @@
#define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H
#include "clang/AST/Decl.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
#include "clang/Sema/ScopeInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
#include <memory>
namespace clang {
@@ -101,6 +100,11 @@ private:
/// a single function.
unsigned MaxUninitAnalysisBlockVisitsPerFunction;
+ /// Statistics collected during lifetime safety analysis.
+ /// These are accumulated across all analyzed functions and printed
+ /// when -print-stats is enabled.
+ clang::lifetimes::LifetimeSafetyStats LSStats;
+
/// @}
public:
diff --git a/clang/lib/Analysis/LifetimeSafety/CMakeLists.txt b/clang/lib/Analysis/LifetimeSafety/CMakeLists.txt
index 5874e84..e5876e7 100644
--- a/clang/lib/Analysis/LifetimeSafety/CMakeLists.txt
+++ b/clang/lib/Analysis/LifetimeSafety/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_library(clangAnalysisLifetimeSafety
LifetimeAnnotations.cpp
LifetimeSafety.cpp
LiveOrigins.cpp
+ LifetimeStats.cpp
Loans.cpp
LoanPropagation.cpp
Origins.cpp
diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
index c0fa640..be0d405 100644
--- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
@@ -18,8 +18,10 @@
#include "clang/Analysis/Analyses/LifetimeSafety/Checker.h"
#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h"
#include "clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
#include "clang/Analysis/AnalysisDeclContext.h"
#include "clang/Analysis/CFG.h"
#include "llvm/ADT/FoldingSet.h"
@@ -90,11 +92,22 @@ void LifetimeSafetyAnalysis::run() {
runLifetimeChecker(*LoanPropagation, *LiveOrigins, *FactMgr, AC, Reporter);
}
+
+void collectLifetimeStats(AnalysisDeclContext &AC, OriginManager &OM,
+ LifetimeSafetyStats &Stats) {
+ Stmt *FunctionBody = AC.getBody();
+ if (FunctionBody == nullptr)
+ return;
+ OM.collectMissingOrigins(*FunctionBody, Stats);
+}
} // namespace internal
void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC,
- LifetimeSafetyReporter *Reporter) {
+ LifetimeSafetyReporter *Reporter,
+ LifetimeSafetyStats &Stats, bool CollectStats) {
internal::LifetimeSafetyAnalysis Analysis(AC, Reporter);
Analysis.run();
+ if (CollectStats)
+ collectLifetimeStats(AC, Analysis.getFactManager().getOriginMgr(), Stats);
}
} // namespace clang::lifetimes
diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeStats.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeStats.cpp
new file mode 100644
index 0000000..7e56859
--- /dev/null
+++ b/clang/lib/Analysis/LifetimeSafety/LifetimeStats.cpp
@@ -0,0 +1,38 @@
+//===- LifetimeStats.cpp - Lifetime Safety Statistics -*------------ C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the data structures and utility function for collection of
+// staticstics related to Lifetimesafety analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
+#include "clang/AST/TypeBase.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang::lifetimes {
+void printStats(const LifetimeSafetyStats &Stats) {
+ llvm::errs() << "\n*** LifetimeSafety Missing Origin per QualType: "
+ "(QualType : count) :\n\n";
+ unsigned TotalMissingOrigins = 0;
+ for (const auto &[ExprType, MissingOriginCount] :
+ Stats.ExprTypeToMissingOriginCount) {
+ QualType QT = QualType(ExprType, 0);
+ llvm::errs() << QT.getAsString() << " : " << MissingOriginCount << '\n';
+ TotalMissingOrigins += MissingOriginCount;
+ }
+ llvm::errs() << "\n\n*** LifetimeSafety Missing Origin per StmtClassName: "
+ "(StmtClassName : count) :\n\n";
+ for (const auto &[ExprStmtClassName, MissingOriginCount] :
+ Stats.ExprStmtClassToMissingOriginCount) {
+ llvm::errs() << ExprStmtClassName << " : " << MissingOriginCount << '\n';
+ }
+ llvm::errs() << "\nTotal missing origins: " << TotalMissingOrigins << "\n";
+ llvm::errs() << "\n****************************************\n";
+}
+} // namespace clang::lifetimes
diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
index b2f1af3..2c1deac 100644
--- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
@@ -11,10 +11,42 @@
#include "clang/AST/Attr.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/TypeBase.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeStats.h"
+#include "llvm/ADT/StringMap.h"
namespace clang::lifetimes::internal {
+namespace {
+/// A utility class to traverse the function body in the analysis
+/// context and collect the count of expressions with missing origins.
+class MissingOriginCollector
+ : public RecursiveASTVisitor<MissingOriginCollector> {
+public:
+ MissingOriginCollector(
+ const llvm::DenseMap<const clang::Expr *, OriginList *> &ExprToOriginList,
+ LifetimeSafetyStats &LSStats)
+ : ExprToOriginList(ExprToOriginList), LSStats(LSStats) {}
+ bool VisitExpr(Expr *E) {
+ if (!hasOrigins(E))
+ return true;
+ // Check if we have an origin for this expression.
+ if (!ExprToOriginList.contains(E)) {
+ // No origin found: count this as missing origin.
+ LSStats.ExprTypeToMissingOriginCount[E->getType().getTypePtr()]++;
+ LSStats.ExprStmtClassToMissingOriginCount[std::string(
+ E->getStmtClassName())]++;
+ }
+ return true;
+ }
+
+private:
+ const llvm::DenseMap<const clang::Expr *, OriginList *> &ExprToOriginList;
+ LifetimeSafetyStats &LSStats;
+};
+} // namespace
bool hasOrigins(QualType QT) {
return QT->isPointerOrReferenceType() || isGslPointerType(QT);
@@ -157,4 +189,10 @@ const Origin &OriginManager::getOrigin(OriginID ID) const {
return AllOrigins[ID.Value];
}
+void OriginManager::collectMissingOrigins(Stmt &FunctionBody,
+ LifetimeSafetyStats &LSStats) {
+ MissingOriginCollector Collector(this->ExprToList, LSStats);
+ Collector.TraverseStmt(const_cast<Stmt *>(&FunctionBody));
+}
+
} // namespace clang::lifetimes::internal
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 94323da..f365fa0 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -27,7 +27,6 @@
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/Frontend/Driver/CodeGenOptions.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -43,6 +42,7 @@
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/ProfileData/InstrProfCorrelator.h"
#include "llvm/Support/BuryPointer.h"
#include "llvm/Support/CommandLine.h"
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index ad9ef91..dbbc35b 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -25,6 +25,7 @@ set(LLVM_LINK_COMPONENTS
ObjCARCOpts
Object
Passes
+ Plugins
ProfileData
ScalarOpts
Support
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 0e3e7b11..26d2abc 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -3144,6 +3144,10 @@ void CodeGenFunction::EmitAArch64MultiVersionResolver(
Builder.SetInsertPoint(CurBlock);
}
+ // Skip unreachable versions.
+ if (RO.Function == nullptr)
+ continue;
+
llvm::BasicBlock *RetBlock = createBasicBlock("resolver_return", Resolver);
CGBuilderTy RetBuilder(*this, RetBlock);
CreateMultiVersionResolverReturn(CGM, Resolver, RetBuilder, RO.Function,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 34af86d..85ed38f 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -68,6 +68,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Hash.h"
#include "llvm/Support/TimeProfiler.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/RISCVISAInfo.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/TargetParser/X86TargetParser.h"
@@ -4664,6 +4665,7 @@ void CodeGenModule::emitMultiVersionFunctions() {
// in this TU. For other architectures it is always emitted.
bool ShouldEmitResolver = !getTarget().getTriple().isAArch64();
SmallVector<CodeGenFunction::FMVResolverOption, 10> Options;
+ llvm::DenseMap<llvm::Function *, const FunctionDecl *> DeclMap;
getContext().forEachMultiversionedFunctionVersion(
FD, [&](const FunctionDecl *CurFD) {
@@ -4674,11 +4676,13 @@ void CodeGenModule::emitMultiVersionFunctions() {
assert(getTarget().getTriple().isX86() && "Unsupported target");
TA->getX86AddedFeatures(Feats);
llvm::Function *Func = createFunction(CurFD);
+ DeclMap.insert({Func, CurFD});
Options.emplace_back(Func, Feats, TA->getX86Architecture());
} else if (const auto *TVA = CurFD->getAttr<TargetVersionAttr>()) {
if (TVA->isDefaultVersion() && IsDefined)
ShouldEmitResolver = true;
llvm::Function *Func = createFunction(CurFD);
+ DeclMap.insert({Func, CurFD});
char Delim = getTarget().getTriple().isAArch64() ? '+' : ',';
TVA->getFeatures(Feats, Delim);
Options.emplace_back(Func, Feats);
@@ -4689,6 +4693,7 @@ void CodeGenModule::emitMultiVersionFunctions() {
if (TC->isDefaultVersion(I) && IsDefined)
ShouldEmitResolver = true;
llvm::Function *Func = createFunction(CurFD, I);
+ DeclMap.insert({Func, CurFD});
Feats.clear();
if (getTarget().getTriple().isX86()) {
TC->getX86Feature(Feats, I);
@@ -4734,6 +4739,24 @@ void CodeGenModule::emitMultiVersionFunctions() {
const CodeGenFunction::FMVResolverOption &RHS) {
return getFMVPriority(TI, LHS).ugt(getFMVPriority(TI, RHS));
});
+
+ // Diagnose unreachable function versions.
+ if (getTarget().getTriple().isAArch64()) {
+ for (auto I = Options.begin() + 1, E = Options.end(); I != E; ++I) {
+ llvm::APInt RHS = llvm::AArch64::getCpuSupportsMask(I->Features);
+ if (std::any_of(Options.begin(), I, [RHS](auto RO) {
+ llvm::APInt LHS = llvm::AArch64::getCpuSupportsMask(RO.Features);
+ return LHS.isSubsetOf(RHS);
+ })) {
+ Diags.Report(DeclMap[I->Function]->getLocation(),
+ diag::warn_unreachable_version)
+ << I->Function->getName();
+ assert(I->Function->user_empty() && "unexpected users");
+ I->Function->eraseFromParent();
+ I->Function = nullptr;
+ }
+ }
+ }
CodeGenFunction CGF(*this);
CGF.EmitMultiVersionResolver(ResolverFunc, Options);
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index b17ffe0..c40baa3 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -3,8 +3,8 @@ add_subdirectory(Rewrite)
set(LLVM_LINK_COMPONENTS
BitReader
BitstreamReader
- Extensions
Option
+ Plugins
ProfileData
Support
TargetParser
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 95f59f7..39e20f3 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -46,7 +46,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/Extensions/PassPlugin.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/AdvisoryLock.h"
#include "llvm/Support/BuryPointer.h"
#include "llvm/Support/CrashRecoveryContext.h"
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 9dcae98..7b08648 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -3135,7 +3135,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) {
if (AC.getCFG()) {
lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S);
- lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter);
+ lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter, LSStats,
+ S.CollectStats);
}
}
// Check for violations of "called once" parameter properties.
@@ -3231,4 +3232,5 @@ void clang::sema::AnalysisBasedWarnings::PrintStats() const {
<< " average block visits per function.\n"
<< " " << MaxUninitAnalysisBlockVisitsPerFunction
<< " max block visits per function.\n";
+ clang::lifetimes::printStats(LSStats);
}
diff --git a/clang/test/CodeGen/AArch64/fmv-unreachable-version.c b/clang/test/CodeGen/AArch64/fmv-unreachable-version.c
new file mode 100644
index 0000000..c9626c5
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fmv-unreachable-version.c
@@ -0,0 +1,89 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -verify -emit-llvm -o - %s | FileCheck %s
+
+__attribute__((target_version("sve;priority=5"))) int unreachable_versions(void) { return 5; }
+// expected-warning@+1 {{function version 'unreachable_versions._MlseMmops' is unreachable; ignoring version}}
+__attribute__((target_version("mops+lse;priority=1"))) int unreachable_versions(void) { return 1; }
+int foo() { return unreachable_versions(); }
+// expected-warning@+1 {{function version 'unreachable_versions._Msve2' is unreachable; ignoring version}}
+__attribute__((target_clones("sve2;priority=4", "aes+sve2;priority=3", "lse;priority=2", "default"))) int unreachable_versions(void) { return 0; }
+// expected-warning@-1 {{function version 'unreachable_versions._MaesMsve2' is unreachable; ignoring version}}
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @unreachable_versions = weak_odr ifunc i32 (), ptr @unreachable_versions.resolver
+//.
+// CHECK: Function Attrs: noinline nounwind optnone vscale_range(1,16)
+// CHECK-LABEL: define {{[^@]+}}@unreachable_versions._Msve
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 5
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@foo
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @unreachable_versions()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone vscale_range(1,16)
+// CHECK-LABEL: define {{[^@]+}}@unreachable_versions._Mlse
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone vscale_range(1,16)
+// CHECK-LABEL: define {{[^@]+}}@unreachable_versions.default
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK: Function Attrs: disable_sanitizer_instrumentation
+// CHECK-LABEL: define {{[^@]+}}@unreachable_versions.resolver
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073807616
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073807616
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @unreachable_versions._Msve
+// CHECK: resolver_else:
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 69793284352
+// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 69793284352
+// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 69793317632
+// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 69793317632
+// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 128
+// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 128
+// CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]]
+// CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK: resolver_return1:
+// CHECK-NEXT: ret ptr @unreachable_versions._Mlse
+// CHECK: resolver_else2:
+// CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 576460752303423616
+// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 576460752303423616
+// CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]]
+// CHECK-NEXT: ret ptr @unreachable_versions.default
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone vscale_range(1,16) "fmv-features"="P0,P2,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone vscale_range(1,16) "fmv-features"="P1,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone vscale_range(1,16) "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR4]] = { disable_sanitizer_instrumentation }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/Sema/warn-lifetime-safety-missing-origin-stats.cpp b/clang/test/Sema/warn-lifetime-safety-missing-origin-stats.cpp
new file mode 100644
index 0000000..446bbe0
--- /dev/null
+++ b/clang/test/Sema/warn-lifetime-safety-missing-origin-stats.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -print-stats -fexperimental-lifetime-safety -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s
+
+
+// CHECK: *** LifetimeSafety Missing Origin per QualType: (QualType : count) :
+// CHECK: *** LifetimeSafety Missing Origin per StmtClassName: (StmtClassName : count) :
diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt
index 741ea01..0c2dea3 100644
--- a/clang/tools/clang-linker-wrapper/CMakeLists.txt
+++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt
@@ -4,11 +4,11 @@ set(LLVM_LINK_COMPONENTS
Core
BinaryFormat
MC
- Extensions
Target
TransformUtils
Analysis
Passes
+ Plugins
IRReader
Object
Option
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 549251e..48a3c5f 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -20,7 +20,6 @@
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/Frontend/Offloading/OffloadWrapper.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/Constants.h"
@@ -39,6 +38,7 @@
#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptTable.h"
#include "llvm/Option/Option.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Errc.h"
diff --git a/flang/docs/DebugGeneration.md b/flang/docs/DebugGeneration.md
index f26bb82..df469e4 100644
--- a/flang/docs/DebugGeneration.md
+++ b/flang/docs/DebugGeneration.md
@@ -418,7 +418,7 @@ with fixed sizes arrays. It needs to also accept `DIExpressionAttr` or
`rank`, `allocated` and `associated`.
5. `DIStringTypeAttr`
-# Testing
+## Testing
- LLVM LIT tests will be added to test:
- the driver and ensure that it passes the line table and full debug
@@ -434,7 +434,7 @@ with fixed sizes arrays. It needs to also accept `DIExpressionAttr` or
- print values and types (ptype) of various type of variables
- Manually run `GDB`'s gdb.fortran testsuite with llvm-flang.
-# Resources
+## Resources
- [1] https://dwarfstd.org/doc/DWARF5.pdf
- [2] https://llvm.org/docs/LangRef.html#metadata
- [3] https://archive.fosdem.org/2022/schedule/event/llvm_fortran_debug/
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index fb74b3d..4ebe497 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -46,6 +46,7 @@ add_flang_library(flangFrontend
LINK_COMPONENTS
Passes
+ Plugins
Analysis
BitReader
Extensions
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 1d4efbe..5c0311c 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -47,7 +47,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
@@ -57,6 +56,7 @@
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/ProfileData/InstrProfCorrelator.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Error.h"
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 5caf840..57e4381 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -123,6 +123,14 @@ function(_get_compile_options_from_config output_var)
list(APPEND config_options "-DLIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT=${LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT}")
endif()
+ if(LIBC_CONF_MATH_USE_SYSTEM_FENV)
+ if(MSVC)
+ list(APPEND config_options "/DLIBC_MATH_USE_SYSTEM_FENV")
+ else()
+ list(APPEND config_options "-DLIBC_MATH_USE_SYSTEM_FENV")
+ endif()
+ endif()
+
set(${output_var} ${config_options} PARENT_SCOPE)
endfunction(_get_compile_options_from_config)
diff --git a/libc/config/config.json b/libc/config/config.json
index 1c11b9a..f981c43 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -105,11 +105,15 @@
"math": {
"LIBC_CONF_MATH_OPTIMIZATIONS": {
"value": 0,
- "doc": "Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST."
+ "doc": "Configure optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST."
},
"LIBC_CONF_FREXP_INF_NAN_EXPONENT": {
"value": "",
"doc": "The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configure an explicit exp value for Inf/NaN inputs."
+ },
+ "LIBC_CONF_MATH_USE_SYSTEM_FENV": {
+ "value": false,
+      "doc": "Use C standard fenv.h calls from the system libc instead of our internal fenv implementations."
}
},
"qsort": {
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 7c36222..81888bb 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -36,7 +36,8 @@ to learn about the defaults for your platform and target.
- ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
* **"math" options**
- ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configure an explicit exp value for Inf/NaN inputs.
- - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
+ - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configure optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
+   - ``LIBC_CONF_MATH_USE_SYSTEM_FENV``: Use C standard fenv.h calls from the system libc instead of our internal fenv implementations.
* **"printf" options**
- ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends.
- ``LIBC_CONF_PRINTF_DISABLE_FLOAT``: Disable printing floating point values in printf and friends.
diff --git a/libc/shared/libc_common.h b/libc/shared/libc_common.h
index c4560bb..5f7bebd 100644
--- a/libc/shared/libc_common.h
+++ b/libc/shared/libc_common.h
@@ -19,6 +19,11 @@
#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM_INLINE
#endif // LIBC_ERRNO_MODE
+// Use system fenv functions in math implementations.
+#ifndef LIBC_MATH_USE_SYSTEM_FENV
+#define LIBC_MATH_USE_SYSTEM_FENV
+#endif // LIBC_MATH_USE_SYSTEM_FENV
+
#ifndef LIBC_NAMESPACE
#define LIBC_NAMESPACE __llvm_libc
#endif // LIBC_NAMESPACE
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 3ef2df5f..a21f511 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -20,6 +20,51 @@
#include "src/__support/macros/properties/architectures.h"
#include "src/__support/macros/properties/compiler.h"
+// In full build mode, LLVM libc itself provides the fenv implementation, so never defer to a system libc fenv.
+#if defined(LIBC_FULL_BUILD)
+#undef LIBC_MATH_USE_SYSTEM_FENV
+#endif // LIBC_FULL_BUILD
+
+#if defined(LIBC_MATH_USE_SYSTEM_FENV)
+
+// Forward to the system libc's fenv.h functions — only the subset that is used
+// by the math function implementations.
+// This is an option for math function implementations only; it must not be
+// used to implement the fenv.h functions themselves.
+
+#include <fenv.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace fputil {
+
+LIBC_INLINE int clear_except(int excepts) { return feclearexcept(excepts); }
+
+LIBC_INLINE int test_except(int excepts) { return fetestexcept(excepts); }
+
+LIBC_INLINE int get_except() {
+ fexcept_t excepts = 0;
+ fegetexceptflag(&excepts, FE_ALL_EXCEPT);
+ return static_cast<int>(excepts);
+}
+
+LIBC_INLINE int set_except(int excepts) {
+ fexcept_t exc = static_cast<fexcept_t>(excepts);
+ return fesetexceptflag(&exc, FE_ALL_EXCEPT);
+}
+
+LIBC_INLINE int raise_except(int excepts) { return feraiseexcept(excepts); }
+
+LIBC_INLINE int get_round() { return fegetround(); }
+
+LIBC_INLINE int set_round(int rounding_mode) {
+ return fesetround(rounding_mode);
+}
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE_DECL
+
+#else // !LIBC_MATH_USE_SYSTEM_FENV
+
#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
#if defined(__APPLE__)
#include "aarch64/fenv_darwin_impl.h"
@@ -73,6 +118,8 @@ LIBC_INLINE int set_env(const fenv_t *) { return 0; }
} // namespace LIBC_NAMESPACE_DECL
#endif
+#endif // LIBC_MATH_USE_SYSTEM_FENV
+
namespace LIBC_NAMESPACE_DECL {
namespace fputil {
diff --git a/libc/src/fenv/feclearexcept.cpp b/libc/src/fenv/feclearexcept.cpp
index c8a032f..7adfa23 100644
--- a/libc/src/fenv/feclearexcept.cpp
+++ b/libc/src/fenv/feclearexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feclearexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fedisableexcept.cpp b/libc/src/fenv/fedisableexcept.cpp
index a2a1e97..9b66a2c 100644
--- a/libc/src/fenv/fedisableexcept.cpp
+++ b/libc/src/fenv/fedisableexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fedisableexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/feenableexcept.cpp b/libc/src/fenv/feenableexcept.cpp
index 468a170..ecd6bcd 100644
--- a/libc/src/fenv/feenableexcept.cpp
+++ b/libc/src/fenv/feenableexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feenableexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fegetenv.cpp b/libc/src/fenv/fegetenv.cpp
index c692b87..a396aeb 100644
--- a/libc/src/fenv/fegetenv.cpp
+++ b/libc/src/fenv/fegetenv.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fegetenv.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fegetexcept.cpp b/libc/src/fenv/fegetexcept.cpp
index 2b3de83..5efad39 100644
--- a/libc/src/fenv/fegetexcept.cpp
+++ b/libc/src/fenv/fegetexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fegetexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fegetexceptflag.cpp b/libc/src/fenv/fegetexceptflag.cpp
index 58418cc..4d508eb 100644
--- a/libc/src/fenv/fegetexceptflag.cpp
+++ b/libc/src/fenv/fegetexceptflag.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fegetexceptflag.h"
#include "hdr/types/fexcept_t.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/src/fenv/fegetround.cpp b/libc/src/fenv/fegetround.cpp
index 4f5caed..cb2a802 100644
--- a/libc/src/fenv/fegetround.cpp
+++ b/libc/src/fenv/fegetround.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fegetround.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/feholdexcept.cpp b/libc/src/fenv/feholdexcept.cpp
index 81b3ea4..83ac298 100644
--- a/libc/src/fenv/feholdexcept.cpp
+++ b/libc/src/fenv/feholdexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feholdexcept.h"
#include "hdr/types/fenv_t.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/src/fenv/feraiseexcept.cpp b/libc/src/fenv/feraiseexcept.cpp
index 6eaa09d..1379457 100644
--- a/libc/src/fenv/feraiseexcept.cpp
+++ b/libc/src/fenv/feraiseexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feraiseexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fesetenv.cpp b/libc/src/fenv/fesetenv.cpp
index 7a9f90a..6eb051d 100644
--- a/libc/src/fenv/fesetenv.cpp
+++ b/libc/src/fenv/fesetenv.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fesetenv.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fesetexcept.cpp b/libc/src/fenv/fesetexcept.cpp
index 8775821..f0df251 100644
--- a/libc/src/fenv/fesetexcept.cpp
+++ b/libc/src/fenv/fesetexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fesetexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fesetexceptflag.cpp b/libc/src/fenv/fesetexceptflag.cpp
index 9cec9d1..86cbdaa 100644
--- a/libc/src/fenv/fesetexceptflag.cpp
+++ b/libc/src/fenv/fesetexceptflag.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fesetexceptflag.h"
#include "hdr/types/fexcept_t.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/src/fenv/fesetround.cpp b/libc/src/fenv/fesetround.cpp
index 6f65f9f..b32af30 100644
--- a/libc/src/fenv/fesetround.cpp
+++ b/libc/src/fenv/fesetround.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fesetround.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fetestexcept.cpp b/libc/src/fenv/fetestexcept.cpp
index f4986ac..6e7b3ea 100644
--- a/libc/src/fenv/fetestexcept.cpp
+++ b/libc/src/fenv/fetestexcept.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fetestexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/src/fenv/fetestexceptflag.cpp b/libc/src/fenv/fetestexceptflag.cpp
index 03ba1e6..28ba2f7 100644
--- a/libc/src/fenv/fetestexceptflag.cpp
+++ b/libc/src/fenv/fetestexceptflag.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/fetestexceptflag.h"
#include "hdr/types/fexcept_t.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/src/fenv/feupdateenv.cpp b/libc/src/fenv/feupdateenv.cpp
index 1cc730c..cf3c320 100644
--- a/libc/src/fenv/feupdateenv.cpp
+++ b/libc/src/fenv/feupdateenv.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feupdateenv.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/common.h"
diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp
index c469470..73cf6a8 100644
--- a/libc/test/UnitTest/FEnvSafeTest.cpp
+++ b/libc/test/UnitTest/FEnvSafeTest.cpp
@@ -6,6 +6,10 @@
//
//===---------------------------------------------------------------------===//
+#ifdef LIBC_MATH_USE_SYSTEM_FENV
+#undef LIBC_MATH_USE_SYSTEM_FENV
+#endif // LIBC_MATH_USE_SYSTEM_FENV
+
#include "FEnvSafeTest.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/test/UnitTest/FPExceptMatcher.cpp b/libc/test/UnitTest/FPExceptMatcher.cpp
index 9688d53..3f5329b 100644
--- a/libc/test/UnitTest/FPExceptMatcher.cpp
+++ b/libc/test/UnitTest/FPExceptMatcher.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "FPExceptMatcher.h"
#include "src/__support/macros/config.h"
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 430727e..bc6ac36 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_UNITTEST_FPMATCHER_H
#define LLVM_LIBC_TEST_UNITTEST_FPMATCHER_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/test/UnitTest/RoundingModeUtils.cpp b/libc/test/UnitTest/RoundingModeUtils.cpp
index 46ac204..15b8a36f 100644
--- a/libc/test/UnitTest/RoundingModeUtils.cpp
+++ b/libc/test/UnitTest/RoundingModeUtils.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "RoundingModeUtils.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/rounding_mode.h"
diff --git a/libc/test/src/fenv/enabled_exceptions_test.cpp b/libc/test/src/fenv/enabled_exceptions_test.cpp
index b5e6562..a742c24 100644
--- a/libc/test/src/fenv/enabled_exceptions_test.cpp
+++ b/libc/test/src/fenv/enabled_exceptions_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feclearexcept.h"
#include "src/fenv/feraiseexcept.h"
#include "src/fenv/fetestexcept.h"
diff --git a/libc/test/src/fenv/exception_flags_test.cpp b/libc/test/src/fenv/exception_flags_test.cpp
index 2f4332d..6fbb1a4 100644
--- a/libc/test/src/fenv/exception_flags_test.cpp
+++ b/libc/test/src/fenv/exception_flags_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/types/fexcept_t.h"
#include "src/fenv/fegetexceptflag.h"
#include "src/fenv/fesetexceptflag.h"
diff --git a/libc/test/src/fenv/exception_status_test.cpp b/libc/test/src/fenv/exception_status_test.cpp
index fdf9421..49461bc4 100644
--- a/libc/test/src/fenv/exception_status_test.cpp
+++ b/libc/test/src/fenv/exception_status_test.cpp
@@ -7,6 +7,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feclearexcept.h"
#include "src/fenv/feraiseexcept.h"
#include "src/fenv/fesetexcept.h"
diff --git a/libc/test/src/fenv/feclearexcept_test.cpp b/libc/test/src/fenv/feclearexcept_test.cpp
index f39cf4ec..17da105b 100644
--- a/libc/test/src/fenv/feclearexcept_test.cpp
+++ b/libc/test/src/fenv/feclearexcept_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/fenv/feclearexcept.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/test/src/fenv/feholdexcept_test.cpp b/libc/test/src/fenv/feholdexcept_test.cpp
index 1112ec9..0e601d8 100644
--- a/libc/test/src/fenv/feholdexcept_test.cpp
+++ b/libc/test/src/fenv/feholdexcept_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/types/fenv_t.h"
#include "src/fenv/feholdexcept.h"
diff --git a/libc/test/src/fenv/feupdateenv_test.cpp b/libc/test/src/fenv/feupdateenv_test.cpp
index d2ffc0e..f50b25e 100644
--- a/libc/test/src/fenv/feupdateenv_test.cpp
+++ b/libc/test/src/fenv/feupdateenv_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/types/fenv_t.h"
#include "src/fenv/feupdateenv.h"
diff --git a/libc/test/src/fenv/getenv_and_setenv_test.cpp b/libc/test/src/fenv/getenv_and_setenv_test.cpp
index 63929ee..f51c599 100644
--- a/libc/test/src/fenv/getenv_and_setenv_test.cpp
+++ b/libc/test/src/fenv/getenv_and_setenv_test.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/types/fenv_t.h"
#include "src/fenv/fegetenv.h"
#include "src/fenv/fegetround.h"
diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h
index c9d8ebc..86e73bb 100644
--- a/libc/test/src/math/RIntTest.h
+++ b/libc/test/src/math/RIntTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_RINTTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_RINTTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index e5e9386..469db11 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_ROUNDTOINTEGERTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_ROUNDTOINTEGERTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/libc/test/src/math/smoke/CanonicalizeTest.h b/libc/test/src/math/smoke/CanonicalizeTest.h
index e500bc3..524a79a 100644
--- a/libc/test/src/math/smoke/CanonicalizeTest.h
+++ b/libc/test/src/math/smoke/CanonicalizeTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_CANONICALIZETEST_H
#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_CANONICALIZETEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/integer_literals.h"
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index 838c917..ab336cd 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/errno_macros.h"
#include "hdr/fenv_macros.h"
#include "src/__support/FPUtil/FEnvImpl.h"
diff --git a/libc/test/src/math/smoke/NearbyIntTest.h b/libc/test/src/math/smoke/NearbyIntTest.h
index 092700e..aa6c6bf 100644
--- a/libc/test/src/math/smoke/NearbyIntTest.h
+++ b/libc/test/src/math/smoke/NearbyIntTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_NEARBYINTTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_NEARBYINTTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "hdr/fenv_macros.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
index b7e59f7..2d3e6a0 100644
--- a/libc/test/src/math/smoke/NextAfterTest.h
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/bit.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h
index 43e71c6..377f458 100644
--- a/libc/test/src/math/smoke/NextTowardTest.h
+++ b/libc/test/src/math/smoke/NextTowardTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/bit.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h
index 7f14aeb..d4cfc1b 100644
--- a/libc/test/src/math/smoke/RIntTest.h
+++ b/libc/test/src/math/smoke/RIntTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_RINTTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_RINTTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "test/UnitTest/FEnvSafeTest.h"
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index c0b326e..70fc361 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_ROUNDTOINTEGERTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_ROUNDTOINTEGERTEST_H
+#undef LIBC_MATH_USE_SYSTEM_FENV
+
#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 4613539..34cca88 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -527,35 +527,17 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
write32(ctx, loc, val);
break;
case R_AARCH64_ABS64:
- // AArch64 relocations to tagged symbols have extended semantics, as
- // described here:
- // https://github.com/ARM-software/abi-aa/blob/main/memtagabielf64/memtagabielf64.rst#841extended-semantics-of-r_aarch64_relative.
- // tl;dr: encode the symbol's special addend in the place, which is an
- // offset to the point where the logical tag is derived from. Quick hack, if
- // the addend is within the symbol's bounds, no need to encode the tag
- // derivation offset.
- if (rel.sym && rel.sym->isTagged() &&
- (rel.addend < 0 ||
- rel.addend >= static_cast<int64_t>(rel.sym->getSize())))
- write64(ctx, loc, -rel.addend);
- else
- write64(ctx, loc, val);
+ write64(ctx, loc, val);
break;
case R_AARCH64_PREL64:
write64(ctx, loc, val);
break;
case R_AARCH64_AUTH_ABS64:
- // If val is wider than 32 bits, the relocation must have been moved from
- // .relr.auth.dyn to .rela.dyn, and the addend write is not needed.
- //
- // If val fits in 32 bits, we have two potential scenarios:
- // * True RELR: Write the 32-bit `val`.
- // * RELA: Even if the value now fits in 32 bits, it might have been
- // converted from RELR during an iteration in
- // finalizeAddressDependentContent(). Writing the value is harmless
- // because dynamic linking ignores it.
- if (isInt<32>(val))
- write32(ctx, loc, val);
+ // This is used for the addend of a .relr.auth.dyn entry,
+ // which is a 32-bit value; the upper 32 bits are used to
+ // encode the schema.
+ checkInt(ctx, loc, val, 32, rel);
+ write32(ctx, loc, val);
break;
case R_AARCH64_ADD_ABS_LO12_NC:
case R_AARCH64_AUTH_GOT_ADD_LO12_NC:
@@ -947,6 +929,8 @@ void AArch64::relocateAlloc(InputSection &sec, uint8_t *buf) const {
AArch64Relaxer relaxer(ctx, sec.relocs());
for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) {
const Relocation &rel = sec.relocs()[i];
+ if (rel.expr == R_NONE) // See finalizeAddressDependentContent()
+ continue;
uint8_t *loc = buf + rel.offset;
const uint64_t val = sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index ff7ef2d..ca8a9c0 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -784,6 +784,8 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r,
return r.sym->getVA(ctx, a);
case R_ADDEND:
return a;
+ case R_ADDEND_NEG:
+ return -static_cast<uint64_t>(a);
case R_RELAX_HINT:
return 0;
case RE_ARM_SBREL:
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 59aa430..5c23c76 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -717,7 +717,7 @@ static void addRelativeReloc(Ctx &ctx, InputSectionBase &isec,
// field. This is described in further detail in:
// https://github.com/ARM-software/abi-aa/blob/main/memtagabielf64/memtagabielf64.rst#841extended-semantics-of-r_aarch64_relative
if (addend < 0 || static_cast<uint64_t>(addend) >= sym.getSize())
- isec.relocations.push_back({expr, type, offsetInSec, addend, &sym});
+ isec.relocations.push_back({R_ADDEND_NEG, type, offsetInSec, addend, &sym});
return;
}
@@ -1005,13 +1005,12 @@ void RelocScan::process(RelExpr expr, RelType type, uint64_t offset,
rel = ctx.target->relativeRel;
std::lock_guard<std::mutex> lock(ctx.relocMutex);
Partition &part = sec->getPartition(ctx);
- if (ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) {
- // For a preemptible symbol, we can't use a relative relocation. For an
- // undefined symbol, we can't compute offset at link-time and use a
- // relative relocation. Use a symbolic relocation instead.
- if (sym.isPreemptible) {
- part.relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, type);
- } else if (part.relrAuthDyn && sec->addralign >= 2 && offset % 2 == 0) {
+ // For a preemptible symbol, we can't use a relative relocation. For an
+ // undefined symbol, we can't compute offset at link-time and use a
+ // relative relocation. Use a symbolic relocation instead.
+ if (ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64 &&
+ !sym.isPreemptible) {
+ if (part.relrAuthDyn && sec->addralign >= 2 && offset % 2 == 0) {
// When symbol values are determined in
// finalizeAddressDependentContent, some .relr.auth.dyn relocations
// may be moved to .rela.dyn.
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 86ca298..4cb09f3 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -42,6 +42,7 @@ using JumpModType = uint32_t;
enum RelExpr {
R_ABS,
R_ADDEND,
+ R_ADDEND_NEG,
R_DTPREL,
R_GOT,
R_GOT_OFF,
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index e01a5ad..16bdda8e 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -585,7 +585,7 @@ struct RelativeReloc {
return inputSec->getVA(inputSec->relocs()[relocIdx].offset);
}
- const InputSectionBase *inputSec;
+ InputSectionBase *inputSec;
size_t relocIdx;
};
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 083b4fb..db5626e 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1583,9 +1583,10 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
if (part.relrAuthDyn) {
auto it = llvm::remove_if(
part.relrAuthDyn->relocs, [this, &part](const RelativeReloc &elem) {
- const Relocation &reloc = elem.inputSec->relocs()[elem.relocIdx];
+ Relocation &reloc = elem.inputSec->relocs()[elem.relocIdx];
if (isInt<32>(reloc.sym->getVA(ctx, reloc.addend)))
return false;
+ reloc.expr = R_NONE;
part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, elem.inputSec,
reloc.offset, false, *reloc.sym,
reloc.addend, R_ABS});
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 4dd8559..8614347 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -1680,6 +1680,9 @@ public:
size_t ReadCStringFromMemory(lldb::addr_t vm_addr, std::string &out_str,
Status &error);
+ llvm::SmallVector<std::optional<std::string>>
+ ReadCStringsFromMemory(llvm::ArrayRef<lldb::addr_t> addresses);
+
/// Reads an unsigned integer of the specified byte size from process
/// memory.
///
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
index ebde889..430f4f0 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
@@ -265,6 +265,28 @@ bool ClassDescriptorV2::method_list_t::Read(Process *process,
return true;
}
+void ClassDescriptorV2::method_t::ReadNames(
+ llvm::MutableArrayRef<method_t> methods, Process &process) {
+ std::vector<lldb::addr_t> str_addresses;
+ str_addresses.reserve(2 * methods.size());
+ for (auto &method : methods)
+ str_addresses.push_back(method.m_name_ptr);
+ for (auto &method : methods)
+ str_addresses.push_back(method.m_types_ptr);
+
+ llvm::SmallVector<std::optional<std::string>> read_result =
+ process.ReadCStringsFromMemory(str_addresses);
+ auto names = llvm::MutableArrayRef(read_result).take_front(methods.size());
+ auto types = llvm::MutableArrayRef(read_result).take_back(methods.size());
+
+ for (auto [name_str, type_str, method] : llvm::zip(names, types, methods)) {
+ if (name_str)
+ method.m_name = std::move(*name_str);
+ if (type_str)
+ method.m_types = std::move(*type_str);
+ }
+}
+
llvm::SmallVector<ClassDescriptorV2::method_t, 0>
ClassDescriptorV2::ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
lldb::addr_t relative_string_base_addr,
@@ -301,6 +323,7 @@ ClassDescriptorV2::ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
is_small, has_direct_sel, has_relative_types);
}
+ method_t::ReadNames(methods, *process);
return methods;
}
@@ -338,13 +361,7 @@ bool ClassDescriptorV2::method_t::Read(DataExtractor &extractor,
m_imp_ptr = extractor.GetAddress_unchecked(&cursor);
}
- Status error;
- process->ReadCStringFromMemory(m_name_ptr, m_name, error);
- if (error.Fail())
- return false;
-
- process->ReadCStringFromMemory(m_types_ptr, m_types, error);
- return error.Success();
+ return true;
}
bool ClassDescriptorV2::ivar_list_t::Read(Process *process, lldb::addr_t addr) {
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
index 8d19b00..b5cc50a 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
@@ -173,6 +173,10 @@ private:
bool Read(DataExtractor &extractor, Process *process, lldb::addr_t addr,
lldb::addr_t relative_string_base_addr, bool is_small,
bool has_direct_sel, bool has_relative_types);
+
+ /// Fill in `m_name` and `m_types` efficiently by batching read requests.
+ static void ReadNames(llvm::MutableArrayRef<method_t> methods,
+ Process &process);
};
llvm::SmallVector<method_t, 0>
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 9c8e8fa7..ab25094 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -117,6 +117,8 @@ static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = {
},
};
+static constexpr unsigned g_string_read_width = 256;
+
#define LLDB_PROPERTIES_process
#include "TargetProperties.inc"
@@ -2135,9 +2137,63 @@ lldb::addr_t Process::FindInMemory(const uint8_t *buf, uint64_t size,
return matches[0].GetBaseAddress().GetLoadAddress(&target);
}
+llvm::SmallVector<std::optional<std::string>>
+Process::ReadCStringsFromMemory(llvm::ArrayRef<lldb::addr_t> addresses) {
+ llvm::SmallVector<std::optional<std::string>> output_strs(addresses.size(),
+ "");
+ llvm::SmallVector<Range<addr_t, size_t>> ranges{
+ llvm::map_range(addresses, [=](addr_t ptr) {
+ return Range<addr_t, size_t>(ptr, g_string_read_width);
+ })};
+
+ std::vector<uint8_t> buffer(g_string_read_width * addresses.size(), 0);
+ uint64_t num_completed_strings = 0;
+
+ while (num_completed_strings != addresses.size()) {
+ llvm::SmallVector<llvm::MutableArrayRef<uint8_t>> read_results =
+ ReadMemoryRanges(ranges, buffer);
+
+ // Each iteration of this loop either increments num_completed_strings or
+ // updates the base pointer of some range, guaranteeing forward progress of
+ // the outer loop.
+ for (auto [range, read_result, output_str] :
+ llvm::zip(ranges, read_results, output_strs)) {
+ // A previously completed string.
+ if (range.GetByteSize() == 0)
+ continue;
+
+ // The read failed; set the range to 0 to avoid reading it again.
+ if (read_result.empty()) {
+ output_str = std::nullopt;
+ range.SetByteSize(0);
+ num_completed_strings++;
+ continue;
+ }
+
+ // Convert ArrayRef to StringRef so the pointers work with std::string.
+ auto read_result_str = llvm::toStringRef(read_result);
+
+ const char *null_terminator_pos = llvm::find(read_result_str, '\0');
+ output_str->append(read_result_str.begin(), null_terminator_pos);
+
+ // If the terminator was found, this string is complete.
+ if (null_terminator_pos != read_result_str.end()) {
+ range.SetByteSize(0);
+ num_completed_strings++;
+ }
+ // Otherwise increment the base pointer for the next read.
+ else {
+ range.SetRangeBase(range.GetRangeBase() + read_result.size());
+ }
+ }
+ }
+
+ return output_strs;
+}
+
size_t Process::ReadCStringFromMemory(addr_t addr, std::string &out_str,
Status &error) {
- char buf[256];
+ char buf[g_string_read_width];
out_str.clear();
addr_t curr_addr = addr;
while (true) {
diff --git a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
index 45d855f..65f0cef 100644
--- a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
+++ b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
@@ -65,6 +65,7 @@ class TestPlatformProcessLaunchGDBServer(TestBase):
@skipIfRemote
@skipIfDarwin # Uses debugserver for debugging
+ @skipIfWindows
@add_test_categories(["lldb-server"])
def test_launch_with_unusual_process_name(self):
"""
diff --git a/lldb/unittests/Target/MemoryTest.cpp b/lldb/unittests/Target/MemoryTest.cpp
index 131a3ca..15b22a4 100644
--- a/lldb/unittests/Target/MemoryTest.cpp
+++ b/lldb/unittests/Target/MemoryTest.cpp
@@ -434,3 +434,76 @@ TEST_F(MemoryDeathTest, TestReadMemoryRangesWithShortBuffer) {
ASSERT_TRUE(result.empty());
#endif
}
+
+/// A process class whose memory contains the following map of addresses to
+/// strings:
+/// 100 -> "hello\0"
+/// 200 -> "\0"
+/// 201 -> "goodbye"
+/// 300 -> a string composed of 500 'c' characters, followed by '\0'.
+/// addresses >= 1024 -> error
+class StringReaderProcess : public Process {
+public:
+ char memory[1024];
+ void initialize_memory() {
+ // Use some easily identifiable character for the areas of memory we're not
+ // intending to read.
+ memset(memory, '?', 1024);
+ strcpy(&memory[100], "hello");
+ strcpy(&memory[200], "");
+ strcpy(&memory[201], "goodbye");
+ std::vector<char> long_str(500, 'c');
+ long_str.push_back('\0');
+ strcpy(&memory[300], long_str.data());
+ }
+
+ size_t DoReadMemory(lldb::addr_t vm_addr, void *buf, size_t size,
+ Status &error) override {
+ if (vm_addr >= 1024) {
+ error = Status::FromErrorString("out of bounds!");
+ return 0;
+ }
+ memcpy(buf, memory + vm_addr, size);
+ return size;
+ }
+ StringReaderProcess(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp)
+ : Process(target_sp, listener_sp) {
+ initialize_memory();
+ }
+ // Boilerplate, nothing interesting below.
+ bool CanDebug(lldb::TargetSP, bool) override { return true; }
+ Status DoDestroy() override { return {}; }
+ void RefreshStateAfterStop() override {}
+ bool DoUpdateThreadList(ThreadList &, ThreadList &) override { return false; }
+ llvm::StringRef GetPluginName() override { return "Dummy"; }
+};
+
+TEST_F(MemoryTest, TestReadCStringsFromMemory) {
+ ArchSpec arch("x86_64-apple-macosx-");
+ Platform::SetHostPlatform(PlatformRemoteMacOSX::CreateInstance(true, &arch));
+ DebuggerSP debugger_sp = Debugger::CreateInstance();
+ ASSERT_TRUE(debugger_sp);
+ TargetSP target_sp = CreateTarget(debugger_sp, arch);
+ ASSERT_TRUE(target_sp);
+ ListenerSP listener_sp(Listener::MakeListener("dummy"));
+ ProcessSP process_sp =
+ std::make_shared<StringReaderProcess>(target_sp, listener_sp);
+ ASSERT_TRUE(process_sp);
+
+ // See the docs for StringReaderProcess above for an explanation of these
+ // addresses.
+ llvm::SmallVector<std::optional<std::string>> maybe_strings =
+ process_sp->ReadCStringsFromMemory({100, 200, 201, 300, 0xffffff});
+ ASSERT_EQ(maybe_strings.size(), 5ull);
+ auto expected_valid_strings = llvm::ArrayRef(maybe_strings).take_front(4);
+
+ std::vector<char> long_str(500, 'c');
+ long_str.push_back('\0');
+ std::string big_str(long_str.data());
+
+ const std::vector<std::optional<std::string>> expected_answers = {
+ "hello", "", "goodbye", big_str, std::nullopt};
+ for (auto [maybe_str, expected_answer] :
+ llvm::zip(expected_valid_strings, expected_answers))
+ EXPECT_EQ(maybe_str, expected_answer);
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index fb94fde..d99280f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -8677,6 +8677,23 @@ denoting if the type contains a pointer.
!0 = !{!"<type-name>", i1 <contains-pointer>}
+'``stack-protector``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``stack-protector`` metadata may be attached to alloca instructions. An
+alloca instruction with this metadata and value ``i32 0`` will be skipped when
+deciding whether a given function requires a stack protector. The function
+may still use a stack protector, if other criteria determine it needs one.
+
+The metadata contains an integer, where a 0 value opts the given alloca out
+of requiring a stack protector.
+
+.. code-block:: none
+
+ %a = alloca [1000 x i8], align 1, !stack-protector !0
+
+ !0 = !{i32 0}
+
Module Flags Metadata
=====================
diff --git a/llvm/examples/Bye/Bye.cpp b/llvm/examples/Bye/Bye.cpp
index 58d330a..b476ab7 100644
--- a/llvm/examples/Bye/Bye.cpp
+++ b/llvm/examples/Bye/Bye.cpp
@@ -1,8 +1,8 @@
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/examples/IRTransforms/SimplifyCFG.cpp b/llvm/examples/IRTransforms/SimplifyCFG.cpp
index 942764c..bf72643 100644
--- a/llvm/examples/IRTransforms/SimplifyCFG.cpp
+++ b/llvm/examples/IRTransforms/SimplifyCFG.cpp
@@ -33,12 +33,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
diff --git a/llvm/include/llvm/AsmParser/AsmParserContext.h b/llvm/include/llvm/AsmParser/AsmParserContext.h
index d7330d9..8f7823a 100644
--- a/llvm/include/llvm/AsmParser/AsmParserContext.h
+++ b/llvm/include/llvm/AsmParser/AsmParserContext.h
@@ -10,11 +10,13 @@
#define LLVM_ASMPARSER_ASMPARSERCONTEXT_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
#include "llvm/AsmParser/FileLoc.h"
#include "llvm/IR/Value.h"
#include <optional>
namespace llvm {
+class BasicBlock;
/// Registry of file location information for LLVM IR constructs.
///
@@ -29,9 +31,28 @@ namespace llvm {
/// This information is optionally emitted by the LLParser while
/// it reads LLVM textual IR.
class AsmParserContext {
+ using FMap =
+ IntervalMap<FileLoc, Function *,
+ IntervalMapImpl::NodeSizer<FileLoc, Function *>::LeafSize,
+ IntervalMapHalfOpenInfo<FileLoc>>;
+
DenseMap<Function *, FileLocRange> Functions;
+ FMap::Allocator FAllocator;
+ FMap FunctionsInverse = FMap(FAllocator);
DenseMap<BasicBlock *, FileLocRange> Blocks;
+ using BBMap =
+ IntervalMap<FileLoc, BasicBlock *,
+ IntervalMapImpl::NodeSizer<FileLoc, BasicBlock *>::LeafSize,
+ IntervalMapHalfOpenInfo<FileLoc>>;
+ BBMap::Allocator BBAllocator;
+ BBMap BlocksInverse = BBMap(BBAllocator);
DenseMap<Instruction *, FileLocRange> Instructions;
+ using IMap =
+ IntervalMap<FileLoc, Instruction *,
+ IntervalMapImpl::NodeSizer<FileLoc, Instruction *>::LeafSize,
+ IntervalMapHalfOpenInfo<FileLoc>>;
+ IMap::Allocator IAllocator;
+ IMap InstructionsInverse = IMap(IAllocator);
public:
LLVM_ABI std::optional<FileLocRange>
diff --git a/llvm/include/llvm/AsmParser/FileLoc.h b/llvm/include/llvm/AsmParser/FileLoc.h
index 02c1849..90d8485 100644
--- a/llvm/include/llvm/AsmParser/FileLoc.h
+++ b/llvm/include/llvm/AsmParser/FileLoc.h
@@ -21,6 +21,10 @@ struct FileLoc {
/// 0-based column number
unsigned Col;
+ bool operator==(const FileLoc &RHS) const {
+ return Line == RHS.Line && Col == RHS.Col;
+ }
+
bool operator<=(const FileLoc &RHS) const {
return Line < RHS.Line || (Line == RHS.Line && Col <= RHS.Col);
}
@@ -29,6 +33,7 @@ struct FileLoc {
return Line < RHS.Line || (Line == RHS.Line && Col < RHS.Col);
}
+ FileLoc() : Line(0), Col(0) {}
FileLoc(unsigned L, unsigned C) : Line(L), Col(C) {}
FileLoc(std::pair<unsigned, unsigned> LC) : Line(LC.first), Col(LC.second) {}
};
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 437cf1a..27a9019 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -371,11 +371,13 @@ enum NodeType {
/// RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift. The first
/// operand is the value to be shifted, and the second argument is the amount
- /// to shift by. Both must be integers of the same bit width (W). If the true
- /// value of LHS << RHS exceeds the largest value that can be represented by
- /// W bits, the resulting value is this maximum value, Otherwise, if this
- /// value is less than the smallest value that can be represented by W bits,
- /// the resulting value is this minimum value.
+ /// to shift by. Both must be integers. After legalization the type of the
+ /// shift amount is known to be TLI.getShiftAmountTy(). Before legalization
+ /// the shift amount can be any type, but care must be taken to ensure it is
+ /// large enough. If the true value of LHS << RHS exceeds the largest value
+ /// that can be represented by W bits, the resulting value is this maximum
+ /// value. Otherwise, if this value is less than the smallest value that can
+ /// be represented by W bits, the resulting value is this minimum value.
SSHLSAT,
USHLSAT,
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 1138566..2eb4fd3 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -762,12 +762,6 @@ public:
/// applied to any type.
///
LLVM_ABI bool isCommutative() const LLVM_READONLY;
-
- /// Checks if the operand is commutative. In commutative operations, not all
- /// operands might commutable, e.g. for fmuladd only 2 first operands are
- /// commutable.
- LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY;
-
static bool isCommutative(unsigned Opcode) {
switch (Opcode) {
case Add: case FAdd:
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 0b25baa..0622bfa 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -101,12 +101,6 @@ public:
}
}
- /// Return true if the operand is commutable.
- bool isCommutableOperand(unsigned Op) const {
- constexpr unsigned NumCommutativeOps = 2;
- return isCommutative() && Op < NumCommutativeOps;
- }
-
/// Checks if the intrinsic is an annotation.
bool isAssumeLikeIntrinsic() const {
switch (getIntrinsicID()) {
diff --git a/llvm/include/llvm/Extensions/PassPlugin.h b/llvm/include/llvm/Plugins/PassPlugin.h
index a9db567..6ca53b4 100644
--- a/llvm/include/llvm/Extensions/PassPlugin.h
+++ b/llvm/include/llvm/Plugins/PassPlugin.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_EXTENSIONS_PASSPLUGIN_H
-#define LLVM_EXTENSIONS_PASSPLUGIN_H
+#ifndef LLVM_PLUGINS_PASSPLUGIN_H
+#define LLVM_PLUGINS_PASSPLUGIN_H
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CodeGen.h"
@@ -139,4 +139,4 @@ llvmGetPassPluginInfo();
#pragma clang diagnostic pop
#endif
-#endif /* LLVM_EXTENSIONS_PASSPLUGIN_H */
+#endif /* LLVM_PLUGINS_PASSPLUGIN_H */
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index e6f14b9..994bb61 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -478,8 +478,8 @@ def saddsat : SDNode<"ISD::SADDSAT" , SDTIntBinOp, [SDNPCommutative]>;
def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>;
def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>;
def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>;
-def sshlsat : SDNode<"ISD::SSHLSAT" , SDTIntBinOp>;
-def ushlsat : SDNode<"ISD::USHLSAT" , SDTIntBinOp>;
+def sshlsat : SDNode<"ISD::SSHLSAT" , SDTIntShiftOp>;
+def ushlsat : SDNode<"ISD::USHLSAT" , SDTIntShiftOp>;
def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
diff --git a/llvm/lib/AsmParser/AsmParserContext.cpp b/llvm/lib/AsmParser/AsmParserContext.cpp
index 59d3ffc..453e699 100644
--- a/llvm/lib/AsmParser/AsmParserContext.cpp
+++ b/llvm/lib/AsmParser/AsmParserContext.cpp
@@ -33,57 +33,63 @@ AsmParserContext::getInstructionLocation(const Instruction *I) const {
Function *
AsmParserContext::getFunctionAtLocation(const FileLocRange &Query) const {
- for (auto &[F, Loc] : Functions) {
- if (Loc.contains(Query))
- return F;
- }
+ auto It = FunctionsInverse.find(Query.Start);
+ if (It.stop() <= Query.End)
+ return *It;
return nullptr;
}
Function *AsmParserContext::getFunctionAtLocation(const FileLoc &Query) const {
- return getFunctionAtLocation(FileLocRange(Query, Query));
+ return FunctionsInverse.lookup(Query, nullptr);
}
BasicBlock *
AsmParserContext::getBlockAtLocation(const FileLocRange &Query) const {
- for (auto &[BB, Loc] : Blocks) {
- if (Loc.contains(Query))
- return BB;
- }
+ auto It = BlocksInverse.find(Query.Start);
+ if (It.stop() <= Query.End)
+ return *It;
return nullptr;
}
BasicBlock *AsmParserContext::getBlockAtLocation(const FileLoc &Query) const {
- return getBlockAtLocation(FileLocRange(Query, Query));
+ return BlocksInverse.lookup(Query, nullptr);
}
Instruction *
AsmParserContext::getInstructionAtLocation(const FileLocRange &Query) const {
- for (auto &[I, Loc] : Instructions) {
- if (Loc.contains(Query))
- return I;
- }
+ auto It = InstructionsInverse.find(Query.Start);
+ if (It.stop() <= Query.End)
+ return *It;
return nullptr;
}
Instruction *
AsmParserContext::getInstructionAtLocation(const FileLoc &Query) const {
- return getInstructionAtLocation(FileLocRange(Query, Query));
+ return InstructionsInverse.lookup(Query, nullptr);
}
bool AsmParserContext::addFunctionLocation(Function *F,
const FileLocRange &Loc) {
- return Functions.insert({F, Loc}).second;
+ bool Inserted = Functions.insert({F, Loc}).second;
+ if (Inserted)
+ FunctionsInverse.insert(Loc.Start, Loc.End, F);
+ return Inserted;
}
bool AsmParserContext::addBlockLocation(BasicBlock *BB,
const FileLocRange &Loc) {
- return Blocks.insert({BB, Loc}).second;
+ bool Inserted = Blocks.insert({BB, Loc}).second;
+ if (Inserted)
+ BlocksInverse.insert(Loc.Start, Loc.End, BB);
+ return Inserted;
}
bool AsmParserContext::addInstructionLocation(Instruction *I,
const FileLocRange &Loc) {
- return Instructions.insert({I, Loc}).second;
+ bool Inserted = Instructions.insert({I, Loc}).second;
+ if (Inserted)
+ InstructionsInverse.insert(Loc.Start, Loc.End, I);
+ return Inserted;
}
} // namespace llvm
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index a943297..d1c4ff5 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -41,6 +41,7 @@ add_subdirectory(AsmParser)
add_subdirectory(LineEditor)
add_subdirectory(ProfileData)
add_subdirectory(Passes)
+add_subdirectory(Plugins)
add_subdirectory(TargetParser)
add_subdirectory(TextAPI)
add_subdirectory(Telemetry)
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index b8b0d4d..653c4e8 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -562,6 +562,7 @@ bool GlobalMergeImpl::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
MergedGV->setAlignment(MaxAlign);
MergedGV->setSection(Globals[i]->getSection());
+ MergedGV->setComdat(Globals[i]->getComdat());
LLVM_DEBUG(dbgs() << "MergedGV: " << *MergedGV << "\n");
@@ -677,7 +678,8 @@ bool GlobalMergeImpl::run(Module &M) {
IsMachO = M.getTargetTriple().isOSBinFormatMachO();
auto &DL = M.getDataLayout();
- MapVector<std::pair<unsigned, StringRef>, SmallVector<GlobalVariable *, 0>>
+ MapVector<std::tuple<unsigned, StringRef, Comdat *>,
+ SmallVector<GlobalVariable *, 0>>
Globals, ConstGlobals, BSSGlobals;
bool Changed = false;
setMustKeepGlobalVariables(M);
@@ -735,11 +737,11 @@ bool GlobalMergeImpl::run(Module &M) {
if (CanMerge) {
if (TM &&
TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSS())
- BSSGlobals[{AddressSpace, Section}].push_back(&GV);
+ BSSGlobals[{AddressSpace, Section, GV.getComdat()}].push_back(&GV);
else if (GV.isConstant())
- ConstGlobals[{AddressSpace, Section}].push_back(&GV);
+ ConstGlobals[{AddressSpace, Section, GV.getComdat()}].push_back(&GV);
else
- Globals[{AddressSpace, Section}].push_back(&GV);
+ Globals[{AddressSpace, Section, GV.getComdat()}].push_back(&GV);
}
LLVM_DEBUG(dbgs() << "GV " << (CanMerge ? "" : "not ") << "to merge: " << GV
<< "\n");
@@ -747,16 +749,16 @@ bool GlobalMergeImpl::run(Module &M) {
for (auto &P : Globals)
if (P.second.size() > 1)
- Changed |= doMerge(P.second, M, false, P.first.first);
+ Changed |= doMerge(P.second, M, false, std::get<0>(P.first));
for (auto &P : BSSGlobals)
if (P.second.size() > 1)
- Changed |= doMerge(P.second, M, false, P.first.first);
+ Changed |= doMerge(P.second, M, false, std::get<0>(P.first));
if (Opt.MergeConstantGlobals)
for (auto &P : ConstGlobals)
if (P.second.size() > 1)
- Changed |= doMerge(P.second, M, true, P.first.first);
+ Changed |= doMerge(P.second, M, true, std::get<0>(P.first));
return Changed;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 73f59b4..cb42d42 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1291,7 +1291,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::SRL:
case ISD::SRA:
case ISD::ROTL:
- case ISD::ROTR: {
+ case ISD::ROTR:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT: {
// Legalizing shifts/rotates requires adjusting the shift amount
// to the appropriate width.
SDValue Op0 = Node->getOperand(0);
@@ -1306,8 +1308,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
if (SAO != Op1)
NewNode = DAG.UpdateNodeOperands(Node, Op0, SAO);
}
+ break;
}
- break;
case ISD::FSHL:
case ISD::FSHR:
case ISD::SRL_PARTS:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 981db6d..67c4ecc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1144,7 +1144,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
// FIXME: We need vp-aware PromotedInteger functions.
if (IsShift) {
Op1 = GetPromotedInteger(Op1);
- Op2 = ZExtPromotedInteger(Op2);
+ if (getTypeAction(Op2.getValueType()) == TargetLowering::TypePromoteInteger)
+ Op2 = ZExtPromotedInteger(Op2);
} else {
Op1 = SExtPromotedInteger(Op1);
Op2 = SExtPromotedInteger(Op2);
@@ -2052,7 +2053,11 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::SRA:
case ISD::SRL:
case ISD::ROTL:
- case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+ case ISD::ROTR:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ Res = PromoteIntOp_Shift(N);
+ break;
case ISD::SCMP:
case ISD::UCMP: Res = PromoteIntOp_CMP(N); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4482df1..891f584 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7826,6 +7826,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
[[fallthrough]];
case ISD::ROTL:
case ISD::ROTR:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
assert(VT == N1.getValueType() &&
"Shift operators return type must be the same as their first arg");
assert(VT.isInteger() && N2.getValueType().isInteger() &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b8da601..9e342f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7343,16 +7343,26 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
return;
}
- case Intrinsic::sshl_sat: {
- SDValue Op1 = getValue(I.getArgOperand(0));
- SDValue Op2 = getValue(I.getArgOperand(1));
- setValue(&I, DAG.getNode(ISD::SSHLSAT, sdl, Op1.getValueType(), Op1, Op2));
- return;
- }
+ case Intrinsic::sshl_sat:
case Intrinsic::ushl_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
- setValue(&I, DAG.getNode(ISD::USHLSAT, sdl, Op1.getValueType(), Op1, Op2));
+
+ EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
+ Op1.getValueType(), DAG.getDataLayout());
+
+ // Coerce the shift amount to the right type if we can. This exposes the
+ // truncate or zext to optimization early.
+ if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
+ assert(ShiftTy.getSizeInBits() >=
+ Log2_32_Ceil(Op1.getValueSizeInBits()) &&
+ "Unexpected shift type");
+ Op2 = DAG.getZExtOrTrunc(Op2, getCurSDLoc(), ShiftTy);
+ }
+
+ unsigned Opc =
+ Intrinsic == Intrinsic::sshl_sat ? ISD::SSHLSAT : ISD::USHLSAT;
+ setValue(&I, DAG.getNode(Opc, sdl, Op1.getValueType(), Op1, Op2));
return;
}
case Intrinsic::smul_fix:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 29e13c5..d5daba9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11053,8 +11053,7 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
assert((Node->getOpcode() == ISD::SSHLSAT ||
Node->getOpcode() == ISD::USHLSAT) &&
- "Expected a SHLSAT opcode");
- assert(VT == RHS.getValueType() && "Expected operands to be the same type");
+ "Expected a SHLSAT opcode");
assert(VT.isInteger() && "Expected operands to be integers");
if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 971c3e5..82e5661 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -431,6 +431,11 @@ bool SSPLayoutAnalysis::requiresStackProtector(Function *F,
for (const BasicBlock &BB : *F) {
for (const Instruction &I : BB) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (const MDNode *MD = AI->getMetadata("stack-protector")) {
+ const auto *CI = mdconst::dyn_extract<ConstantInt>(MD->getOperand(0));
+ if (CI && CI->isZero())
+ continue;
+ }
if (AI->isArrayAllocation()) {
auto RemarkBuilder = [&]() {
return OptimizationRemark(DEBUG_TYPE, "StackProtectorAllocaOrArray",
diff --git a/llvm/lib/Extensions/CMakeLists.txt b/llvm/lib/Extensions/CMakeLists.txt
index 0bfca3e..c1007df 100644
--- a/llvm/lib/Extensions/CMakeLists.txt
+++ b/llvm/lib/Extensions/CMakeLists.txt
@@ -1,6 +1,5 @@
add_llvm_component_library(LLVMExtensions
Extensions.cpp
- PassPlugin.cpp
LINK_COMPONENTS
Support
diff --git a/llvm/lib/Extensions/Extensions.cpp b/llvm/lib/Extensions/Extensions.cpp
index f8e1be1..3e9f76d 100644
--- a/llvm/lib/Extensions/Extensions.cpp
+++ b/llvm/lib/Extensions/Extensions.cpp
@@ -1,4 +1,4 @@
-#include "llvm/Extensions/PassPlugin.h"
+#include "llvm/Plugins/PassPlugin.h"
#define HANDLE_EXTENSION(Ext) \
llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
#include "llvm/Support/Extension.def"
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 7682c28..f3d4d24 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1293,13 +1293,6 @@ bool Instruction::isCommutative() const {
return isCommutative(getOpcode());
}
-bool Instruction::isCommutableOperand(unsigned Op) const {
- if (auto *II = dyn_cast<IntrinsicInst>(this))
- return II->isCommutableOperand(Op);
- // TODO: Should allow icmp/fcmp?
- return isCommutative(getOpcode());
-}
-
unsigned Instruction::getNumSuccessors() const {
switch (getOpcode()) {
#define HANDLE_TERM_INST(N, OPC, CLASS) \
diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 057d73b..cf455ff 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -34,6 +34,7 @@ add_llvm_component_library(LLVMLTO
ObjCARC
Object
Passes
+ Plugins
Remarks
Scalar
Support
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index bfda6de..e998ac9 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -22,7 +22,6 @@
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CGData/CodeGenData.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
@@ -32,6 +31,7 @@
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
diff --git a/llvm/lib/Plugins/CMakeLists.txt b/llvm/lib/Plugins/CMakeLists.txt
new file mode 100644
index 0000000..564be0c
--- /dev/null
+++ b/llvm/lib/Plugins/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_llvm_component_library(LLVMPlugins
+ PassPlugin.cpp
+
+ LINK_COMPONENTS
+ Support
+)
diff --git a/llvm/lib/Extensions/PassPlugin.cpp b/llvm/lib/Plugins/PassPlugin.cpp
index 77dc5ae..84d55fc 100644
--- a/llvm/lib/Extensions/PassPlugin.cpp
+++ b/llvm/lib/Plugins/PassPlugin.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Extensions/PassPlugin.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 34f492a..ee43448 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1187,6 +1187,9 @@ bool AArch64RegisterInfo::getRegAllocationHints(
case AArch64::DestructiveBinaryImm:
AddHintIfSuitable(R, Def.getOperand(2));
break;
+ case AArch64::DestructiveUnaryPassthru:
+ AddHintIfSuitable(R, Def.getOperand(3));
+ break;
}
}
}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9558cb5..fd177e1 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -3165,7 +3165,7 @@ multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
- def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>;
+ def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype), FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>;
}
@@ -3185,7 +3185,7 @@ multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
- def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>;
+ def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype), FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>;
}
@@ -3205,9 +3205,9 @@ multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H_UNDEF)>;
@@ -4235,7 +4235,7 @@ multiclass sve2_int_un_pred_arit_s<bits<2> opc, string asm,
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
defm : SVE_3_Op_Undef_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Pseudo>(NAME # _S_UNDEF)>;
}
@@ -4255,10 +4255,10 @@ multiclass sve2_int_un_pred_arit<bits<2> opc, string asm, SDPatternOperator op>
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_3_Op_Undef_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
defm : SVE_3_Op_Undef_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
@@ -4957,10 +4957,10 @@ multiclass sve_int_un_pred_arit<bits<3> opc, string asm,
def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
@@ -4993,9 +4993,9 @@ multiclass sve_int_un_pred_arit_h<bits<3> opc, string asm,
def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>;
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_InReg_Extend_PassthruUndef<nxv8i16, op, nxv8i1, nxv8i8, !cast<Pseudo>(NAME # _H_UNDEF)>;
defm : SVE_InReg_Extend_PassthruUndef<nxv4i32, op, nxv4i1, nxv4i8, !cast<Pseudo>(NAME # _S_UNDEF)>;
@@ -5022,8 +5022,8 @@ multiclass sve_int_un_pred_arit_w<bits<3> opc, string asm,
def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>;
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_InReg_Extend_PassthruUndef<nxv4i32, op, nxv4i1, nxv4i16, !cast<Pseudo>(NAME # _S_UNDEF)>;
defm : SVE_InReg_Extend_PassthruUndef<nxv2i64, op, nxv2i1, nxv2i16, !cast<Pseudo>(NAME # _D_UNDEF)>;
@@ -5044,7 +5044,7 @@ multiclass sve_int_un_pred_arit_d<bits<3> opc, string asm,
def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_InReg_Extend_PassthruUndef<nxv2i64, op, nxv2i1, nxv2i32, !cast<Pseudo>(NAME # _D_UNDEF)>;
}
@@ -5071,10 +5071,10 @@ multiclass sve_int_un_pred_arit_bitwise<bits<3> opc, string asm,
def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
@@ -5113,9 +5113,9 @@ multiclass sve_int_un_pred_arit_bitwise_fp<bits<3> opc, string asm,
def : SVE_1_Op_Passthru_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, !cast<Instruction>(NAME # _H)>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
@@ -5142,9 +5142,9 @@ multiclass sve_int_un_pred_arit_bitwise_fp_z<bits<3> opc, string asm, SDPatternO
}
multiclass sve_fp_un_pred_arit_hsd<SDPatternOperator op> {
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
@@ -5155,10 +5155,10 @@ multiclass sve_fp_un_pred_arit_hsd<SDPatternOperator op> {
}
multiclass sve_int_un_pred_arit_bhsd<SDPatternOperator op> {
- def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
- def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
- def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
- def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+ def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 71ea9ef..0a262b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -236,6 +236,9 @@ public:
FastMathFlags FMF) const;
Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
FastMathFlags FMF) const;
+ Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
+ FastMathFlags DivFMF, const Instruction *CtxI,
+ bool IsNegative) const;
bool tryNarrowMathIfNoOverflow(Instruction *I);
@@ -605,6 +608,96 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
return Builder.CreateFMul(Rsq, OutputScaleFactor);
}
+/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
+/// v_rsq_f64. This should give a 1ulp result.
+Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
+ FastMathFlags SqrtFMF,
+ FastMathFlags DivFMF,
+ const Instruction *CtxI,
+ bool IsNegative) const {
+ // rsq(x):
+ // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
+ // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
+ // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
+ //
+ // -rsq(x):
+ // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
+ // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
+ // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
+ //
+ // The rsq instruction handles the special cases correctly. We need to check
+ // for the edge case conditions to ensure the special case propagates through
+ // the later instructions.
+
+ Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
+
+ // Try to elide the edge case check.
+ //
+ // Fast math flags imply:
+ // sqrt ninf => !isinf(x)
+ // fdiv ninf => x != 0, !isinf(x)
+ bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
+ bool MaybeZero = !DivFMF.noInfs();
+
+ DenormalMode DenormMode;
+ FPClassTest Interested = fcNone;
+ if (MaybePosInf)
+ Interested = fcPosInf;
+ if (MaybeZero)
+ Interested |= fcZero;
+
+ if (Interested != fcNone) {
+ KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
+ if (KnownSrc.isKnownNeverPosInfinity())
+ MaybePosInf = false;
+
+ DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
+ if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
+ MaybeZero = false;
+ }
+
+ Value *SpecialOrRsq = X;
+ if (MaybeZero || MaybePosInf) {
+ Value *Cond;
+ if (MaybePosInf && MaybeZero) {
+ if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
+ FPClassTest TestMask = fcPosInf | fcZero;
+ if (DenormMode.inputsAreZero())
+ TestMask |= fcSubnormal;
+
+ Cond = Builder.createIsFPClass(X, TestMask);
+ } else {
+ // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
+ // doesn't respect the floating-point environment.
+ Value *IsZero =
+ Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+ Value *IsInf =
+ Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+ Cond = Builder.CreateOr(IsZero, IsInf);
+ }
+ } else if (MaybeZero) {
+ Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+ } else {
+ Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+ }
+
+ SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
+ }
+
+ Value *NegY0 = Builder.CreateFNeg(Y0);
+ Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
+
+ // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
+ Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
+
+ Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
+
+ Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
+ ConstantFP::get(X->getType(), 0.5));
+
+ return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
+}
+
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
FastMathFlags DivFMF,
FastMathFlags SqrtFMF) const {
@@ -612,8 +705,22 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
return false;
- // v_rsq_f32 gives 1ulp
- return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
+ Type *EltTy = SqrtOp->getType()->getScalarType();
+ switch (EltTy->getTypeID()) {
+ case Type::FloatTyID:
+ // v_rsq_f32 gives 1ulp
+ // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
+
+ // FIXME: rsq formation should not depend on approx func or the fpmath
+ // accuracy. This strictly improves precision.
+ return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
+ case Type::DoubleTyID:
+ return true;
+ default:
+ return false;
+ }
+
+ llvm_unreachable("covered switch");
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -629,8 +736,6 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
if (!CLHS)
return nullptr;
- assert(Den->getType()->isFloatTy());
-
bool IsNegative = false;
// TODO: Handle other numerator values with arcp.
@@ -639,14 +744,20 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
IRBuilder<>::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(DivFMF | SqrtFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
- canIgnoreDenormalInput(Den, CtxI)) {
- Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
- // -1.0 / sqrt(x) -> fneg(rsq(x))
- return IsNegative ? Builder.CreateFNeg(Result) : Result;
+ if (Den->getType()->isFloatTy()) {
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+ canIgnoreDenormalInput(Den, CtxI)) {
+ Value *Result =
+ Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
+ // -1.0 / sqrt(x) -> fneg(rsq(x))
+ return IsNegative ? Builder.CreateFNeg(Result) : Result;
+ }
+
+ return emitRsqIEEE1ULP(Builder, Den, IsNegative);
}
- return emitRsqIEEE1ULP(Builder, Den, IsNegative);
+ if (Den->getType()->isDoubleTy())
+ return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
}
return nullptr;
@@ -758,6 +869,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
return Rsq;
}
+ if (!Num->getType()->isFloatTy())
+ return nullptr;
+
Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
if (Rcp)
return Rcp;
@@ -793,7 +907,8 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
return false;
Type *Ty = FDiv.getType()->getScalarType();
- if (!Ty->isFloatTy())
+ const bool IsFloat = Ty->isFloatTy();
+ if (!IsFloat && !Ty->isDoubleTy())
return false;
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
@@ -818,6 +933,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
RsqOp = SqrtOp->getOperand(0);
}
+ // rcp path not yet implemented for f64.
+ if (!IsFloat && !RsqOp)
+ return false;
+
// Inaccurate rcp is allowed with afn.
//
// Defer to codegen to handle this.
@@ -832,7 +951,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
return false;
// Defer the correct implementations to codegen.
- if (ReqdAccuracy < 1.0f)
+ if (IsFloat && ReqdAccuracy < 1.0f)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index cc31d7d..f21b87c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -684,10 +684,12 @@ bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == V2S16);
unsigned Opc = MI.getOpcode();
+ unsigned NumOps = MI.getNumOperands();
auto Flags = MI.getFlags();
- if (MI.getNumOperands() == 2) {
- auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
+ auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
+
+ if (NumOps == 2) {
auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
B.buildMergeLikeInstr(Dst, {Lo, Hi});
@@ -695,11 +697,20 @@ bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
return true;
}
- assert(MI.getNumOperands() == 3);
- auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
- auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
- auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+
+ if (NumOps == 3) {
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
+ B.buildMergeLikeInstr(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(NumOps == 4);
+ auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
+ auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
+ auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
return true;
@@ -971,6 +982,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::fixed_vector(2, 16);
case SgprV2S32:
case VgprV2S32:
+ case UniInVgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
case SgprV4S32_WF:
@@ -1074,6 +1086,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
+ case UniInVgprV2S32:
case UniInVgprV4S32:
case UniInVgprB32:
case UniInVgprB64:
@@ -1209,6 +1222,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
+ case UniInVgprV2S32:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == SgprRB);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 63135fe..dee8488 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -120,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
case UniV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
+ case UniV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
case UniB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
case UniB64:
@@ -160,6 +162,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
case DivV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
+ case DivV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -968,6 +972,30 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
+ addRulesForGOpcs({G_FMAD}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
+
+ addRulesForGOpcs({G_FMA}, Standard)
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
+ .Uni(V2S16,
+ {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16},
+ hasSALUFloat)
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}},
+ !hasSALUFloat);
+
// FNEG and FABS are either folded as source modifiers or can be selected as
// bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
// targets without SALU float we still select them as VGPR since there would
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 796c227..1fdf398 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55154,6 +55154,19 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
cast<ShuffleVectorSDNode>(Op)->getMask());
break;
}
+ case ISD::CONCAT_VECTORS: {
+ // Limited to FNEG to ensure we don't create orphan nodes via isFNEG.
+ SmallVector<SDValue, 4> SubOps;
+ if (collectConcatOps(N, SubOps, DAG) &&
+ llvm::all_of(SubOps, [](SDValue SubOp) {
+ return SubOp.getOpcode() == ISD::FNEG;
+ })) {
+ for (SDValue &SubOp : SubOps)
+ SubOp = SubOp.getOperand(0);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, SubOps);
+ }
+ break;
+ }
case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8df13318..b8e0971 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -573,27 +573,6 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
return I->isCommutative();
}
-/// Checks if the operand is commutative. In commutative operations, not all
-/// operands might commutable, e.g. for fmuladd only 2 first operands are
-/// commutable.
-static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
- bool IsCopyable = false) {
- assert(::isCommutative(I, ValWithUses, IsCopyable) &&
- "The instruction is not commutative.");
- if (isa<CmpInst>(I))
- return true;
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- switch (BO->getOpcode()) {
- case Instruction::Sub:
- case Instruction::FSub:
- return true;
- default:
- break;
- }
- }
- return I->isCommutableOperand(Op);
-}
-
/// This is a helper function to check whether \p I is commutative.
/// This is a convenience wrapper that calls the two-parameter version of
/// isCommutative with the same instruction for both parameters. This is
@@ -5347,14 +5326,13 @@ private:
if (ScheduleCopyableDataMap.empty())
return false;
SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
+ SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
if (Entries.empty())
return false;
- unsigned CurNumOps = 0;
for (const Use &U : User->operands()) {
if (U.get() != Op)
continue;
- ++CurNumOps;
// Check all tree entries, if they have operands replaced by copyable
// data.
for (TreeEntry *TE : Entries) {
@@ -5387,43 +5365,27 @@ private:
// Same applies even for non-commutative cmps, because we can invert
// their predicate potentially and, thus, reorder the operands.
bool IsCommutativeUser =
- ::isCommutative(User) &&
- ::isCommutableOperand(User, User, U.getOperandNo());
- if (!IsCommutativeUser) {
- Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
- IsCommutativeUser =
- ::isCommutative(MainOp, User) &&
- ::isCommutableOperand(MainOp, User, U.getOperandNo());
- }
- // The commutative user with the same operands can be safely
- // considered as non-commutative, operands reordering does not change
- // the semantics.
- assert(
- (!IsCommutativeUser ||
- (((::isCommutative(User) &&
- ::isCommutableOperand(User, User, 0) &&
- ::isCommutableOperand(User, User, 1)) ||
- (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
- ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
- User, 0) &&
- ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
- User, 1))))) &&
- "Expected commutative user with 2 first commutable operands");
- bool IsCommutativeWithSameOps =
- IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
- if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
- !isa<CmpInst>(User)) {
+ ::isCommutative(User) ||
+ ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
+ if (!IsCommutativeUser && !isa<CmpInst>(User)) {
+ unsigned &OpCnt =
+ OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
EdgeInfo EI(TE, U.getOperandNo());
- if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
+ if (!getScheduleCopyableData(EI, Op))
continue;
- return false;
+ // Found copyable operand - continue.
+ OpCnt += Inc;
+ continue;
}
PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
.first->getSecond() += Inc;
}
}
if (PotentiallyReorderedEntriesCount.empty())
- return true;
+ return all_of(OrderedEntriesCount,
+ [&](const std::pair<const TreeEntry *, unsigned> &P) {
+ return P.second == NumOps;
+ });
// Check the commutative/cmp entries.
for (auto &P : PotentiallyReorderedEntriesCount) {
SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5469,6 +5431,10 @@ private:
return all_of(PotentiallyReorderedEntriesCount,
[&](const std::pair<const TreeEntry *, unsigned> &P) {
return P.second == NumOps - 1;
+ }) &&
+ all_of(OrderedEntriesCount,
+ [&](const std::pair<const TreeEntry *, unsigned> &P) {
+ return P.second == NumOps;
});
}
@@ -5690,7 +5656,6 @@ private:
}
};
- SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
for (ScheduleBundle *Bundle : Bundles) {
if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
break;
@@ -5698,6 +5663,7 @@ private:
// Need to search for the lane since the tree entry can be
// reordered.
auto *It = find(Bundle->getTreeEntry()->Scalars, In);
+ SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
bool IsNonSchedulableWithParentPhiNode =
Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10907,9 +10873,7 @@ class InstructionsCompatibilityAnalysis {
Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
Opcode == Instruction::And || Opcode == Instruction::Or ||
- Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
- Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
- Opcode == Instruction::FDiv;
+ Opcode == Instruction::Xor;
}
/// Identifies the best candidate value, which represents main opcode
@@ -11250,10 +11214,6 @@ public:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::FAdd:
- case Instruction::FMul:
- case Instruction::FSub:
- case Instruction::FDiv:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9239cb1..eda7f0b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2280,8 +2280,7 @@ bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
BinaryOperator *BinOp;
ArrayRef<int> OuterMask;
- if (!match(&I,
- m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
+ if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
return false;
// Don't introduce poison into div/rem.
@@ -2290,12 +2289,10 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
Value *Op00, *Op01, *Op10, *Op11;
ArrayRef<int> Mask0, Mask1;
- bool Match0 =
- match(BinOp->getOperand(0),
- m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
- bool Match1 =
- match(BinOp->getOperand(1),
- m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+ bool Match0 = match(BinOp->getOperand(0),
+ m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
+ bool Match1 = match(BinOp->getOperand(1),
+ m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
if (!Match0 && !Match1)
return false;
@@ -2340,22 +2337,35 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
+ InstructionCost NewCost = 0;
// Try to merge shuffles across the binop if the new shuffles are not costly.
+ InstructionCost BinOpCost =
+ TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
- BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
- if (Match0)
- OldCost += TTI.getShuffleCost(
+ BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ ShuffleDstTy, BinOpTy, OuterMask, CostKind,
+ 0, nullptr, {BinOp}, &I);
+ if (!BinOp->hasOneUse())
+ NewCost += BinOpCost;
+
+ if (Match0) {
+ InstructionCost Shuf0Cost = TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
- if (Match1)
- OldCost += TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
+ OldCost += Shuf0Cost;
+ if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
+ NewCost += Shuf0Cost;
+ }
+ if (Match1) {
+ InstructionCost Shuf1Cost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
+ OldCost += Shuf1Cost;
+ if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
+ NewCost += Shuf1Cost;
+ }
- InstructionCost NewCost =
- TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
if (!IsIdentity0)
NewCost +=
diff --git a/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll b/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll
new file mode 100644
index 0000000..7108d82
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-global-merge -global-merge-group-by-use=false < %s | FileCheck %s
+; CHECK-NOT: _MergedGlobals
+
+$__profc_begin = comdat nodeduplicate
+$__profc_end = comdat nodeduplicate
+
+@__profc_begin = private global [2 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
+@__profd_begin = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -1301828029439649651, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_begin to i64), i64 ptrtoint (ptr @__profd_begin to i64)), i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_begin), align 8
+@__profc_end = private global [2 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
+@__profd_end = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 3274037854792712831, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_end to i64), i64 ptrtoint (ptr @__profd_end to i64)), i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_end), align 8
+
diff --git a/llvm/test/CodeGen/AArch64/sched-movprfx.ll b/llvm/test/CodeGen/AArch64/sched-movprfx.ll
index 9e88d16..cf1c1f4 100644
--- a/llvm/test/CodeGen/AArch64/sched-movprfx.ll
+++ b/llvm/test/CodeGen/AArch64/sched-movprfx.ll
@@ -14,13 +14,14 @@ define <vscale x 2 x i64> @and_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: abs z0.d, p1/m, z2.d
+; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%data0 = tail call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> %c, i1 0)
%data1 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base,
i32 1,
<vscale x 2 x i1> %pg,
- <vscale x 2 x i64> undef)
+ <vscale x 2 x i64> %c)
%out = add <vscale x 2 x i64> %data0, %data1
ret <vscale x 2 x i64> %out
}
diff --git a/llvm/test/CodeGen/AArch64/stack-protector-metadata.ll b/llvm/test/CodeGen/AArch64/stack-protector-metadata.ll
new file mode 100644
index 0000000..95473be
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-protector-metadata.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-apple-darwin < %s -o - | FileCheck %s
+
+@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+
+; CHECK-LABEL: test1:
+; CHECK-NOT: ___stack_chk_guard
+
+; Function Attrs: noinline nounwind optnone
+define void @test1(ptr noundef %msg) #0 {
+entry:
+ %msg.addr = alloca ptr, align 8
+ %a = alloca [1000 x i8], align 1, !stack-protector !2
+ store ptr %msg, ptr %msg.addr, align 8
+ %arraydecay = getelementptr inbounds [1000 x i8], ptr %a, i64 0, i64 0
+ %0 = load ptr, ptr %msg.addr, align 8
+ %call = call ptr @strcpy(ptr noundef %arraydecay, ptr noundef %0) #3
+ %arraydecay1 = getelementptr inbounds [1000 x i8], ptr %a, i64 0, i64 0
+ %call2 = call i32 (ptr, ...) @printf(ptr noundef @.str, ptr noundef %arraydecay1)
+ ret void
+}
+
+
+; CHECK-LABEL: test2:
+; CHECK: ___stack_chk_guard
+
+; Function Attrs: noinline nounwind optnone
+define void @test2(ptr noundef %msg) #0 {
+entry:
+ %msg.addr = alloca ptr, align 8
+ %b = alloca [1000 x i8], align 1
+ store ptr %msg, ptr %msg.addr, align 8
+ %arraydecay = getelementptr inbounds [1000 x i8], ptr %b, i64 0, i64 0
+ %0 = load ptr, ptr %msg.addr, align 8
+ %call = call ptr @strcpy(ptr noundef %arraydecay, ptr noundef %0) #3
+ %arraydecay1 = getelementptr inbounds [1000 x i8], ptr %b, i64 0, i64 0
+ %call2 = call i32 (ptr, ...) @printf(ptr noundef @.str, ptr noundef %arraydecay1)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare ptr @strcpy(ptr noundef, ptr noundef) #1
+
+declare i32 @printf(ptr noundef, ...) #2
+
+attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" ssp }
+attributes #1 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 22.0.0"}
+!2 = !{i32 0}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index 7362395..ace0422 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1385,10 +1385,10 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: mov x11, #80 // =0x50
; CHECK-NEXT: mov x12, #32 // =0x20
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT: mov x13, #48 // =0x30
; CHECK-NEXT: mov x14, #16 // =0x10
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
@@ -1398,19 +1398,17 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-NEXT: abs z2.h, p0/m, z2.h
; CHECK-NEXT: abs z3.h, p0/m, z3.h
; CHECK-NEXT: abs z4.h, p0/m, z4.h
+; CHECK-NEXT: abs z5.h, p0/m, z5.h
+; CHECK-NEXT: abs z6.h, p0/m, z6.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: abs z1.h, p0/m, z5.h
; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
-; CHECK-NEXT: movprfx z2, z6
-; CHECK-NEXT: abs z2.h, p0/m, z6.h
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
; CHECK-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
-; CHECK-NEXT: st1h { z1.h }, p0, [x0, x13, lsl #1]
-; CHECK-NEXT: st1h { z2.h }, p0, [x0, x14, lsl #1]
+; CHECK-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
+; CHECK-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1]
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
index a8b2c30..c95fa96 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
@@ -599,19 +599,18 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: frintx z1.d, p0/m, z2.d
-; CHECK-NEXT: mov z4.d, z1.d[2]
+; CHECK-NEXT: frintx z2.d, p0/m, z2.d
+; CHECK-NEXT: mov z4.d, z2.d[2]
; CHECK-NEXT: mov z5.d, z0.d[2]
-; CHECK-NEXT: mov z2.d, z0.d[1]
-; CHECK-NEXT: mov z3.d, z1.d[3]
+; CHECK-NEXT: mov z1.d, z0.d[1]
+; CHECK-NEXT: mov z3.d, z2.d[3]
; CHECK-NEXT: mov z6.d, z0.d[3]
; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: mov z0.d, z1.d[1]
-; CHECK-NEXT: fcvtzs x10, d1
+; CHECK-NEXT: mov z0.d, z2.d[1]
+; CHECK-NEXT: fcvtzs x10, d2
; CHECK-NEXT: fcvtzs x11, d4
; CHECK-NEXT: fcvtzs x12, d5
-; CHECK-NEXT: fcvtzs x9, d2
+; CHECK-NEXT: fcvtzs x9, d1
; CHECK-NEXT: fcvtzs x13, d3
; CHECK-NEXT: fcvtzs x14, d6
; CHECK-NEXT: fcvtzs x15, d0
@@ -633,57 +632,55 @@ define <16 x i64> @llrint_v16f64(<16 x double> %x) nounwind {
; CHECK-LABEL: llrint_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d, vl2
-; CHECK-NEXT: // kill: def $q6 killed $q6 def $z6
; CHECK-NEXT: // kill: def $q4 killed $q4 def $z4
-; CHECK-NEXT: // kill: def $q7 killed $q7 def $z7
-; CHECK-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT: // kill: def $q6 killed $q6 def $z6
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-NEXT: splice z4.d, p1, z4.d, z5.d
; CHECK-NEXT: splice z2.d, p1, z2.d, z3.d
+; CHECK-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d
-; CHECK-NEXT: movprfx z3, z6
-; CHECK-NEXT: frintx z3.d, p0/m, z6.d
-; CHECK-NEXT: movprfx z1, z4
-; CHECK-NEXT: frintx z1.d, p0/m, z4.d
+; CHECK-NEXT: frintx z4.d, p0/m, z4.d
; CHECK-NEXT: frintx z2.d, p0/m, z2.d
+; CHECK-NEXT: frintx z6.d, p0/m, z6.d
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
-; CHECK-NEXT: mov z4.d, z3.d[2]
-; CHECK-NEXT: mov z5.d, z1.d[2]
-; CHECK-NEXT: mov z6.d, z2.d[3]
+; CHECK-NEXT: mov z3.d, z4.d[2]
+; CHECK-NEXT: mov z5.d, z2.d[3]
+; CHECK-NEXT: mov z1.d, z6.d[2]
; CHECK-NEXT: fcvtzs x11, d0
-; CHECK-NEXT: fcvtzs x12, d1
+; CHECK-NEXT: fcvtzs x12, d4
; CHECK-NEXT: fcvtzs x13, d2
-; CHECK-NEXT: fcvtzs x14, d3
-; CHECK-NEXT: mov z7.d, z3.d[3]
-; CHECK-NEXT: mov z16.d, z1.d[3]
-; CHECK-NEXT: fcvtzs x9, d4
-; CHECK-NEXT: fcvtzs x10, d5
-; CHECK-NEXT: mov z4.d, z2.d[2]
+; CHECK-NEXT: fcvtzs x14, d6
+; CHECK-NEXT: mov z7.d, z6.d[3]
+; CHECK-NEXT: mov z16.d, z0.d[3]
+; CHECK-NEXT: fcvtzs x10, d3
+; CHECK-NEXT: mov z3.d, z2.d[2]
+; CHECK-NEXT: fcvtzs x8, d5
; CHECK-NEXT: mov z5.d, z0.d[2]
-; CHECK-NEXT: fcvtzs x8, d6
+; CHECK-NEXT: fcvtzs x9, d1
+; CHECK-NEXT: mov z1.d, z4.d[3]
; CHECK-NEXT: mov z2.d, z2.d[1]
-; CHECK-NEXT: mov z6.d, z0.d[3]
-; CHECK-NEXT: mov z1.d, z1.d[1]
-; CHECK-NEXT: mov z3.d, z3.d[1]
-; CHECK-NEXT: fcvtzs x15, d4
-; CHECK-NEXT: mov z4.d, z0.d[1]
+; CHECK-NEXT: mov z17.d, z6.d[1]
+; CHECK-NEXT: fcvtzs x17, d7
+; CHECK-NEXT: fcvtzs x15, d3
+; CHECK-NEXT: mov z3.d, z0.d[1]
; CHECK-NEXT: fmov d0, x11
; CHECK-NEXT: fcvtzs x16, d5
+; CHECK-NEXT: mov z5.d, z4.d[1]
+; CHECK-NEXT: fmov d4, x12
; CHECK-NEXT: fcvtzs x11, d2
; CHECK-NEXT: fmov d2, x13
-; CHECK-NEXT: fcvtzs x17, d7
-; CHECK-NEXT: fcvtzs x18, d16
-; CHECK-NEXT: fcvtzs x0, d3
-; CHECK-NEXT: fcvtzs x13, d4
-; CHECK-NEXT: fmov d4, x12
-; CHECK-NEXT: fcvtzs x12, d6
+; CHECK-NEXT: fcvtzs x12, d16
+; CHECK-NEXT: fcvtzs x13, d3
; CHECK-NEXT: fmov d6, x14
-; CHECK-NEXT: fcvtzs x14, d1
+; CHECK-NEXT: fcvtzs x18, d1
+; CHECK-NEXT: fcvtzs x14, d5
+; CHECK-NEXT: fcvtzs x0, d17
; CHECK-NEXT: fmov d3, x15
; CHECK-NEXT: fmov d1, x16
; CHECK-NEXT: fmov d5, x10
@@ -691,9 +688,9 @@ define <16 x i64> @llrint_v16f64(<16 x double> %x) nounwind {
; CHECK-NEXT: mov v2.d[1], x11
; CHECK-NEXT: mov v0.d[1], x13
; CHECK-NEXT: mov v3.d[1], x8
-; CHECK-NEXT: mov v6.d[1], x0
; CHECK-NEXT: mov v4.d[1], x14
; CHECK-NEXT: mov v1.d[1], x12
+; CHECK-NEXT: mov v6.d[1], x0
; CHECK-NEXT: mov v5.d[1], x18
; CHECK-NEXT: mov v7.d[1], x17
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
index 465ba38..2b8e340 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
@@ -635,54 +635,53 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) nounwind {
; CHECK-i32-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-i32-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-i32-NEXT: ptrue p0.s, vl8
-; CHECK-i32-NEXT: movprfx z1, z2
-; CHECK-i32-NEXT: frintx z1.s, p0/m, z2.s
+; CHECK-i32-NEXT: frintx z2.s, p0/m, z2.s
; CHECK-i32-NEXT: frintx z0.s, p0/m, z0.s
-; CHECK-i32-NEXT: mov z2.s, z1.s[5]
-; CHECK-i32-NEXT: mov z3.s, z1.s[4]
+; CHECK-i32-NEXT: mov z1.s, z2.s[5]
+; CHECK-i32-NEXT: mov z3.s, z2.s[4]
; CHECK-i32-NEXT: mov z5.s, z0.s[5]
; CHECK-i32-NEXT: mov z7.s, z0.s[1]
; CHECK-i32-NEXT: fcvtzs w11, s0
-; CHECK-i32-NEXT: fcvtzs w13, s1
-; CHECK-i32-NEXT: mov z4.s, z1.s[7]
-; CHECK-i32-NEXT: mov z6.s, z1.s[6]
+; CHECK-i32-NEXT: fcvtzs w13, s2
+; CHECK-i32-NEXT: mov z4.s, z2.s[7]
+; CHECK-i32-NEXT: mov z6.s, z2.s[6]
; CHECK-i32-NEXT: mov z16.s, z0.s[7]
-; CHECK-i32-NEXT: fcvtzs w8, s2
-; CHECK-i32-NEXT: mov z2.s, z0.s[4]
+; CHECK-i32-NEXT: fcvtzs w8, s1
+; CHECK-i32-NEXT: mov z1.s, z0.s[4]
; CHECK-i32-NEXT: fcvtzs w9, s3
-; CHECK-i32-NEXT: mov z3.s, z1.s[1]
+; CHECK-i32-NEXT: mov z3.s, z2.s[1]
; CHECK-i32-NEXT: fcvtzs w10, s5
; CHECK-i32-NEXT: fcvtzs w12, s7
; CHECK-i32-NEXT: mov z5.s, z0.s[6]
-; CHECK-i32-NEXT: mov z7.s, z1.s[2]
-; CHECK-i32-NEXT: mov z17.s, z1.s[3]
-; CHECK-i32-NEXT: fcvtzs w14, s2
-; CHECK-i32-NEXT: mov z2.s, z0.s[2]
+; CHECK-i32-NEXT: mov z7.s, z2.s[2]
+; CHECK-i32-NEXT: mov z17.s, z2.s[3]
+; CHECK-i32-NEXT: fcvtzs w14, s1
+; CHECK-i32-NEXT: mov z1.s, z0.s[2]
; CHECK-i32-NEXT: mov z18.s, z0.s[3]
; CHECK-i32-NEXT: fcvtzs w15, s3
; CHECK-i32-NEXT: fmov s0, w11
+; CHECK-i32-NEXT: fmov s2, w13
; CHECK-i32-NEXT: fmov s3, w9
; CHECK-i32-NEXT: fcvtzs w16, s6
; CHECK-i32-NEXT: fcvtzs w17, s5
+; CHECK-i32-NEXT: fcvtzs w18, s1
; CHECK-i32-NEXT: fcvtzs w11, s7
-; CHECK-i32-NEXT: fcvtzs w18, s2
-; CHECK-i32-NEXT: fmov s2, w13
; CHECK-i32-NEXT: fcvtzs w9, s16
; CHECK-i32-NEXT: fmov s1, w14
; CHECK-i32-NEXT: mov v0.s[1], w12
-; CHECK-i32-NEXT: mov v3.s[1], w8
-; CHECK-i32-NEXT: fcvtzs w8, s4
; CHECK-i32-NEXT: fcvtzs w12, s18
; CHECK-i32-NEXT: mov v2.s[1], w15
+; CHECK-i32-NEXT: mov v3.s[1], w8
+; CHECK-i32-NEXT: fcvtzs w8, s4
; CHECK-i32-NEXT: mov v1.s[1], w10
; CHECK-i32-NEXT: fcvtzs w10, s17
; CHECK-i32-NEXT: mov v0.s[2], w18
-; CHECK-i32-NEXT: mov v3.s[2], w16
; CHECK-i32-NEXT: mov v2.s[2], w11
+; CHECK-i32-NEXT: mov v3.s[2], w16
; CHECK-i32-NEXT: mov v1.s[2], w17
; CHECK-i32-NEXT: mov v0.s[3], w12
-; CHECK-i32-NEXT: mov v3.s[3], w8
; CHECK-i32-NEXT: mov v2.s[3], w10
+; CHECK-i32-NEXT: mov v3.s[3], w8
; CHECK-i32-NEXT: mov v1.s[3], w9
; CHECK-i32-NEXT: ret
;
@@ -750,8 +749,8 @@ define <32 x iXLen> @lrint_v32f32(<32 x float> %x) nounwind {
; CHECK-i32-NEXT: // kill: def $q6 killed $q6 def $z6
; CHECK-i32-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-i32-NEXT: // kill: def $q2 killed $q2 def $z2
-; CHECK-i32-NEXT: // kill: def $q4 killed $q4 def $z4
; CHECK-i32-NEXT: // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT: // kill: def $q4 killed $q4 def $z4
; CHECK-i32-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-i32-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-i32-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -764,64 +763,62 @@ define <32 x iXLen> @lrint_v32f32(<32 x float> %x) nounwind {
; CHECK-i32-NEXT: splice z0.d, p1, z0.d, z1.d
; CHECK-i32-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill
; CHECK-i32-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-i32-NEXT: movprfx z3, z6
-; CHECK-i32-NEXT: frintx z3.s, p0/m, z6.s
+; CHECK-i32-NEXT: frintx z6.s, p0/m, z6.s
; CHECK-i32-NEXT: frintx z2.s, p0/m, z2.s
-; CHECK-i32-NEXT: movprfx z1, z4
-; CHECK-i32-NEXT: frintx z1.s, p0/m, z4.s
+; CHECK-i32-NEXT: frintx z4.s, p0/m, z4.s
; CHECK-i32-NEXT: frintx z0.s, p0/m, z0.s
-; CHECK-i32-NEXT: mov z4.s, z3.s[7]
-; CHECK-i32-NEXT: mov z5.s, z3.s[6]
-; CHECK-i32-NEXT: mov z6.s, z3.s[5]
-; CHECK-i32-NEXT: mov z16.s, z1.s[7]
-; CHECK-i32-NEXT: mov z7.s, z3.s[4]
-; CHECK-i32-NEXT: mov z17.s, z1.s[6]
-; CHECK-i32-NEXT: mov z18.s, z1.s[5]
-; CHECK-i32-NEXT: mov z19.s, z1.s[4]
-; CHECK-i32-NEXT: fcvtzs w7, s3
-; CHECK-i32-NEXT: fcvtzs w8, s4
-; CHECK-i32-NEXT: mov z4.s, z2.s[7]
-; CHECK-i32-NEXT: fcvtzs w10, s5
-; CHECK-i32-NEXT: mov z5.s, z2.s[6]
-; CHECK-i32-NEXT: fcvtzs w13, s6
+; CHECK-i32-NEXT: mov z1.s, z6.s[7]
+; CHECK-i32-NEXT: mov z3.s, z6.s[6]
+; CHECK-i32-NEXT: mov z5.s, z6.s[5]
+; CHECK-i32-NEXT: mov z16.s, z4.s[7]
+; CHECK-i32-NEXT: mov z7.s, z6.s[4]
+; CHECK-i32-NEXT: mov z17.s, z4.s[6]
+; CHECK-i32-NEXT: mov z18.s, z4.s[5]
+; CHECK-i32-NEXT: mov z19.s, z4.s[4]
+; CHECK-i32-NEXT: fcvtzs w7, s6
+; CHECK-i32-NEXT: fcvtzs w8, s1
+; CHECK-i32-NEXT: mov z1.s, z2.s[7]
+; CHECK-i32-NEXT: fcvtzs w10, s3
+; CHECK-i32-NEXT: mov z3.s, z2.s[6]
+; CHECK-i32-NEXT: fcvtzs w13, s5
; CHECK-i32-NEXT: fcvtzs w9, s16
-; CHECK-i32-NEXT: mov z6.s, z2.s[4]
+; CHECK-i32-NEXT: mov z5.s, z2.s[4]
; CHECK-i32-NEXT: mov z16.s, z0.s[6]
; CHECK-i32-NEXT: fcvtzs w14, s7
-; CHECK-i32-NEXT: fcvtzs w11, s4
-; CHECK-i32-NEXT: mov z4.s, z2.s[5]
+; CHECK-i32-NEXT: fcvtzs w11, s1
+; CHECK-i32-NEXT: mov z1.s, z2.s[5]
; CHECK-i32-NEXT: mov z7.s, z0.s[7]
-; CHECK-i32-NEXT: fcvtzs w16, s5
-; CHECK-i32-NEXT: mov z5.s, z0.s[4]
+; CHECK-i32-NEXT: fcvtzs w16, s3
+; CHECK-i32-NEXT: mov z3.s, z0.s[4]
; CHECK-i32-NEXT: fcvtzs w12, s17
; CHECK-i32-NEXT: fcvtzs w15, s18
; CHECK-i32-NEXT: fcvtzs w17, s19
; CHECK-i32-NEXT: mov z17.s, z0.s[5]
-; CHECK-i32-NEXT: fcvtzs w3, s4
-; CHECK-i32-NEXT: mov z4.s, z3.s[1]
-; CHECK-i32-NEXT: mov z18.s, z3.s[2]
-; CHECK-i32-NEXT: fcvtzs w4, s6
+; CHECK-i32-NEXT: fcvtzs w3, s1
+; CHECK-i32-NEXT: mov z1.s, z6.s[1]
+; CHECK-i32-NEXT: mov z18.s, z6.s[2]
+; CHECK-i32-NEXT: fcvtzs w4, s5
; CHECK-i32-NEXT: fcvtzs w0, s16
-; CHECK-i32-NEXT: fcvtzs w6, s5
-; CHECK-i32-NEXT: mov z16.s, z3.s[3]
-; CHECK-i32-NEXT: mov z3.s, z0.s[1]
-; CHECK-i32-NEXT: mov z5.s, z1.s[1]
+; CHECK-i32-NEXT: fcvtzs w6, s3
+; CHECK-i32-NEXT: mov z16.s, z6.s[3]
+; CHECK-i32-NEXT: mov z5.s, z4.s[1]
; CHECK-i32-NEXT: mov z6.s, z2.s[1]
-; CHECK-i32-NEXT: fcvtzs w21, s1
+; CHECK-i32-NEXT: fcvtzs w2, s1
+; CHECK-i32-NEXT: mov z1.s, z0.s[1]
+; CHECK-i32-NEXT: fcvtzs w21, s4
; CHECK-i32-NEXT: fcvtzs w22, s0
; CHECK-i32-NEXT: fcvtzs w23, s2
; CHECK-i32-NEXT: fcvtzs w18, s7
-; CHECK-i32-NEXT: fcvtzs w2, s4
-; CHECK-i32-NEXT: mov z4.s, z1.s[2]
+; CHECK-i32-NEXT: mov z3.s, z4.s[2]
; CHECK-i32-NEXT: mov z7.s, z2.s[2]
; CHECK-i32-NEXT: fcvtzs w5, s17
-; CHECK-i32-NEXT: fcvtzs w24, s3
+; CHECK-i32-NEXT: fcvtzs w24, s1
; CHECK-i32-NEXT: fcvtzs w25, s5
; CHECK-i32-NEXT: fcvtzs w26, s6
; CHECK-i32-NEXT: fcvtzs w1, s18
; CHECK-i32-NEXT: mov z18.s, z0.s[2]
-; CHECK-i32-NEXT: mov z17.s, z1.s[3]
-; CHECK-i32-NEXT: fcvtzs w19, s4
+; CHECK-i32-NEXT: mov z17.s, z4.s[3]
+; CHECK-i32-NEXT: fcvtzs w19, s3
; CHECK-i32-NEXT: mov z19.s, z2.s[3]
; CHECK-i32-NEXT: fcvtzs w20, s7
; CHECK-i32-NEXT: mov z20.s, z0.s[3]
@@ -1133,19 +1130,18 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
; CHECK-i64-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-i64-NEXT: ptrue p0.d, vl4
; CHECK-i64-NEXT: frintx z0.d, p0/m, z0.d
-; CHECK-i64-NEXT: movprfx z1, z2
-; CHECK-i64-NEXT: frintx z1.d, p0/m, z2.d
-; CHECK-i64-NEXT: mov z4.d, z1.d[2]
+; CHECK-i64-NEXT: frintx z2.d, p0/m, z2.d
+; CHECK-i64-NEXT: mov z4.d, z2.d[2]
; CHECK-i64-NEXT: mov z5.d, z0.d[2]
-; CHECK-i64-NEXT: mov z2.d, z0.d[1]
-; CHECK-i64-NEXT: mov z3.d, z1.d[3]
+; CHECK-i64-NEXT: mov z1.d, z0.d[1]
+; CHECK-i64-NEXT: mov z3.d, z2.d[3]
; CHECK-i64-NEXT: mov z6.d, z0.d[3]
; CHECK-i64-NEXT: fcvtzs x8, d0
-; CHECK-i64-NEXT: mov z0.d, z1.d[1]
-; CHECK-i64-NEXT: fcvtzs x10, d1
+; CHECK-i64-NEXT: mov z0.d, z2.d[1]
+; CHECK-i64-NEXT: fcvtzs x10, d2
; CHECK-i64-NEXT: fcvtzs x11, d4
; CHECK-i64-NEXT: fcvtzs x12, d5
-; CHECK-i64-NEXT: fcvtzs x9, d2
+; CHECK-i64-NEXT: fcvtzs x9, d1
; CHECK-i64-NEXT: fcvtzs x13, d3
; CHECK-i64-NEXT: fcvtzs x14, d6
; CHECK-i64-NEXT: fcvtzs x15, d0
@@ -1235,57 +1231,55 @@ define <16 x iXLen> @lrint_v16f64(<16 x double> %x) nounwind {
; CHECK-i64-LABEL: lrint_v16f64:
; CHECK-i64: // %bb.0:
; CHECK-i64-NEXT: ptrue p1.d, vl2
-; CHECK-i64-NEXT: // kill: def $q6 killed $q6 def $z6
; CHECK-i64-NEXT: // kill: def $q4 killed $q4 def $z4
-; CHECK-i64-NEXT: // kill: def $q7 killed $q7 def $z7
-; CHECK-i64-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-i64-NEXT: // kill: def $q2 killed $q2 def $z2
-; CHECK-i64-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-i64-NEXT: // kill: def $q3 killed $q3 def $z3
+; CHECK-i64-NEXT: // kill: def $q6 killed $q6 def $z6
+; CHECK-i64-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-i64-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-i64-NEXT: ptrue p0.d, vl4
-; CHECK-i64-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-i64-NEXT: splice z4.d, p1, z4.d, z5.d
; CHECK-i64-NEXT: splice z2.d, p1, z2.d, z3.d
+; CHECK-i64-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-i64-NEXT: splice z0.d, p1, z0.d, z1.d
-; CHECK-i64-NEXT: movprfx z3, z6
-; CHECK-i64-NEXT: frintx z3.d, p0/m, z6.d
-; CHECK-i64-NEXT: movprfx z1, z4
-; CHECK-i64-NEXT: frintx z1.d, p0/m, z4.d
+; CHECK-i64-NEXT: frintx z4.d, p0/m, z4.d
; CHECK-i64-NEXT: frintx z2.d, p0/m, z2.d
+; CHECK-i64-NEXT: frintx z6.d, p0/m, z6.d
; CHECK-i64-NEXT: frintx z0.d, p0/m, z0.d
-; CHECK-i64-NEXT: mov z4.d, z3.d[2]
-; CHECK-i64-NEXT: mov z5.d, z1.d[2]
-; CHECK-i64-NEXT: mov z6.d, z2.d[3]
+; CHECK-i64-NEXT: mov z3.d, z4.d[2]
+; CHECK-i64-NEXT: mov z5.d, z2.d[3]
+; CHECK-i64-NEXT: mov z1.d, z6.d[2]
; CHECK-i64-NEXT: fcvtzs x11, d0
-; CHECK-i64-NEXT: fcvtzs x12, d1
+; CHECK-i64-NEXT: fcvtzs x12, d4
; CHECK-i64-NEXT: fcvtzs x13, d2
-; CHECK-i64-NEXT: fcvtzs x14, d3
-; CHECK-i64-NEXT: mov z7.d, z3.d[3]
-; CHECK-i64-NEXT: mov z16.d, z1.d[3]
-; CHECK-i64-NEXT: fcvtzs x9, d4
-; CHECK-i64-NEXT: fcvtzs x10, d5
-; CHECK-i64-NEXT: mov z4.d, z2.d[2]
+; CHECK-i64-NEXT: fcvtzs x14, d6
+; CHECK-i64-NEXT: mov z7.d, z6.d[3]
+; CHECK-i64-NEXT: mov z16.d, z0.d[3]
+; CHECK-i64-NEXT: fcvtzs x10, d3
+; CHECK-i64-NEXT: mov z3.d, z2.d[2]
+; CHECK-i64-NEXT: fcvtzs x8, d5
; CHECK-i64-NEXT: mov z5.d, z0.d[2]
-; CHECK-i64-NEXT: fcvtzs x8, d6
+; CHECK-i64-NEXT: fcvtzs x9, d1
+; CHECK-i64-NEXT: mov z1.d, z4.d[3]
; CHECK-i64-NEXT: mov z2.d, z2.d[1]
-; CHECK-i64-NEXT: mov z6.d, z0.d[3]
-; CHECK-i64-NEXT: mov z1.d, z1.d[1]
-; CHECK-i64-NEXT: mov z3.d, z3.d[1]
-; CHECK-i64-NEXT: fcvtzs x15, d4
-; CHECK-i64-NEXT: mov z4.d, z0.d[1]
+; CHECK-i64-NEXT: mov z17.d, z6.d[1]
+; CHECK-i64-NEXT: fcvtzs x17, d7
+; CHECK-i64-NEXT: fcvtzs x15, d3
+; CHECK-i64-NEXT: mov z3.d, z0.d[1]
; CHECK-i64-NEXT: fmov d0, x11
; CHECK-i64-NEXT: fcvtzs x16, d5
+; CHECK-i64-NEXT: mov z5.d, z4.d[1]
+; CHECK-i64-NEXT: fmov d4, x12
; CHECK-i64-NEXT: fcvtzs x11, d2
; CHECK-i64-NEXT: fmov d2, x13
-; CHECK-i64-NEXT: fcvtzs x17, d7
-; CHECK-i64-NEXT: fcvtzs x18, d16
-; CHECK-i64-NEXT: fcvtzs x0, d3
-; CHECK-i64-NEXT: fcvtzs x13, d4
-; CHECK-i64-NEXT: fmov d4, x12
-; CHECK-i64-NEXT: fcvtzs x12, d6
+; CHECK-i64-NEXT: fcvtzs x12, d16
+; CHECK-i64-NEXT: fcvtzs x13, d3
; CHECK-i64-NEXT: fmov d6, x14
-; CHECK-i64-NEXT: fcvtzs x14, d1
+; CHECK-i64-NEXT: fcvtzs x18, d1
+; CHECK-i64-NEXT: fcvtzs x14, d5
+; CHECK-i64-NEXT: fcvtzs x0, d17
; CHECK-i64-NEXT: fmov d3, x15
; CHECK-i64-NEXT: fmov d1, x16
; CHECK-i64-NEXT: fmov d5, x10
@@ -1293,9 +1287,9 @@ define <16 x iXLen> @lrint_v16f64(<16 x double> %x) nounwind {
; CHECK-i64-NEXT: mov v2.d[1], x11
; CHECK-i64-NEXT: mov v0.d[1], x13
; CHECK-i64-NEXT: mov v3.d[1], x8
-; CHECK-i64-NEXT: mov v6.d[1], x0
; CHECK-i64-NEXT: mov v4.d[1], x14
; CHECK-i64-NEXT: mov v1.d[1], x12
+; CHECK-i64-NEXT: mov v6.d[1], x0
; CHECK-i64-NEXT: mov v5.d[1], x18
; CHECK-i64-NEXT: mov v7.d[1], x17
; CHECK-i64-NEXT: ret
@@ -1309,13 +1303,13 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) nounwind {
; CHECK-i32: // %bb.0:
; CHECK-i32-NEXT: ptrue p1.d, vl2
; CHECK-i32-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-i32-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-i32-NEXT: // kill: def $q3 killed $q3 def $z3
-; CHECK-i32-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-i32-NEXT: // kill: def $q4 killed $q4 def $z4
-; CHECK-i32-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-i32-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-i32-NEXT: // kill: def $q6 killed $q6 def $z6
+; CHECK-i32-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-i32-NEXT: ptrue p0.d, vl4
; CHECK-i32-NEXT: splice z0.d, p1, z0.d, z1.d
; CHECK-i32-NEXT: splice z2.d, p1, z2.d, z3.d
@@ -1323,114 +1317,113 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) nounwind {
; CHECK-i32-NEXT: ldp q1, q3, [sp]
; CHECK-i32-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-i32-NEXT: frintx z0.d, p0/m, z0.d
-; CHECK-i32-NEXT: splice z1.d, p1, z1.d, z3.d
; CHECK-i32-NEXT: movprfx z18, z2
; CHECK-i32-NEXT: frintx z18.d, p0/m, z2.d
+; CHECK-i32-NEXT: splice z1.d, p1, z1.d, z3.d
; CHECK-i32-NEXT: ldp q5, q3, [sp, #96]
+; CHECK-i32-NEXT: frintx z4.d, p0/m, z4.d
; CHECK-i32-NEXT: ldp q2, q7, [sp, #64]
-; CHECK-i32-NEXT: splice z5.d, p1, z5.d, z3.d
-; CHECK-i32-NEXT: movprfx z3, z4
-; CHECK-i32-NEXT: frintx z3.d, p0/m, z4.d
-; CHECK-i32-NEXT: mov z4.d, z0.d[1]
+; CHECK-i32-NEXT: movprfx z16, z6
+; CHECK-i32-NEXT: frintx z16.d, p0/m, z6.d
+; CHECK-i32-NEXT: mov z19.d, z0.d[1]
; CHECK-i32-NEXT: fcvtzs w8, d0
+; CHECK-i32-NEXT: splice z5.d, p1, z5.d, z3.d
; CHECK-i32-NEXT: splice z2.d, p1, z2.d, z7.d
-; CHECK-i32-NEXT: mov z19.d, z18.d[1]
-; CHECK-i32-NEXT: ldp q7, q16, [sp, #32]
+; CHECK-i32-NEXT: ldp q3, q7, [sp, #32]
+; CHECK-i32-NEXT: mov z20.d, z18.d[1]
+; CHECK-i32-NEXT: fcvtzs w9, d18
; CHECK-i32-NEXT: movprfx z17, z1
; CHECK-i32-NEXT: frintx z17.d, p0/m, z1.d
-; CHECK-i32-NEXT: fcvtzs w10, d4
; CHECK-i32-NEXT: mov z1.d, z0.d[2]
-; CHECK-i32-NEXT: fcvtzs w9, d18
-; CHECK-i32-NEXT: mov z4.d, z0.d[3]
-; CHECK-i32-NEXT: fcvtzs w11, d19
-; CHECK-i32-NEXT: mov z20.d, z18.d[3]
-; CHECK-i32-NEXT: fmov s0, w8
-; CHECK-i32-NEXT: splice z7.d, p1, z7.d, z16.d
-; CHECK-i32-NEXT: movprfx z16, z6
-; CHECK-i32-NEXT: frintx z16.d, p0/m, z6.d
+; CHECK-i32-NEXT: fcvtzs w10, d19
; CHECK-i32-NEXT: mov z6.d, z18.d[2]
-; CHECK-i32-NEXT: mov z18.d, z3.d[1]
-; CHECK-i32-NEXT: fcvtzs w12, d3
+; CHECK-i32-NEXT: splice z3.d, p1, z3.d, z7.d
+; CHECK-i32-NEXT: mov z7.d, z0.d[3]
+; CHECK-i32-NEXT: fmov s0, w8
+; CHECK-i32-NEXT: fcvtzs w11, d20
+; CHECK-i32-NEXT: mov z20.d, z18.d[3]
+; CHECK-i32-NEXT: mov z18.d, z4.d[1]
+; CHECK-i32-NEXT: fcvtzs w12, d4
+; CHECK-i32-NEXT: mov z21.d, z4.d[2]
; CHECK-i32-NEXT: fcvtzs w13, d1
; CHECK-i32-NEXT: fmov s1, w9
+; CHECK-i32-NEXT: mov v0.s[1], w10
; CHECK-i32-NEXT: movprfx z19, z2
; CHECK-i32-NEXT: frintx z19.d, p0/m, z2.d
-; CHECK-i32-NEXT: mov v0.s[1], w10
-; CHECK-i32-NEXT: mov z21.d, z3.d[2]
-; CHECK-i32-NEXT: fcvtzs w8, d4
-; CHECK-i32-NEXT: fcvtzs w14, d6
-; CHECK-i32-NEXT: mov z6.d, z16.d[1]
; CHECK-i32-NEXT: fcvtzs w15, d18
-; CHECK-i32-NEXT: movprfx z18, z7
-; CHECK-i32-NEXT: frintx z18.d, p0/m, z7.d
+; CHECK-i32-NEXT: movprfx z18, z3
+; CHECK-i32-NEXT: frintx z18.d, p0/m, z3.d
+; CHECK-i32-NEXT: mov z3.d, z4.d[3]
+; CHECK-i32-NEXT: fcvtzs w16, d16
+; CHECK-i32-NEXT: mov z4.d, z16.d[1]
+; CHECK-i32-NEXT: fcvtzs w14, d6
; CHECK-i32-NEXT: mov v1.s[1], w11
+; CHECK-i32-NEXT: fcvtzs w11, d21
+; CHECK-i32-NEXT: movprfx z21, z5
+; CHECK-i32-NEXT: frintx z21.d, p0/m, z5.d
+; CHECK-i32-NEXT: fcvtzs w8, d7
; CHECK-i32-NEXT: fmov s2, w12
; CHECK-i32-NEXT: mov z7.d, z17.d[1]
-; CHECK-i32-NEXT: mov z4.d, z16.d[2]
-; CHECK-i32-NEXT: fcvtzs w16, d16
+; CHECK-i32-NEXT: mov z6.d, z16.d[2]
; CHECK-i32-NEXT: mov v0.s[2], w13
+; CHECK-i32-NEXT: fcvtzs w12, d4
; CHECK-i32-NEXT: fcvtzs w13, d17
-; CHECK-i32-NEXT: fcvtzs w12, d6
-; CHECK-i32-NEXT: mov z6.d, z19.d[1]
-; CHECK-i32-NEXT: fcvtzs w11, d21
-; CHECK-i32-NEXT: movprfx z21, z5
-; CHECK-i32-NEXT: frintx z21.d, p0/m, z5.d
-; CHECK-i32-NEXT: mov z3.d, z3.d[3]
-; CHECK-i32-NEXT: mov v2.s[1], w15
-; CHECK-i32-NEXT: mov z5.d, z18.d[1]
-; CHECK-i32-NEXT: fcvtzs w15, d7
-; CHECK-i32-NEXT: fcvtzs w0, d19
-; CHECK-i32-NEXT: mov v1.s[2], w14
-; CHECK-i32-NEXT: fcvtzs w14, d4
-; CHECK-i32-NEXT: mov z7.d, z18.d[2]
-; CHECK-i32-NEXT: fmov s4, w13
-; CHECK-i32-NEXT: fcvtzs w13, d6
-; CHECK-i32-NEXT: mov z6.d, z19.d[2]
; CHECK-i32-NEXT: fcvtzs w10, d3
; CHECK-i32-NEXT: fmov s3, w16
+; CHECK-i32-NEXT: mov v2.s[1], w15
+; CHECK-i32-NEXT: mov z4.d, z18.d[1]
+; CHECK-i32-NEXT: fcvtzs w15, d7
+; CHECK-i32-NEXT: mov z5.d, z19.d[1]
; CHECK-i32-NEXT: fcvtzs w17, d18
-; CHECK-i32-NEXT: fcvtzs w18, d5
-; CHECK-i32-NEXT: mov z5.d, z21.d[1]
+; CHECK-i32-NEXT: fcvtzs w0, d19
+; CHECK-i32-NEXT: mov z7.d, z21.d[1]
; CHECK-i32-NEXT: fcvtzs w2, d21
-; CHECK-i32-NEXT: fcvtzs w1, d7
-; CHECK-i32-NEXT: mov z7.d, z21.d[2]
-; CHECK-i32-NEXT: mov v4.s[1], w15
-; CHECK-i32-NEXT: fcvtzs w15, d6
-; CHECK-i32-NEXT: fmov s6, w0
-; CHECK-i32-NEXT: mov v3.s[1], w12
; CHECK-i32-NEXT: fcvtzs w9, d20
-; CHECK-i32-NEXT: fcvtzs w12, d5
+; CHECK-i32-NEXT: mov v1.s[2], w14
; CHECK-i32-NEXT: mov z20.d, z17.d[2]
+; CHECK-i32-NEXT: fcvtzs w14, d6
+; CHECK-i32-NEXT: mov z6.d, z18.d[2]
+; CHECK-i32-NEXT: fcvtzs w18, d4
+; CHECK-i32-NEXT: fmov s4, w13
+; CHECK-i32-NEXT: fcvtzs w13, d5
+; CHECK-i32-NEXT: mov v3.s[1], w12
+; CHECK-i32-NEXT: fcvtzs w12, d7
+; CHECK-i32-NEXT: fcvtzs w16, d20
+; CHECK-i32-NEXT: mov z20.d, z19.d[2]
+; CHECK-i32-NEXT: mov z22.d, z21.d[2]
+; CHECK-i32-NEXT: fcvtzs w1, d6
; CHECK-i32-NEXT: fmov s5, w17
+; CHECK-i32-NEXT: fmov s6, w0
+; CHECK-i32-NEXT: fmov s7, w2
+; CHECK-i32-NEXT: mov v4.s[1], w15
; CHECK-i32-NEXT: mov z16.d, z16.d[3]
+; CHECK-i32-NEXT: fcvtzs w15, d20
; CHECK-i32-NEXT: mov z17.d, z17.d[3]
; CHECK-i32-NEXT: mov z18.d, z18.d[3]
-; CHECK-i32-NEXT: mov v6.s[1], w13
-; CHECK-i32-NEXT: fcvtzs w13, d7
-; CHECK-i32-NEXT: fmov s7, w2
-; CHECK-i32-NEXT: fcvtzs w16, d20
; CHECK-i32-NEXT: mov v5.s[1], w18
+; CHECK-i32-NEXT: mov v6.s[1], w13
+; CHECK-i32-NEXT: fcvtzs w13, d22
+; CHECK-i32-NEXT: mov v7.s[1], w12
; CHECK-i32-NEXT: mov z19.d, z19.d[3]
; CHECK-i32-NEXT: mov z20.d, z21.d[3]
; CHECK-i32-NEXT: mov v2.s[2], w11
; CHECK-i32-NEXT: mov v3.s[2], w14
-; CHECK-i32-NEXT: mov v7.s[1], w12
; CHECK-i32-NEXT: fcvtzs w11, d16
+; CHECK-i32-NEXT: mov v4.s[2], w16
; CHECK-i32-NEXT: fcvtzs w12, d17
; CHECK-i32-NEXT: fcvtzs w14, d18
+; CHECK-i32-NEXT: mov v5.s[2], w1
; CHECK-i32-NEXT: mov v6.s[2], w15
; CHECK-i32-NEXT: fcvtzs w15, d19
-; CHECK-i32-NEXT: mov v4.s[2], w16
-; CHECK-i32-NEXT: mov v5.s[2], w1
+; CHECK-i32-NEXT: mov v7.s[2], w13
+; CHECK-i32-NEXT: fcvtzs w13, d20
; CHECK-i32-NEXT: mov v0.s[3], w8
; CHECK-i32-NEXT: mov v1.s[3], w9
; CHECK-i32-NEXT: mov v2.s[3], w10
-; CHECK-i32-NEXT: mov v7.s[2], w13
-; CHECK-i32-NEXT: fcvtzs w13, d20
; CHECK-i32-NEXT: mov v3.s[3], w11
-; CHECK-i32-NEXT: mov v6.s[3], w15
; CHECK-i32-NEXT: mov v4.s[3], w12
; CHECK-i32-NEXT: mov v5.s[3], w14
+; CHECK-i32-NEXT: mov v6.s[3], w15
; CHECK-i32-NEXT: mov v7.s[3], w13
; CHECK-i32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
index c2bb0c8..436b09e 100644
--- a/llvm/test/CodeGen/AArch64/sve-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -156,94 +156,93 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: mov w8, #31743 // =0x7bff
; CHECK-NEXT: uunpklo z7.s, z1.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z0.h, #-1025 // =0xfffffffffffffbff
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: mov z5.d, #0x8000000000000000
-; CHECK-NEXT: mov z29.h, w8
; CHECK-NEXT: mov z31.d, #0x8000000000000000
-; CHECK-NEXT: uunpklo z4.d, z2.s
-; CHECK-NEXT: uunpklo z24.d, z3.s
+; CHECK-NEXT: mov z29.h, w8
; CHECK-NEXT: uunpkhi z25.d, z3.s
+; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z6.d, z2.s
+; CHECK-NEXT: uunpklo z24.d, z3.s
; CHECK-NEXT: uunpklo z26.d, z7.s
; CHECK-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: uunpklo z30.d, z1.s
+; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z3.d, #0x8000000000000000
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: movprfx z27, z4
; CHECK-NEXT: frintx z27.h, p0/m, z4.h
-; CHECK-NEXT: frintx z24.h, p0/m, z24.h
-; CHECK-NEXT: frintx z25.h, p0/m, z25.h
; CHECK-NEXT: movprfx z28, z6
; CHECK-NEXT: frintx z28.h, p0/m, z6.h
-; CHECK-NEXT: mov z4.d, #0x8000000000000000
+; CHECK-NEXT: frintx z25.h, p0/m, z25.h
+; CHECK-NEXT: frintx z24.h, p0/m, z24.h
; CHECK-NEXT: frintx z26.h, p0/m, z26.h
; CHECK-NEXT: frintx z7.h, p0/m, z7.h
+; CHECK-NEXT: mov z4.d, #0x8000000000000000
; CHECK-NEXT: mov z6.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h
-; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT: frintx z30.h, p0/m, z30.h
; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z0.h
+; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h
; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z0.h
+; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h
; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z0.h
; CHECK-NEXT: fcvtzs z2.d, p1/m, z27.h
-; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h
-; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h
; CHECK-NEXT: fcvtzs z3.d, p2/m, z28.h
; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z0.h
+; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h
+; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h
; CHECK-NEXT: fcvtzs z6.d, p5/m, z26.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z27.h, z27.h
-; CHECK-NEXT: movprfx z27, z30
-; CHECK-NEXT: frintx z27.h, p0/m, z30.h
-; CHECK-NEXT: movprfx z30, z1
-; CHECK-NEXT: frintx z30.h, p0/m, z1.h
+; CHECK-NEXT: mov z27.d, #0x8000000000000000
+; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h
; CHECK-NEXT: fcmgt p5.h, p0/z, z28.h, z29.h
; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h
-; CHECK-NEXT: mov z28.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h
-; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z0.h
+; CHECK-NEXT: movprfx z28, z1
+; CHECK-NEXT: frintx z28.h, p0/m, z1.h
+; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h
; CHECK-NEXT: fcmgt p6.h, p0/z, z24.h, z29.h
; CHECK-NEXT: fcmuo p7.h, p0/z, z24.h, z24.h
; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff
; CHECK-NEXT: fcmgt p8.h, p0/z, z25.h, z29.h
-; CHECK-NEXT: fcvtzs z28.d, p4/m, z27.h
+; CHECK-NEXT: fcvtzs z27.d, p4/m, z30.h
; CHECK-NEXT: fcmuo p10.h, p0/z, z25.h, z25.h
; CHECK-NEXT: mov z25.d, #0x8000000000000000
; CHECK-NEXT: sel z1.d, p5, z24.d, z3.d
-; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d
-; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h
+; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z0.h
; CHECK-NEXT: sel z0.d, p3, z24.d, z2.d
; CHECK-NEXT: sel z2.d, p6, z24.d, z4.d
+; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d
; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0
-; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h
; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcvtzs z25.d, p4/m, z30.h
+; CHECK-NEXT: fcvtzs z25.d, p4/m, z28.h
+; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0
+; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p5.h, p0/z, z7.h, z29.h
-; CHECK-NEXT: fcmgt p6.h, p0/z, z27.h, z29.h
+; CHECK-NEXT: fcmgt p6.h, p0/z, z30.h, z29.h
; CHECK-NEXT: sel z4.d, p9, z24.d, z6.d
-; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h
+; CHECK-NEXT: fcmgt p4.h, p0/z, z28.h, z29.h
; CHECK-NEXT: fcmuo p8.h, p0/z, z7.h, z7.h
; CHECK-NEXT: sel z5.d, p5, z24.d, z31.d
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT: sel z6.d, p6, z24.d, z28.d
+; CHECK-NEXT: fcmuo p9.h, p0/z, z30.h, z30.h
+; CHECK-NEXT: sel z6.d, p6, z24.d, z27.d
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcmuo p9.h, p0/z, z27.h, z27.h
-; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h
; CHECK-NEXT: sel z7.d, p4, z24.d, z25.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
+; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h
; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0
; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h
+; CHECK-NEXT: fcmuo p0.h, p0/z, z28.h, z28.h
; CHECK-NEXT: mov z6.d, p9/m, #0 // =0x0
; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0
@@ -355,11 +354,12 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: frintx z19.h, p0/m, z13.h
; CHECK-NEXT: movprfx z13, z14
; CHECK-NEXT: frintx z13.h, p0/m, z14.h
+; CHECK-NEXT: frintx z20.h, p0/m, z20.h
; CHECK-NEXT: frintx z16.h, p0/m, z16.h
; CHECK-NEXT: mov z22.d, #0x8000000000000000
; CHECK-NEXT: mov z23.d, #0x8000000000000000
-; CHECK-NEXT: mov z14.d, #0x8000000000000000
; CHECK-NEXT: frintx z15.h, p0/m, z15.h
+; CHECK-NEXT: mov z14.d, #0x8000000000000000
; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h
; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z28.h
; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z30.h
@@ -381,10 +381,8 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: fcvtzs z12.d, p4/m, z11.h
; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
; CHECK-NEXT: uunpkhi z11.d, z17.s
-; CHECK-NEXT: movprfx z17, z20
-; CHECK-NEXT: frintx z17.h, p0/m, z20.h
+; CHECK-NEXT: mov z17.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z25.d, p1/m, z6.h
-; CHECK-NEXT: mov z20.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z0.d, p5/m, z1.h
; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z28.h
; CHECK-NEXT: frintx z11.h, p0/m, z11.h
@@ -392,87 +390,87 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z28.h
; CHECK-NEXT: fcvtzs z18.d, p6/m, z10.h
; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z30.h
-; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h
; CHECK-NEXT: fcvtzs z2.d, p3/m, z31.h
+; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h
; CHECK-NEXT: fcvtzs z21.d, p1/m, z13.h
-; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z28.h
+; CHECK-NEXT: fcmge p2.h, p0/z, z20.h, z28.h
; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z28.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z10.h, z10.h
; CHECK-NEXT: sel z10.d, p4, z29.d, z12.d
; CHECK-NEXT: sel z12.d, p11, z29.d, z18.d
; CHECK-NEXT: fcvtzs z26.d, p5/m, z11.h
-; CHECK-NEXT: fcvtzs z22.d, p2/m, z17.h
-; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
+; CHECK-NEXT: fcvtzs z22.d, p2/m, z20.h
; CHECK-NEXT: fcvtzs z23.d, p3/m, z16.h
; CHECK-NEXT: mov z10.d, p10/m, #0 // =0x0
; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z28.h
; CHECK-NEXT: str z10, [x8, #7, mul vl]
; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z28.h
; CHECK-NEXT: str z12, [x8, #8, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, z29.d
; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z28.h
; CHECK-NEXT: mov z28.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h
+; CHECK-NEXT: mov z26.d, p4/m, z29.d
; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z30.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z30.h
-; CHECK-NEXT: fcvtzs z20.d, p7/m, z3.h
+; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h
+; CHECK-NEXT: fcvtzs z17.d, p7/m, z3.h
+; CHECK-NEXT: fcmgt p3.h, p0/z, z20.h, z30.h
; CHECK-NEXT: fcvtzs z28.d, p2/m, z15.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z11.h, z11.h
-; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h
; CHECK-NEXT: sel z11.d, p5, z29.d, z23.d
-; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d
+; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h
; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z30.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h
+; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d
; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0
-; CHECK-NEXT: mov z11.d, p2/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h
; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z30.h
-; CHECK-NEXT: fcmuo p6.h, p0/z, z17.h, z17.h
+; CHECK-NEXT: mov z11.d, p2/m, #0 // =0x0
; CHECK-NEXT: str z26, [x8, #15, mul vl]
; CHECK-NEXT: sel z26.d, p4, z29.d, z14.d
+; CHECK-NEXT: fcmuo p6.h, p0/z, z20.h, z20.h
+; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h
; CHECK-NEXT: str z11, [x8, #14, mul vl]
; CHECK-NEXT: mov z28.d, p3/m, z29.d
-; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h
; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h
; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h
; CHECK-NEXT: sel z3.d, p1, z29.d, z21.d
; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0
+; CHECK-NEXT: sel z11.d, p2, z29.d, z17.d
; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z30.h
-; CHECK-NEXT: sel z11.d, p2, z29.d, z20.d
-; CHECK-NEXT: str z16, [x8, #13, mul vl]
; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT: str z16, [x8, #13, mul vl]
; CHECK-NEXT: fcmuo p6.h, p0/z, z15.h, z15.h
-; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h
; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT: mov z9.d, p12/m, z29.d
+; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h
; CHECK-NEXT: str z3, [x8, #11, mul vl]
; CHECK-NEXT: fcmuo p5.h, p0/z, z19.h, z19.h
-; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h
+; CHECK-NEXT: mov z9.d, p12/m, z29.d
; CHECK-NEXT: str z11, [x8, #10, mul vl]
+; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h
; CHECK-NEXT: mov z28.d, p6/m, #0 // =0x0
-; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d
; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z30.h
+; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d
+; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.h, p0/z, z27.h, z27.h
; CHECK-NEXT: str z28, [x8, #12, mul vl]
-; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d
; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z30.h
-; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h
+; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d
; CHECK-NEXT: str z26, [x8, #9, mul vl]
; CHECK-NEXT: sel z24.d, p4, z29.d, z25.d
-; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h
; CHECK-NEXT: fcmuo p5.h, p0/z, z31.h, z31.h
-; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z2.d, p6/m, z29.d
+; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h
; CHECK-NEXT: str z9, [x8, #5, mul vl]
; CHECK-NEXT: mov z0.d, p1/m, z29.d
-; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h
-; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h
; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
+; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h
; CHECK-NEXT: fcmuo p0.h, p0/z, z1.h, z1.h
-; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
+; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT: str z2, [x8, #6, mul vl]
+; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0
; CHECK-NEXT: str z24, [x8, #3, mul vl]
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
@@ -860,12 +858,11 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: movprfx z17, z18
; CHECK-NEXT: frintx z17.s, p0/m, z18.s
; CHECK-NEXT: fcmge p6.s, p0/z, z30.s, z29.s
-; CHECK-NEXT: movprfx z18, z19
-; CHECK-NEXT: frintx z18.s, p0/m, z19.s
; CHECK-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEXT: mov z31.s, w9
; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z26.d, #0x8000000000000000
+; CHECK-NEXT: frintx z19.s, p0/m, z19.s
; CHECK-NEXT: uunpklo z6.d, z6.s
; CHECK-NEXT: mov z28.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z27.d, p3/m, z24.s
@@ -881,7 +878,7 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: mov z8.d, #0x8000000000000000
; CHECK-NEXT: frintx z6.s, p0/m, z6.s
; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT: mov z19.d, #0x8000000000000000
+; CHECK-NEXT: mov z18.d, #0x8000000000000000
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s
; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s
; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z29.s
@@ -901,94 +898,94 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: fcvtzs z4.d, p3/m, z15.s
; CHECK-NEXT: fcvtzs z16.d, p6/m, z13.s
; CHECK-NEXT: fcmge p1.s, p0/z, z17.s, z29.s
-; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z29.s
+; CHECK-NEXT: fcmge p2.s, p0/z, z19.s, z29.s
; CHECK-NEXT: fcmgt p12.s, p0/z, z30.s, z31.s
; CHECK-NEXT: fcmgt p5.s, p0/z, z15.s, z31.s
-; CHECK-NEXT: fcmge p3.s, p0/z, z20.s, z29.s
; CHECK-NEXT: fcvtzs z21.d, p1/m, z17.s
-; CHECK-NEXT: fcvtzs z23.d, p2/m, z18.s
-; CHECK-NEXT: fcmgt p11.s, p0/z, z13.s, z31.s
+; CHECK-NEXT: fcmge p3.s, p0/z, z20.s, z29.s
+; CHECK-NEXT: fcvtzs z23.d, p2/m, z19.s
; CHECK-NEXT: sel z7.d, p12, z3.d, z12.d
+; CHECK-NEXT: fcmgt p11.s, p0/z, z13.s, z31.s
; CHECK-NEXT: mov z4.d, p5/m, z3.d
; CHECK-NEXT: fcmge p4.s, p0/z, z22.s, z29.s
; CHECK-NEXT: fcvtzs z0.d, p3/m, z20.s
; CHECK-NEXT: fcmge p6.s, p0/z, z5.s, z29.s
-; CHECK-NEXT: sel z12.d, p11, z3.d, z16.d
; CHECK-NEXT: fcmge p7.s, p0/z, z14.s, z29.s
-; CHECK-NEXT: fcmuo p1.s, p0/z, z13.s, z13.s
+; CHECK-NEXT: sel z12.d, p11, z3.d, z16.d
; CHECK-NEXT: fcvtzs z8.d, p4/m, z22.s
+; CHECK-NEXT: fcmuo p1.s, p0/z, z13.s, z13.s
; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z29.s
; CHECK-NEXT: mov z29.d, #0x8000000000000000
+; CHECK-NEXT: fcvtzs z18.d, p7/m, z14.s
; CHECK-NEXT: fcmuo p10.s, p0/z, z15.s, z15.s
; CHECK-NEXT: mov z15.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z19.d, p7/m, z14.s
; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0
-; CHECK-NEXT: fcvtzs z29.d, p2/m, z6.s
; CHECK-NEXT: fcmgt p4.s, p0/z, z22.s, z31.s
+; CHECK-NEXT: fcvtzs z29.d, p2/m, z6.s
+; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z31.s
+; CHECK-NEXT: str z12, [x8, #8, mul vl]
; CHECK-NEXT: fcvtzs z15.d, p6/m, z5.s
; CHECK-NEXT: mov z4.d, p10/m, #0 // =0x0
-; CHECK-NEXT: str z12, [x8, #8, mul vl]
-; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z31.s
-; CHECK-NEXT: fcmgt p3.s, p0/z, z18.s, z31.s
-; CHECK-NEXT: str z4, [x8, #7, mul vl]
-; CHECK-NEXT: fcmuo p1.s, p0/z, z22.s, z22.s
+; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z31.s
; CHECK-NEXT: mov z8.d, p4/m, z3.d
-; CHECK-NEXT: fcmuo p2.s, p0/z, z20.s, z20.s
+; CHECK-NEXT: fcmuo p1.s, p0/z, z22.s, z22.s
+; CHECK-NEXT: str z4, [x8, #7, mul vl]
; CHECK-NEXT: mov z0.d, p5/m, z3.d
-; CHECK-NEXT: fcmuo p6.s, p0/z, z18.s, z18.s
+; CHECK-NEXT: fcmuo p2.s, p0/z, z20.s, z20.s
+; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s
; CHECK-NEXT: fcmgt p4.s, p0/z, z5.s, z31.s
; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
; CHECK-NEXT: fcmuo p5.s, p0/z, z5.s, z5.s
; CHECK-NEXT: sel z5.d, p3, z3.d, z23.d
-; CHECK-NEXT: str z8, [x8, #15, mul vl]
+; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
; CHECK-NEXT: fcmgt p3.s, p0/z, z6.s, z31.s
-; CHECK-NEXT: str z0, [x8, #14, mul vl]
+; CHECK-NEXT: str z8, [x8, #15, mul vl]
; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z31.s
+; CHECK-NEXT: str z0, [x8, #14, mul vl]
; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p2.s, p0/z, z14.s, z31.s
+; CHECK-NEXT: sel z0.d, p3, z3.d, z29.d
; CHECK-NEXT: fcmuo p6.s, p0/z, z6.s, z6.s
-; CHECK-NEXT: sel z6.d, p4, z3.d, z15.d
; CHECK-NEXT: str z5, [x8, #13, mul vl]
-; CHECK-NEXT: sel z0.d, p3, z3.d, z29.d
-; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s
+; CHECK-NEXT: sel z6.d, p4, z3.d, z15.d
; CHECK-NEXT: sel z5.d, p1, z3.d, z21.d
-; CHECK-NEXT: sel z29.d, p2, z3.d, z19.d
+; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s
+; CHECK-NEXT: sel z29.d, p2, z3.d, z18.d
; CHECK-NEXT: mov z6.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.s, p0/z, z14.s, z14.s
-; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z6, [x8, #9, mul vl]
+; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z31.s
+; CHECK-NEXT: str z6, [x8, #9, mul vl]
+; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
; CHECK-NEXT: str z0, [x8, #12, mul vl]
-; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z31.s
; CHECK-NEXT: mov z29.d, p3/m, #0 // =0x0
; CHECK-NEXT: str z5, [x8, #11, mul vl]
-; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z31.s
-; CHECK-NEXT: fcmuo p3.s, p0/z, z30.s, z30.s
; CHECK-NEXT: sel z5.d, p1, z3.d, z26.d
+; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z31.s
; CHECK-NEXT: str z29, [x8, #10, mul vl]
-; CHECK-NEXT: sel z26.d, p2, z3.d, z27.d
; CHECK-NEXT: ldr z4, [sp] // 16-byte Folded Reload
; CHECK-NEXT: str z11, [x8, #4, mul vl]
+; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z31.s
+; CHECK-NEXT: fcmuo p3.s, p0/z, z30.s, z30.s
+; CHECK-NEXT: sel z26.d, p2, z3.d, z27.d
; CHECK-NEXT: fcmgt p6.s, p0/z, z9.s, z31.s
-; CHECK-NEXT: sel z6.d, p4, z3.d, z28.d
+; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z31.s
; CHECK-NEXT: fcmuo p5.s, p0/z, z9.s, z9.s
+; CHECK-NEXT: sel z6.d, p4, z3.d, z28.d
; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
-; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z31.s
; CHECK-NEXT: fcmuo p2.s, p0/z, z25.s, z25.s
-; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s
; CHECK-NEXT: sel z0.d, p6, z3.d, z10.d
-; CHECK-NEXT: str z7, [x8, #5, mul vl]
+; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s
; CHECK-NEXT: fcmuo p4.s, p0/z, z1.s, z1.s
-; CHECK-NEXT: fcmuo p0.s, p0/z, z4.s, z4.s
; CHECK-NEXT: sel z1.d, p1, z3.d, z2.d
+; CHECK-NEXT: str z7, [x8, #5, mul vl]
+; CHECK-NEXT: fcmuo p0.s, p0/z, z4.s, z4.s
; CHECK-NEXT: mov z0.d, p5/m, #0 // =0x0
; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z26.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
-; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z0, [x8, #6, mul vl]
+; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z6, [x8, #3, mul vl]
; CHECK-NEXT: str z26, [x8, #2, mul vl]
; CHECK-NEXT: str z5, [x8, #1, mul vl]
@@ -1299,8 +1296,7 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 16 + 152 * VG
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 16 + 144 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
@@ -1311,182 +1307,177 @@ define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT: mov z25.d, x9
; CHECK-NEXT: ldr z7, [x0, #3, mul vl]
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr z27, [x0, #4, mul vl]
; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
+; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT: mov z25.d, x9
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z9, [x0, #15, mul vl]
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: frintx z5.d, p0/m, z0.d
-; CHECK-NEXT: mov z0.d, #0x8000000000000000
-; CHECK-NEXT: ldr z10, [x0, #14, mul vl]
+; CHECK-NEXT: ldr z29, [x0, #6, mul vl]
+; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: frintx z7.d, p0/m, z7.d
+; CHECK-NEXT: ldr z31, [x0, #7, mul vl]
; CHECK-NEXT: movprfx z14, z27
; CHECK-NEXT: frintx z14.d, p0/m, z27.d
-; CHECK-NEXT: ldr z11, [x0, #13, mul vl]
; CHECK-NEXT: frintx z4.d, p0/m, z4.d
-; CHECK-NEXT: ldr z8, [x0, #12, mul vl]
; CHECK-NEXT: ldr z27, [x0, #5, mul vl]
-; CHECK-NEXT: ldr z18, [x0, #11, mul vl]
-; CHECK-NEXT: ldr z13, [x0, #10, mul vl]
-; CHECK-NEXT: ldr z29, [x0, #6, mul vl]
-; CHECK-NEXT: fcmge p1.d, p0/z, z5.d, z25.d
+; CHECK-NEXT: ldr z15, [x0, #8, mul vl]
; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT: movprfx z3, z2
+; CHECK-NEXT: frintx z3.d, p0/m, z2.d
+; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z24.d, #0x8000000000000000
; CHECK-NEXT: mov z30.d, #0x8000000000000000
+; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z25.d
; CHECK-NEXT: movprfx z28, z27
; CHECK-NEXT: frintx z28.d, p0/m, z27.d
-; CHECK-NEXT: mov z1.d, x9
-; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z25.d
; CHECK-NEXT: frintx z29.d, p0/m, z29.d
-; CHECK-NEXT: movprfx z3, z2
-; CHECK-NEXT: frintx z3.d, p0/m, z2.d
-; CHECK-NEXT: fcmge p3.d, p0/z, z14.d, z25.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z25.d
+; CHECK-NEXT: mov z1.d, x9
; CHECK-NEXT: mov z6.d, #0x8000000000000000
+; CHECK-NEXT: fcmge p3.d, p0/z, z14.d, z25.d
+; CHECK-NEXT: ldr z11, [x0, #13, mul vl]
+; CHECK-NEXT: ldr z18, [x0, #11, mul vl]
+; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z25.d
+; CHECK-NEXT: ldr z19, [x0, #9, mul vl]
+; CHECK-NEXT: movprfx z20, z31
+; CHECK-NEXT: frintx z20.d, p0/m, z31.d
+; CHECK-NEXT: frintx z15.d, p0/m, z15.d
+; CHECK-NEXT: ldr z9, [x0, #15, mul vl]
+; CHECK-NEXT: ldr z10, [x0, #14, mul vl]
+; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.d
+; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d
; CHECK-NEXT: mov z12.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z0.d, p1/m, z5.d
+; CHECK-NEXT: fcvtzs z30.d, p3/m, z14.d
+; CHECK-NEXT: mov z31.d, #0x8000000000000000
; CHECK-NEXT: frintx z18.d, p0/m, z18.d
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z25.d
+; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z25.d
+; CHECK-NEXT: mov z5.d, #0x8000000000000000
+; CHECK-NEXT: ldr z8, [x0, #12, mul vl]
+; CHECK-NEXT: fcmgt p10.d, p0/z, z14.d, z1.d
+; CHECK-NEXT: ldr z13, [x0, #10, mul vl]
+; CHECK-NEXT: fcvtzs z6.d, p5/m, z4.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z25.d
; CHECK-NEXT: mov z16.d, #0x8000000000000000
; CHECK-NEXT: mov z17.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d
; CHECK-NEXT: frintx z10.d, p0/m, z10.d
; CHECK-NEXT: frintx z9.d, p0/m, z9.d
-; CHECK-NEXT: fcvtzs z30.d, p3/m, z14.d
-; CHECK-NEXT: frintx z13.d, p0/m, z13.d
; CHECK-NEXT: mov z21.d, #0x8000000000000000
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: fcvtzs z12.d, p1/m, z28.d
+; CHECK-NEXT: frintx z13.d, p0/m, z13.d
; CHECK-NEXT: mov z22.d, #0x8000000000000000
-; CHECK-NEXT: movprfx z0, z8
-; CHECK-NEXT: frintx z0.d, p0/m, z8.d
-; CHECK-NEXT: ldr z31, [x0, #7, mul vl]
-; CHECK-NEXT: ldr z15, [x0, #8, mul vl]
-; CHECK-NEXT: ldr z19, [x0, #9, mul vl]
-; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z25.d
-; CHECK-NEXT: fcvtzs z6.d, p5/m, z4.d
+; CHECK-NEXT: frintx z8.d, p0/m, z8.d
; CHECK-NEXT: mov z26.d, #0x8000000000000000
-; CHECK-NEXT: fcmgt p10.d, p0/z, z14.d, z1.d
; CHECK-NEXT: mov z27.d, #0x7fffffffffffffff
+; CHECK-NEXT: fcvtzs z31.d, p2/m, z29.d
; CHECK-NEXT: mov z23.d, #0x8000000000000000
-; CHECK-NEXT: movprfx z20, z31
-; CHECK-NEXT: frintx z20.d, p0/m, z31.d
-; CHECK-NEXT: frintx z15.d, p0/m, z15.d
-; CHECK-NEXT: mov z31.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z25.d
; CHECK-NEXT: fcmuo p8.d, p0/z, z14.d, z14.d
; CHECK-NEXT: movprfx z14, z19
; CHECK-NEXT: frintx z14.d, p0/m, z19.d
; CHECK-NEXT: movprfx z19, z11
; CHECK-NEXT: frintx z19.d, p0/m, z11.d
-; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z25.d
-; CHECK-NEXT: fcvtzs z12.d, p1/m, z28.d
; CHECK-NEXT: mov z11.d, #0x8000000000000000
; CHECK-NEXT: mov z30.d, p10/m, z27.d
-; CHECK-NEXT: fcvtzs z31.d, p2/m, z29.d
+; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z25.d
; CHECK-NEXT: fcmge p5.d, p0/z, z20.d, z25.d
-; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d
-; CHECK-NEXT: fcvtzs z2.d, p4/m, z3.d
; CHECK-NEXT: mov z30.d, p8/m, #0 // =0x0
+; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d
+; CHECK-NEXT: fcvtzs z5.d, p4/m, z3.d
; CHECK-NEXT: fcmge p1.d, p0/z, z18.d, z25.d
-; CHECK-NEXT: fcmge p2.d, p0/z, z19.d, z25.d
+; CHECK-NEXT: str z30, [x8, #4, mul vl]
; CHECK-NEXT: fcvtzs z16.d, p5/m, z20.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z19.d, z25.d
; CHECK-NEXT: fcvtzs z17.d, p6/m, z15.d
-; CHECK-NEXT: fcmgt p12.d, p0/z, z28.d, z1.d
-; CHECK-NEXT: fcvtzs z21.d, p1/m, z18.d
-; CHECK-NEXT: fcvtzs z22.d, p2/m, z19.d
; CHECK-NEXT: fcmgt p5.d, p0/z, z20.d, z1.d
; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z25.d
+; CHECK-NEXT: fcvtzs z21.d, p1/m, z18.d
+; CHECK-NEXT: fcvtzs z22.d, p2/m, z19.d
; CHECK-NEXT: fcmgt p11.d, p0/z, z15.d, z1.d
-; CHECK-NEXT: sel z8.d, p12, z27.d, z12.d
; CHECK-NEXT: fcmge p4.d, p0/z, z9.d, z25.d
-; CHECK-NEXT: sel z12.d, p5, z27.d, z16.d
; CHECK-NEXT: fcmge p6.d, p0/z, z14.d, z25.d
; CHECK-NEXT: fcvtzs z23.d, p3/m, z10.d
; CHECK-NEXT: fcmge p7.d, p0/z, z13.d, z25.d
-; CHECK-NEXT: fcvtzs z26.d, p4/m, z9.d
; CHECK-NEXT: fcmuo p1.d, p0/z, z15.d, z15.d
-; CHECK-NEXT: sel z15.d, p11, z27.d, z17.d
+; CHECK-NEXT: sel z15.d, p5, z27.d, z16.d
+; CHECK-NEXT: sel z16.d, p11, z27.d, z17.d
+; CHECK-NEXT: fcvtzs z26.d, p4/m, z9.d
; CHECK-NEXT: fcvtzs z11.d, p6/m, z14.d
-; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z25.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z8.d, z25.d
; CHECK-NEXT: mov z25.d, #0x8000000000000000
+; CHECK-NEXT: mov z16.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p4.d, p0/z, z9.d, z1.d
-; CHECK-NEXT: mov z15.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p5.d, p0/z, z10.d, z1.d
+; CHECK-NEXT: str z16, [x8, #8, mul vl]
+; CHECK-NEXT: fcvtzs z25.d, p2/m, z8.d
; CHECK-NEXT: fcmgt p3.d, p0/z, z19.d, z1.d
-; CHECK-NEXT: fcvtzs z25.d, p2/m, z0.d
-; CHECK-NEXT: str z15, [x8, #8, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, z27.d
; CHECK-NEXT: fcmuo p9.d, p0/z, z20.d, z20.d
; CHECK-NEXT: mov z20.d, #0x8000000000000000
+; CHECK-NEXT: mov z26.d, p4/m, z27.d
; CHECK-NEXT: fcmuo p1.d, p0/z, z9.d, z9.d
; CHECK-NEXT: sel z9.d, p5, z27.d, z23.d
; CHECK-NEXT: fcmuo p2.d, p0/z, z10.d, z10.d
; CHECK-NEXT: sel z10.d, p3, z27.d, z22.d
-; CHECK-NEXT: fcmuo p6.d, p0/z, z19.d, z19.d
; CHECK-NEXT: fcvtzs z20.d, p7/m, z13.d
-; CHECK-NEXT: mov z12.d, p9/m, #0 // =0x0
+; CHECK-NEXT: mov z15.d, p9/m, #0 // =0x0
; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z9.d, p2/m, #0 // =0x0
-; CHECK-NEXT: str z12, [x8, #7, mul vl]
-; CHECK-NEXT: mov z10.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p4.d, p0/z, z14.d, z1.d
+; CHECK-NEXT: str z15, [x8, #7, mul vl]
+; CHECK-NEXT: fcmgt p3.d, p0/z, z8.d, z1.d
; CHECK-NEXT: str z26, [x8, #15, mul vl]
-; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: fcmuo p6.d, p0/z, z19.d, z19.d
; CHECK-NEXT: str z9, [x8, #14, mul vl]
; CHECK-NEXT: fcmgt p1.d, p0/z, z18.d, z1.d
-; CHECK-NEXT: str z10, [x8, #13, mul vl]
; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z1.d
-; CHECK-NEXT: fcmuo p5.d, p0/z, z14.d, z14.d
-; CHECK-NEXT: fcmuo p6.d, p0/z, z0.d, z0.d
-; CHECK-NEXT: sel z0.d, p4, z27.d, z11.d
+; CHECK-NEXT: sel z26.d, p4, z27.d, z11.d
; CHECK-NEXT: mov z25.d, p3/m, z27.d
-; CHECK-NEXT: sel z26.d, p1, z27.d, z21.d
-; CHECK-NEXT: sel z9.d, p2, z27.d, z20.d
; CHECK-NEXT: fcmuo p4.d, p0/z, z18.d, z18.d
-; CHECK-NEXT: mov z0.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0
+; CHECK-NEXT: mov z10.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.d, p0/z, z13.d, z13.d
+; CHECK-NEXT: sel z9.d, p2, z27.d, z20.d
+; CHECK-NEXT: fcmgt p12.d, p0/z, z28.d, z1.d
+; CHECK-NEXT: str z10, [x8, #13, mul vl]
+; CHECK-NEXT: fcmuo p6.d, p0/z, z8.d, z8.d
+; CHECK-NEXT: sel z8.d, p1, z27.d, z21.d
+; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z1.d
-; CHECK-NEXT: str z0, [x8, #9, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z25, [x8, #12, mul vl]
+; CHECK-NEXT: fcmuo p5.d, p0/z, z14.d, z14.d
+; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0
+; CHECK-NEXT: mov z12.d, p12/m, z27.d
+; CHECK-NEXT: str z9, [x8, #10, mul vl]
+; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p2.d, p0/z, z4.d, z1.d
-; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
+; CHECK-NEXT: str z8, [x8, #11, mul vl]
+; CHECK-NEXT: mov z5.d, p1/m, z27.d
; CHECK-NEXT: fcmgt p4.d, p0/z, z7.d, z1.d
-; CHECK-NEXT: str z26, [x8, #11, mul vl]
-; CHECK-NEXT: mov z2.d, p1/m, z27.d
+; CHECK-NEXT: str z25, [x8, #12, mul vl]
+; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.d, p0/z, z28.d, z28.d
-; CHECK-NEXT: fcmgt p6.d, p0/z, z29.d, z1.d
-; CHECK-NEXT: str z9, [x8, #10, mul vl]
; CHECK-NEXT: mov z6.d, p2/m, z27.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z5.d, z1.d
-; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: str z30, [x8, #4, mul vl]
+; CHECK-NEXT: fcmgt p6.d, p0/z, z29.d, z1.d
+; CHECK-NEXT: str z26, [x8, #9, mul vl]
+; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z24.d, p4/m, z27.d
+; CHECK-NEXT: mov z12.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmuo p5.d, p0/z, z29.d, z29.d
-; CHECK-NEXT: sel z0.d, p4, z27.d, z24.d
-; CHECK-NEXT: mov z8.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmuo p2.d, p0/z, z7.d, z7.d
; CHECK-NEXT: sel z25.d, p6, z27.d, z31.d
-; CHECK-NEXT: mov z1.d, p1/m, z27.d
+; CHECK-NEXT: str z12, [x8, #5, mul vl]
; CHECK-NEXT: fcmuo p3.d, p0/z, z4.d, z4.d
-; CHECK-NEXT: str z8, [x8, #5, mul vl]
; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d
; CHECK-NEXT: mov z25.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d
+; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: sel z0.d, p1, z27.d, z2.d
+; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z6.d, p3/m, #0 // =0x0
; CHECK-NEXT: str z25, [x8, #6, mul vl]
-; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z0, [x8, #3, mul vl]
-; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
+; CHECK-NEXT: str z24, [x8, #3, mul vl]
+; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z6, [x8, #2, mul vl]
-; CHECK-NEXT: str z2, [x8, #1, mul vl]
-; CHECK-NEXT: str z1, [x8]
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: str z5, [x8, #1, mul vl]
+; CHECK-NEXT: str z0, [x8]
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll
index f1224d3..38f29e1 100644
--- a/llvm/test/CodeGen/AArch64/sve-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll
@@ -157,94 +157,93 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: mov w8, #31743 // =0x7bff
; CHECK-NEXT: uunpklo z7.s, z1.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z0.h, #-1025 // =0xfffffffffffffbff
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: mov z5.d, #0x8000000000000000
-; CHECK-NEXT: mov z29.h, w8
; CHECK-NEXT: mov z31.d, #0x8000000000000000
-; CHECK-NEXT: uunpklo z4.d, z2.s
-; CHECK-NEXT: uunpklo z24.d, z3.s
+; CHECK-NEXT: mov z29.h, w8
; CHECK-NEXT: uunpkhi z25.d, z3.s
+; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z6.d, z2.s
+; CHECK-NEXT: uunpklo z24.d, z3.s
; CHECK-NEXT: uunpklo z26.d, z7.s
; CHECK-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: uunpklo z30.d, z1.s
+; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z3.d, #0x8000000000000000
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: movprfx z27, z4
; CHECK-NEXT: frintx z27.h, p0/m, z4.h
-; CHECK-NEXT: frintx z24.h, p0/m, z24.h
-; CHECK-NEXT: frintx z25.h, p0/m, z25.h
; CHECK-NEXT: movprfx z28, z6
; CHECK-NEXT: frintx z28.h, p0/m, z6.h
-; CHECK-NEXT: mov z4.d, #0x8000000000000000
+; CHECK-NEXT: frintx z25.h, p0/m, z25.h
+; CHECK-NEXT: frintx z24.h, p0/m, z24.h
; CHECK-NEXT: frintx z26.h, p0/m, z26.h
; CHECK-NEXT: frintx z7.h, p0/m, z7.h
+; CHECK-NEXT: mov z4.d, #0x8000000000000000
; CHECK-NEXT: mov z6.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h
-; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT: frintx z30.h, p0/m, z30.h
; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z0.h
+; CHECK-NEXT: fcmge p1.h, p0/z, z27.h, z0.h
; CHECK-NEXT: fcmge p2.h, p0/z, z28.h, z0.h
+; CHECK-NEXT: fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h
; CHECK-NEXT: fcmge p5.h, p0/z, z26.h, z0.h
; CHECK-NEXT: fcvtzs z2.d, p1/m, z27.h
-; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h
-; CHECK-NEXT: fcvtzs z5.d, p4/m, z25.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h
; CHECK-NEXT: fcvtzs z3.d, p2/m, z28.h
; CHECK-NEXT: fcmge p4.h, p0/z, z7.h, z0.h
+; CHECK-NEXT: fcvtzs z4.d, p3/m, z24.h
+; CHECK-NEXT: fcmgt p3.h, p0/z, z27.h, z29.h
; CHECK-NEXT: fcvtzs z6.d, p5/m, z26.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z27.h, z27.h
-; CHECK-NEXT: movprfx z27, z30
-; CHECK-NEXT: frintx z27.h, p0/m, z30.h
-; CHECK-NEXT: movprfx z30, z1
-; CHECK-NEXT: frintx z30.h, p0/m, z1.h
+; CHECK-NEXT: mov z27.d, #0x8000000000000000
+; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h
; CHECK-NEXT: fcmgt p5.h, p0/z, z28.h, z29.h
; CHECK-NEXT: fcmuo p2.h, p0/z, z28.h, z28.h
-; CHECK-NEXT: mov z28.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z31.d, p4/m, z7.h
-; CHECK-NEXT: fcmge p4.h, p0/z, z27.h, z0.h
+; CHECK-NEXT: movprfx z28, z1
+; CHECK-NEXT: frintx z28.h, p0/m, z1.h
+; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h
; CHECK-NEXT: fcmgt p6.h, p0/z, z24.h, z29.h
; CHECK-NEXT: fcmuo p7.h, p0/z, z24.h, z24.h
; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff
; CHECK-NEXT: fcmgt p8.h, p0/z, z25.h, z29.h
-; CHECK-NEXT: fcvtzs z28.d, p4/m, z27.h
+; CHECK-NEXT: fcvtzs z27.d, p4/m, z30.h
; CHECK-NEXT: fcmuo p10.h, p0/z, z25.h, z25.h
; CHECK-NEXT: mov z25.d, #0x8000000000000000
; CHECK-NEXT: sel z1.d, p5, z24.d, z3.d
-; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d
-; CHECK-NEXT: fcmge p4.h, p0/z, z30.h, z0.h
+; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z0.h
; CHECK-NEXT: sel z0.d, p3, z24.d, z2.d
; CHECK-NEXT: sel z2.d, p6, z24.d, z4.d
+; CHECK-NEXT: sel z3.d, p8, z24.d, z5.d
; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0
-; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
; CHECK-NEXT: fcmgt p9.h, p0/z, z26.h, z29.h
; CHECK-NEXT: mov z2.d, p7/m, #0 // =0x0
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcvtzs z25.d, p4/m, z30.h
+; CHECK-NEXT: fcvtzs z25.d, p4/m, z28.h
+; CHECK-NEXT: mov z3.d, p10/m, #0 // =0x0
+; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p5.h, p0/z, z7.h, z29.h
-; CHECK-NEXT: fcmgt p6.h, p0/z, z27.h, z29.h
+; CHECK-NEXT: fcmgt p6.h, p0/z, z30.h, z29.h
; CHECK-NEXT: sel z4.d, p9, z24.d, z6.d
-; CHECK-NEXT: fcmgt p4.h, p0/z, z30.h, z29.h
+; CHECK-NEXT: fcmgt p4.h, p0/z, z28.h, z29.h
; CHECK-NEXT: fcmuo p8.h, p0/z, z7.h, z7.h
; CHECK-NEXT: sel z5.d, p5, z24.d, z31.d
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload
-; CHECK-NEXT: sel z6.d, p6, z24.d, z28.d
+; CHECK-NEXT: fcmuo p9.h, p0/z, z30.h, z30.h
+; CHECK-NEXT: sel z6.d, p6, z24.d, z27.d
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcmuo p9.h, p0/z, z27.h, z27.h
-; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h
; CHECK-NEXT: sel z7.d, p4, z24.d, z25.d
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload
+; CHECK-NEXT: fcmuo p3.h, p0/z, z26.h, z26.h
; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0
; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload
-; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h
+; CHECK-NEXT: fcmuo p0.h, p0/z, z28.h, z28.h
; CHECK-NEXT: mov z6.d, p9/m, #0 // =0x0
; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload
; CHECK-NEXT: mov z4.d, p3/m, #0 // =0x0
@@ -356,11 +355,12 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: frintx z19.h, p0/m, z13.h
; CHECK-NEXT: movprfx z13, z14
; CHECK-NEXT: frintx z13.h, p0/m, z14.h
+; CHECK-NEXT: frintx z20.h, p0/m, z20.h
; CHECK-NEXT: frintx z16.h, p0/m, z16.h
; CHECK-NEXT: mov z22.d, #0x8000000000000000
; CHECK-NEXT: mov z23.d, #0x8000000000000000
-; CHECK-NEXT: mov z14.d, #0x8000000000000000
; CHECK-NEXT: frintx z15.h, p0/m, z15.h
+; CHECK-NEXT: mov z14.d, #0x8000000000000000
; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z28.h
; CHECK-NEXT: fcmge p2.h, p0/z, z12.h, z28.h
; CHECK-NEXT: fcmgt p9.h, p0/z, z12.h, z30.h
@@ -382,10 +382,8 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: fcvtzs z12.d, p4/m, z11.h
; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
; CHECK-NEXT: uunpkhi z11.d, z17.s
-; CHECK-NEXT: movprfx z17, z20
-; CHECK-NEXT: frintx z17.h, p0/m, z20.h
+; CHECK-NEXT: mov z17.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z25.d, p1/m, z6.h
-; CHECK-NEXT: mov z20.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z0.d, p5/m, z1.h
; CHECK-NEXT: fcmge p6.h, p0/z, z10.h, z28.h
; CHECK-NEXT: frintx z11.h, p0/m, z11.h
@@ -393,87 +391,87 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: fcmge p1.h, p0/z, z13.h, z28.h
; CHECK-NEXT: fcvtzs z18.d, p6/m, z10.h
; CHECK-NEXT: fcmgt p11.h, p0/z, z10.h, z30.h
-; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h
; CHECK-NEXT: fcvtzs z2.d, p3/m, z31.h
+; CHECK-NEXT: fcmge p5.h, p0/z, z11.h, z28.h
; CHECK-NEXT: fcvtzs z21.d, p1/m, z13.h
-; CHECK-NEXT: fcmge p2.h, p0/z, z17.h, z28.h
+; CHECK-NEXT: fcmge p2.h, p0/z, z20.h, z28.h
; CHECK-NEXT: fcmge p3.h, p0/z, z16.h, z28.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z10.h, z10.h
; CHECK-NEXT: sel z10.d, p4, z29.d, z12.d
; CHECK-NEXT: sel z12.d, p11, z29.d, z18.d
; CHECK-NEXT: fcvtzs z26.d, p5/m, z11.h
-; CHECK-NEXT: fcvtzs z22.d, p2/m, z17.h
-; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
+; CHECK-NEXT: fcvtzs z22.d, p2/m, z20.h
; CHECK-NEXT: fcvtzs z23.d, p3/m, z16.h
; CHECK-NEXT: mov z10.d, p10/m, #0 // =0x0
; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p4.h, p0/z, z11.h, z30.h
; CHECK-NEXT: fcmge p6.h, p0/z, z19.h, z28.h
; CHECK-NEXT: str z10, [x8, #7, mul vl]
; CHECK-NEXT: fcmge p7.h, p0/z, z3.h, z28.h
; CHECK-NEXT: str z12, [x8, #8, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, z29.d
; CHECK-NEXT: fcmge p2.h, p0/z, z15.h, z28.h
; CHECK-NEXT: mov z28.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h
+; CHECK-NEXT: mov z26.d, p4/m, z29.d
; CHECK-NEXT: fcmgt p5.h, p0/z, z16.h, z30.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z30.h
-; CHECK-NEXT: fcvtzs z20.d, p7/m, z3.h
+; CHECK-NEXT: fcvtzs z14.d, p6/m, z19.h
+; CHECK-NEXT: fcvtzs z17.d, p7/m, z3.h
+; CHECK-NEXT: fcmgt p3.h, p0/z, z20.h, z30.h
; CHECK-NEXT: fcvtzs z28.d, p2/m, z15.h
; CHECK-NEXT: fcmuo p1.h, p0/z, z11.h, z11.h
-; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h
; CHECK-NEXT: sel z11.d, p5, z29.d, z23.d
-; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d
+; CHECK-NEXT: fcmuo p2.h, p0/z, z16.h, z16.h
; CHECK-NEXT: fcmgt p4.h, p0/z, z19.h, z30.h
-; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h
+; CHECK-NEXT: sel z16.d, p3, z29.d, z22.d
; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0
-; CHECK-NEXT: mov z11.d, p2/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p3.h, p0/z, z15.h, z30.h
; CHECK-NEXT: fcmgt p1.h, p0/z, z13.h, z30.h
-; CHECK-NEXT: fcmuo p6.h, p0/z, z17.h, z17.h
+; CHECK-NEXT: mov z11.d, p2/m, #0 // =0x0
; CHECK-NEXT: str z26, [x8, #15, mul vl]
; CHECK-NEXT: sel z26.d, p4, z29.d, z14.d
+; CHECK-NEXT: fcmuo p6.h, p0/z, z20.h, z20.h
+; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h
; CHECK-NEXT: str z11, [x8, #14, mul vl]
; CHECK-NEXT: mov z28.d, p3/m, z29.d
-; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z30.h
; CHECK-NEXT: fcmuo p4.h, p0/z, z13.h, z13.h
; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h
; CHECK-NEXT: sel z3.d, p1, z29.d, z21.d
; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0
+; CHECK-NEXT: sel z11.d, p2, z29.d, z17.d
; CHECK-NEXT: fcmgt p12.h, p0/z, z27.h, z30.h
-; CHECK-NEXT: sel z11.d, p2, z29.d, z20.d
-; CHECK-NEXT: str z16, [x8, #13, mul vl]
; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT: str z16, [x8, #13, mul vl]
; CHECK-NEXT: fcmuo p6.h, p0/z, z15.h, z15.h
-; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h
; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT: mov z9.d, p12/m, z29.d
+; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z30.h
; CHECK-NEXT: str z3, [x8, #11, mul vl]
; CHECK-NEXT: fcmuo p5.h, p0/z, z19.h, z19.h
-; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h
+; CHECK-NEXT: mov z9.d, p12/m, z29.d
; CHECK-NEXT: str z11, [x8, #10, mul vl]
+; CHECK-NEXT: fcmgt p2.h, p0/z, z5.h, z30.h
; CHECK-NEXT: mov z28.d, p6/m, #0 // =0x0
-; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d
; CHECK-NEXT: fcmgt p4.h, p0/z, z6.h, z30.h
+; CHECK-NEXT: sel z3.d, p1, z29.d, z7.d
+; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.h, p0/z, z27.h, z27.h
; CHECK-NEXT: str z28, [x8, #12, mul vl]
-; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d
; CHECK-NEXT: fcmgt p6.h, p0/z, z31.h, z30.h
-; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h
+; CHECK-NEXT: sel z7.d, p2, z29.d, z24.d
; CHECK-NEXT: str z26, [x8, #9, mul vl]
; CHECK-NEXT: sel z24.d, p4, z29.d, z25.d
-; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
+; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z30.h
; CHECK-NEXT: fcmuo p5.h, p0/z, z31.h, z31.h
-; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z2.d, p6/m, z29.d
+; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h
+; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h
; CHECK-NEXT: str z9, [x8, #5, mul vl]
; CHECK-NEXT: mov z0.d, p1/m, z29.d
-; CHECK-NEXT: fcmuo p3.h, p0/z, z5.h, z5.h
-; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h
; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
+; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h
; CHECK-NEXT: fcmuo p0.h, p0/z, z1.h, z1.h
-; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
+; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT: str z2, [x8, #6, mul vl]
+; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0
; CHECK-NEXT: str z24, [x8, #3, mul vl]
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
@@ -861,12 +859,11 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: movprfx z17, z18
; CHECK-NEXT: frintx z17.s, p0/m, z18.s
; CHECK-NEXT: fcmge p6.s, p0/z, z30.s, z29.s
-; CHECK-NEXT: movprfx z18, z19
-; CHECK-NEXT: frintx z18.s, p0/m, z19.s
; CHECK-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEXT: mov z31.s, w9
; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z26.d, #0x8000000000000000
+; CHECK-NEXT: frintx z19.s, p0/m, z19.s
; CHECK-NEXT: uunpklo z6.d, z6.s
; CHECK-NEXT: mov z28.d, #0x8000000000000000
; CHECK-NEXT: fcvtzs z27.d, p3/m, z24.s
@@ -882,7 +879,7 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: mov z8.d, #0x8000000000000000
; CHECK-NEXT: frintx z6.s, p0/m, z6.s
; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff
-; CHECK-NEXT: mov z19.d, #0x8000000000000000
+; CHECK-NEXT: mov z18.d, #0x8000000000000000
; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s
; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s
; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z29.s
@@ -902,94 +899,94 @@ define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
; CHECK-NEXT: fcvtzs z4.d, p3/m, z15.s
; CHECK-NEXT: fcvtzs z16.d, p6/m, z13.s
; CHECK-NEXT: fcmge p1.s, p0/z, z17.s, z29.s
-; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z29.s
+; CHECK-NEXT: fcmge p2.s, p0/z, z19.s, z29.s
; CHECK-NEXT: fcmgt p12.s, p0/z, z30.s, z31.s
; CHECK-NEXT: fcmgt p5.s, p0/z, z15.s, z31.s
-; CHECK-NEXT: fcmge p3.s, p0/z, z20.s, z29.s
; CHECK-NEXT: fcvtzs z21.d, p1/m, z17.s
-; CHECK-NEXT: fcvtzs z23.d, p2/m, z18.s
-; CHECK-NEXT: fcmgt p11.s, p0/z, z13.s, z31.s
+; CHECK-NEXT: fcmge p3.s, p0/z, z20.s, z29.s
+; CHECK-NEXT: fcvtzs z23.d, p2/m, z19.s
; CHECK-NEXT: sel z7.d, p12, z3.d, z12.d
+; CHECK-NEXT: fcmgt p11.s, p0/z, z13.s, z31.s
; CHECK-NEXT: mov z4.d, p5/m, z3.d
; CHECK-NEXT: fcmge p4.s, p0/z, z22.s, z29.s
; CHECK-NEXT: fcvtzs z0.d, p3/m, z20.s
; CHECK-NEXT: fcmge p6.s, p0/z, z5.s, z29.s
-; CHECK-NEXT: sel z12.d, p11, z3.d, z16.d
; CHECK-NEXT: fcmge p7.s, p0/z, z14.s, z29.s
-; CHECK-NEXT: fcmuo p1.s, p0/z, z13.s, z13.s
+; CHECK-NEXT: sel z12.d, p11, z3.d, z16.d
; CHECK-NEXT: fcvtzs z8.d, p4/m, z22.s
+; CHECK-NEXT: fcmuo p1.s, p0/z, z13.s, z13.s
; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z29.s
; CHECK-NEXT: mov z29.d, #0x8000000000000000
+; CHECK-NEXT: fcvtzs z18.d, p7/m, z14.s
; CHECK-NEXT: fcmuo p10.s, p0/z, z15.s, z15.s
; CHECK-NEXT: mov z15.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z19.d, p7/m, z14.s
; CHECK-NEXT: mov z12.d, p1/m, #0 // =0x0
-; CHECK-NEXT: fcvtzs z29.d, p2/m, z6.s
; CHECK-NEXT: fcmgt p4.s, p0/z, z22.s, z31.s
+; CHECK-NEXT: fcvtzs z29.d, p2/m, z6.s
+; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z31.s
+; CHECK-NEXT: str z12, [x8, #8, mul vl]
; CHECK-NEXT: fcvtzs z15.d, p6/m, z5.s
; CHECK-NEXT: mov z4.d, p10/m, #0 // =0x0
-; CHECK-NEXT: str z12, [x8, #8, mul vl]
-; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z31.s
-; CHECK-NEXT: fcmgt p3.s, p0/z, z18.s, z31.s
-; CHECK-NEXT: str z4, [x8, #7, mul vl]
-; CHECK-NEXT: fcmuo p1.s, p0/z, z22.s, z22.s
+; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z31.s
; CHECK-NEXT: mov z8.d, p4/m, z3.d
-; CHECK-NEXT: fcmuo p2.s, p0/z, z20.s, z20.s
+; CHECK-NEXT: fcmuo p1.s, p0/z, z22.s, z22.s
+; CHECK-NEXT: str z4, [x8, #7, mul vl]
; CHECK-NEXT: mov z0.d, p5/m, z3.d
-; CHECK-NEXT: fcmuo p6.s, p0/z, z18.s, z18.s
+; CHECK-NEXT: fcmuo p2.s, p0/z, z20.s, z20.s
+; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s
; CHECK-NEXT: fcmgt p4.s, p0/z, z5.s, z31.s
; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
; CHECK-NEXT: fcmuo p5.s, p0/z, z5.s, z5.s
; CHECK-NEXT: sel z5.d, p3, z3.d, z23.d
-; CHECK-NEXT: str z8, [x8, #15, mul vl]
+; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
; CHECK-NEXT: fcmgt p3.s, p0/z, z6.s, z31.s
-; CHECK-NEXT: str z0, [x8, #14, mul vl]
+; CHECK-NEXT: str z8, [x8, #15, mul vl]
; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z31.s
+; CHECK-NEXT: str z0, [x8, #14, mul vl]
; CHECK-NEXT: mov z5.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p2.s, p0/z, z14.s, z31.s
+; CHECK-NEXT: sel z0.d, p3, z3.d, z29.d
; CHECK-NEXT: fcmuo p6.s, p0/z, z6.s, z6.s
-; CHECK-NEXT: sel z6.d, p4, z3.d, z15.d
; CHECK-NEXT: str z5, [x8, #13, mul vl]
-; CHECK-NEXT: sel z0.d, p3, z3.d, z29.d
-; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s
+; CHECK-NEXT: sel z6.d, p4, z3.d, z15.d
; CHECK-NEXT: sel z5.d, p1, z3.d, z21.d
-; CHECK-NEXT: sel z29.d, p2, z3.d, z19.d
+; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s
+; CHECK-NEXT: sel z29.d, p2, z3.d, z18.d
; CHECK-NEXT: mov z6.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.s, p0/z, z14.s, z14.s
-; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z6, [x8, #9, mul vl]
+; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z31.s
+; CHECK-NEXT: str z6, [x8, #9, mul vl]
+; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
; CHECK-NEXT: str z0, [x8, #12, mul vl]
-; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z31.s
; CHECK-NEXT: mov z29.d, p3/m, #0 // =0x0
; CHECK-NEXT: str z5, [x8, #11, mul vl]
-; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z31.s
-; CHECK-NEXT: fcmuo p3.s, p0/z, z30.s, z30.s
; CHECK-NEXT: sel z5.d, p1, z3.d, z26.d
+; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z31.s
; CHECK-NEXT: str z29, [x8, #10, mul vl]
-; CHECK-NEXT: sel z26.d, p2, z3.d, z27.d
; CHECK-NEXT: ldr z4, [sp] // 16-byte Folded Reload
; CHECK-NEXT: str z11, [x8, #4, mul vl]
+; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z31.s
+; CHECK-NEXT: fcmuo p3.s, p0/z, z30.s, z30.s
+; CHECK-NEXT: sel z26.d, p2, z3.d, z27.d
; CHECK-NEXT: fcmgt p6.s, p0/z, z9.s, z31.s
-; CHECK-NEXT: sel z6.d, p4, z3.d, z28.d
+; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z31.s
; CHECK-NEXT: fcmuo p5.s, p0/z, z9.s, z9.s
+; CHECK-NEXT: sel z6.d, p4, z3.d, z28.d
; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0
-; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z31.s
; CHECK-NEXT: fcmuo p2.s, p0/z, z25.s, z25.s
-; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s
; CHECK-NEXT: sel z0.d, p6, z3.d, z10.d
-; CHECK-NEXT: str z7, [x8, #5, mul vl]
+; CHECK-NEXT: fcmuo p3.s, p0/z, z24.s, z24.s
; CHECK-NEXT: fcmuo p4.s, p0/z, z1.s, z1.s
-; CHECK-NEXT: fcmuo p0.s, p0/z, z4.s, z4.s
; CHECK-NEXT: sel z1.d, p1, z3.d, z2.d
+; CHECK-NEXT: str z7, [x8, #5, mul vl]
+; CHECK-NEXT: fcmuo p0.s, p0/z, z4.s, z4.s
; CHECK-NEXT: mov z0.d, p5/m, #0 // =0x0
; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z26.d, p3/m, #0 // =0x0
; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
-; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z0, [x8, #6, mul vl]
+; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z6, [x8, #3, mul vl]
; CHECK-NEXT: str z26, [x8, #2, mul vl]
; CHECK-NEXT: str z5, [x8, #1, mul vl]
@@ -1300,8 +1297,7 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 16 + 152 * VG
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 16 + 144 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
@@ -1312,182 +1308,177 @@ define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000
-; CHECK-NEXT: mov z25.d, x9
; CHECK-NEXT: ldr z7, [x0, #3, mul vl]
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr z27, [x0, #4, mul vl]
; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
+; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT: mov z25.d, x9
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z9, [x0, #15, mul vl]
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: frintx z5.d, p0/m, z0.d
-; CHECK-NEXT: mov z0.d, #0x8000000000000000
-; CHECK-NEXT: ldr z10, [x0, #14, mul vl]
+; CHECK-NEXT: ldr z29, [x0, #6, mul vl]
+; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: frintx z7.d, p0/m, z7.d
+; CHECK-NEXT: ldr z31, [x0, #7, mul vl]
; CHECK-NEXT: movprfx z14, z27
; CHECK-NEXT: frintx z14.d, p0/m, z27.d
-; CHECK-NEXT: ldr z11, [x0, #13, mul vl]
; CHECK-NEXT: frintx z4.d, p0/m, z4.d
-; CHECK-NEXT: ldr z8, [x0, #12, mul vl]
; CHECK-NEXT: ldr z27, [x0, #5, mul vl]
-; CHECK-NEXT: ldr z18, [x0, #11, mul vl]
-; CHECK-NEXT: ldr z13, [x0, #10, mul vl]
-; CHECK-NEXT: ldr z29, [x0, #6, mul vl]
-; CHECK-NEXT: fcmge p1.d, p0/z, z5.d, z25.d
+; CHECK-NEXT: ldr z15, [x0, #8, mul vl]
; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT: movprfx z3, z2
+; CHECK-NEXT: frintx z3.d, p0/m, z2.d
+; CHECK-NEXT: mov z2.d, #0x8000000000000000
; CHECK-NEXT: mov z24.d, #0x8000000000000000
; CHECK-NEXT: mov z30.d, #0x8000000000000000
+; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z25.d
; CHECK-NEXT: movprfx z28, z27
; CHECK-NEXT: frintx z28.d, p0/m, z27.d
-; CHECK-NEXT: mov z1.d, x9
-; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z25.d
; CHECK-NEXT: frintx z29.d, p0/m, z29.d
-; CHECK-NEXT: movprfx z3, z2
-; CHECK-NEXT: frintx z3.d, p0/m, z2.d
-; CHECK-NEXT: fcmge p3.d, p0/z, z14.d, z25.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z25.d
+; CHECK-NEXT: mov z1.d, x9
; CHECK-NEXT: mov z6.d, #0x8000000000000000
+; CHECK-NEXT: fcmge p3.d, p0/z, z14.d, z25.d
+; CHECK-NEXT: ldr z11, [x0, #13, mul vl]
+; CHECK-NEXT: ldr z18, [x0, #11, mul vl]
+; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z25.d
+; CHECK-NEXT: ldr z19, [x0, #9, mul vl]
+; CHECK-NEXT: movprfx z20, z31
+; CHECK-NEXT: frintx z20.d, p0/m, z31.d
+; CHECK-NEXT: frintx z15.d, p0/m, z15.d
+; CHECK-NEXT: ldr z9, [x0, #15, mul vl]
+; CHECK-NEXT: ldr z10, [x0, #14, mul vl]
+; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.d
+; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d
; CHECK-NEXT: mov z12.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z0.d, p1/m, z5.d
+; CHECK-NEXT: fcvtzs z30.d, p3/m, z14.d
+; CHECK-NEXT: mov z31.d, #0x8000000000000000
; CHECK-NEXT: frintx z18.d, p0/m, z18.d
-; CHECK-NEXT: mov z2.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p5.d, p0/z, z4.d, z25.d
+; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z25.d
+; CHECK-NEXT: mov z5.d, #0x8000000000000000
+; CHECK-NEXT: ldr z8, [x0, #12, mul vl]
+; CHECK-NEXT: fcmgt p10.d, p0/z, z14.d, z1.d
+; CHECK-NEXT: ldr z13, [x0, #10, mul vl]
+; CHECK-NEXT: fcvtzs z6.d, p5/m, z4.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z25.d
; CHECK-NEXT: mov z16.d, #0x8000000000000000
; CHECK-NEXT: mov z17.d, #0x8000000000000000
-; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d
; CHECK-NEXT: frintx z10.d, p0/m, z10.d
; CHECK-NEXT: frintx z9.d, p0/m, z9.d
-; CHECK-NEXT: fcvtzs z30.d, p3/m, z14.d
-; CHECK-NEXT: frintx z13.d, p0/m, z13.d
; CHECK-NEXT: mov z21.d, #0x8000000000000000
-; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: fcvtzs z12.d, p1/m, z28.d
+; CHECK-NEXT: frintx z13.d, p0/m, z13.d
; CHECK-NEXT: mov z22.d, #0x8000000000000000
-; CHECK-NEXT: movprfx z0, z8
-; CHECK-NEXT: frintx z0.d, p0/m, z8.d
-; CHECK-NEXT: ldr z31, [x0, #7, mul vl]
-; CHECK-NEXT: ldr z15, [x0, #8, mul vl]
-; CHECK-NEXT: ldr z19, [x0, #9, mul vl]
-; CHECK-NEXT: fcmge p1.d, p0/z, z28.d, z25.d
-; CHECK-NEXT: fcvtzs z6.d, p5/m, z4.d
+; CHECK-NEXT: frintx z8.d, p0/m, z8.d
; CHECK-NEXT: mov z26.d, #0x8000000000000000
-; CHECK-NEXT: fcmgt p10.d, p0/z, z14.d, z1.d
; CHECK-NEXT: mov z27.d, #0x7fffffffffffffff
+; CHECK-NEXT: fcvtzs z31.d, p2/m, z29.d
; CHECK-NEXT: mov z23.d, #0x8000000000000000
-; CHECK-NEXT: movprfx z20, z31
-; CHECK-NEXT: frintx z20.d, p0/m, z31.d
-; CHECK-NEXT: frintx z15.d, p0/m, z15.d
-; CHECK-NEXT: mov z31.d, #0x8000000000000000
-; CHECK-NEXT: fcmge p2.d, p0/z, z29.d, z25.d
; CHECK-NEXT: fcmuo p8.d, p0/z, z14.d, z14.d
; CHECK-NEXT: movprfx z14, z19
; CHECK-NEXT: frintx z14.d, p0/m, z19.d
; CHECK-NEXT: movprfx z19, z11
; CHECK-NEXT: frintx z19.d, p0/m, z11.d
-; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z25.d
-; CHECK-NEXT: fcvtzs z12.d, p1/m, z28.d
; CHECK-NEXT: mov z11.d, #0x8000000000000000
; CHECK-NEXT: mov z30.d, p10/m, z27.d
-; CHECK-NEXT: fcvtzs z31.d, p2/m, z29.d
+; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z25.d
; CHECK-NEXT: fcmge p5.d, p0/z, z20.d, z25.d
-; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d
-; CHECK-NEXT: fcvtzs z2.d, p4/m, z3.d
; CHECK-NEXT: mov z30.d, p8/m, #0 // =0x0
+; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d
+; CHECK-NEXT: fcvtzs z5.d, p4/m, z3.d
; CHECK-NEXT: fcmge p1.d, p0/z, z18.d, z25.d
-; CHECK-NEXT: fcmge p2.d, p0/z, z19.d, z25.d
+; CHECK-NEXT: str z30, [x8, #4, mul vl]
; CHECK-NEXT: fcvtzs z16.d, p5/m, z20.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z19.d, z25.d
; CHECK-NEXT: fcvtzs z17.d, p6/m, z15.d
-; CHECK-NEXT: fcmgt p12.d, p0/z, z28.d, z1.d
-; CHECK-NEXT: fcvtzs z21.d, p1/m, z18.d
-; CHECK-NEXT: fcvtzs z22.d, p2/m, z19.d
; CHECK-NEXT: fcmgt p5.d, p0/z, z20.d, z1.d
; CHECK-NEXT: fcmge p3.d, p0/z, z10.d, z25.d
+; CHECK-NEXT: fcvtzs z21.d, p1/m, z18.d
+; CHECK-NEXT: fcvtzs z22.d, p2/m, z19.d
; CHECK-NEXT: fcmgt p11.d, p0/z, z15.d, z1.d
-; CHECK-NEXT: sel z8.d, p12, z27.d, z12.d
; CHECK-NEXT: fcmge p4.d, p0/z, z9.d, z25.d
-; CHECK-NEXT: sel z12.d, p5, z27.d, z16.d
; CHECK-NEXT: fcmge p6.d, p0/z, z14.d, z25.d
; CHECK-NEXT: fcvtzs z23.d, p3/m, z10.d
; CHECK-NEXT: fcmge p7.d, p0/z, z13.d, z25.d
-; CHECK-NEXT: fcvtzs z26.d, p4/m, z9.d
; CHECK-NEXT: fcmuo p1.d, p0/z, z15.d, z15.d
-; CHECK-NEXT: sel z15.d, p11, z27.d, z17.d
+; CHECK-NEXT: sel z15.d, p5, z27.d, z16.d
+; CHECK-NEXT: sel z16.d, p11, z27.d, z17.d
+; CHECK-NEXT: fcvtzs z26.d, p4/m, z9.d
; CHECK-NEXT: fcvtzs z11.d, p6/m, z14.d
-; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z25.d
+; CHECK-NEXT: fcmge p2.d, p0/z, z8.d, z25.d
; CHECK-NEXT: mov z25.d, #0x8000000000000000
+; CHECK-NEXT: mov z16.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p4.d, p0/z, z9.d, z1.d
-; CHECK-NEXT: mov z15.d, p1/m, #0 // =0x0
; CHECK-NEXT: fcmgt p5.d, p0/z, z10.d, z1.d
+; CHECK-NEXT: str z16, [x8, #8, mul vl]
+; CHECK-NEXT: fcvtzs z25.d, p2/m, z8.d
; CHECK-NEXT: fcmgt p3.d, p0/z, z19.d, z1.d
-; CHECK-NEXT: fcvtzs z25.d, p2/m, z0.d
-; CHECK-NEXT: str z15, [x8, #8, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, z27.d
; CHECK-NEXT: fcmuo p9.d, p0/z, z20.d, z20.d
; CHECK-NEXT: mov z20.d, #0x8000000000000000
+; CHECK-NEXT: mov z26.d, p4/m, z27.d
; CHECK-NEXT: fcmuo p1.d, p0/z, z9.d, z9.d
; CHECK-NEXT: sel z9.d, p5, z27.d, z23.d
; CHECK-NEXT: fcmuo p2.d, p0/z, z10.d, z10.d
; CHECK-NEXT: sel z10.d, p3, z27.d, z22.d
-; CHECK-NEXT: fcmuo p6.d, p0/z, z19.d, z19.d
; CHECK-NEXT: fcvtzs z20.d, p7/m, z13.d
-; CHECK-NEXT: mov z12.d, p9/m, #0 // =0x0
+; CHECK-NEXT: mov z15.d, p9/m, #0 // =0x0
; CHECK-NEXT: mov z26.d, p1/m, #0 // =0x0
; CHECK-NEXT: mov z9.d, p2/m, #0 // =0x0
-; CHECK-NEXT: str z12, [x8, #7, mul vl]
-; CHECK-NEXT: mov z10.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p4.d, p0/z, z14.d, z1.d
+; CHECK-NEXT: str z15, [x8, #7, mul vl]
+; CHECK-NEXT: fcmgt p3.d, p0/z, z8.d, z1.d
; CHECK-NEXT: str z26, [x8, #15, mul vl]
-; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: fcmuo p6.d, p0/z, z19.d, z19.d
; CHECK-NEXT: str z9, [x8, #14, mul vl]
; CHECK-NEXT: fcmgt p1.d, p0/z, z18.d, z1.d
-; CHECK-NEXT: str z10, [x8, #13, mul vl]
; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z1.d
-; CHECK-NEXT: fcmuo p5.d, p0/z, z14.d, z14.d
-; CHECK-NEXT: fcmuo p6.d, p0/z, z0.d, z0.d
-; CHECK-NEXT: sel z0.d, p4, z27.d, z11.d
+; CHECK-NEXT: sel z26.d, p4, z27.d, z11.d
; CHECK-NEXT: mov z25.d, p3/m, z27.d
-; CHECK-NEXT: sel z26.d, p1, z27.d, z21.d
-; CHECK-NEXT: sel z9.d, p2, z27.d, z20.d
; CHECK-NEXT: fcmuo p4.d, p0/z, z18.d, z18.d
-; CHECK-NEXT: mov z0.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0
+; CHECK-NEXT: mov z10.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.d, p0/z, z13.d, z13.d
+; CHECK-NEXT: sel z9.d, p2, z27.d, z20.d
+; CHECK-NEXT: fcmgt p12.d, p0/z, z28.d, z1.d
+; CHECK-NEXT: str z10, [x8, #13, mul vl]
+; CHECK-NEXT: fcmuo p6.d, p0/z, z8.d, z8.d
+; CHECK-NEXT: sel z8.d, p1, z27.d, z21.d
+; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z1.d
-; CHECK-NEXT: str z0, [x8, #9, mul vl]
-; CHECK-NEXT: mov z26.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z25, [x8, #12, mul vl]
+; CHECK-NEXT: fcmuo p5.d, p0/z, z14.d, z14.d
+; CHECK-NEXT: mov z8.d, p4/m, #0 // =0x0
+; CHECK-NEXT: mov z12.d, p12/m, z27.d
+; CHECK-NEXT: str z9, [x8, #10, mul vl]
+; CHECK-NEXT: mov z25.d, p6/m, #0 // =0x0
; CHECK-NEXT: fcmgt p2.d, p0/z, z4.d, z1.d
-; CHECK-NEXT: mov z9.d, p3/m, #0 // =0x0
+; CHECK-NEXT: str z8, [x8, #11, mul vl]
+; CHECK-NEXT: mov z5.d, p1/m, z27.d
; CHECK-NEXT: fcmgt p4.d, p0/z, z7.d, z1.d
-; CHECK-NEXT: str z26, [x8, #11, mul vl]
-; CHECK-NEXT: mov z2.d, p1/m, z27.d
+; CHECK-NEXT: str z25, [x8, #12, mul vl]
+; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0
; CHECK-NEXT: fcmuo p3.d, p0/z, z28.d, z28.d
-; CHECK-NEXT: fcmgt p6.d, p0/z, z29.d, z1.d
-; CHECK-NEXT: str z9, [x8, #10, mul vl]
; CHECK-NEXT: mov z6.d, p2/m, z27.d
-; CHECK-NEXT: fcmgt p1.d, p0/z, z5.d, z1.d
-; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: str z30, [x8, #4, mul vl]
+; CHECK-NEXT: fcmgt p6.d, p0/z, z29.d, z1.d
+; CHECK-NEXT: str z26, [x8, #9, mul vl]
+; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: mov z24.d, p4/m, z27.d
+; CHECK-NEXT: mov z12.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmuo p5.d, p0/z, z29.d, z29.d
-; CHECK-NEXT: sel z0.d, p4, z27.d, z24.d
-; CHECK-NEXT: mov z8.d, p3/m, #0 // =0x0
; CHECK-NEXT: fcmuo p2.d, p0/z, z7.d, z7.d
; CHECK-NEXT: sel z25.d, p6, z27.d, z31.d
-; CHECK-NEXT: mov z1.d, p1/m, z27.d
+; CHECK-NEXT: str z12, [x8, #5, mul vl]
; CHECK-NEXT: fcmuo p3.d, p0/z, z4.d, z4.d
-; CHECK-NEXT: str z8, [x8, #5, mul vl]
; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d
; CHECK-NEXT: mov z25.d, p5/m, #0 // =0x0
-; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d
+; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT: sel z0.d, p1, z27.d, z2.d
+; CHECK-NEXT: mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT: mov z6.d, p3/m, #0 // =0x0
; CHECK-NEXT: str z25, [x8, #6, mul vl]
-; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0
-; CHECK-NEXT: str z0, [x8, #3, mul vl]
-; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z5.d, p4/m, #0 // =0x0
+; CHECK-NEXT: str z24, [x8, #3, mul vl]
+; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: str z6, [x8, #2, mul vl]
-; CHECK-NEXT: str z2, [x8, #1, mul vl]
-; CHECK-NEXT: str z1, [x8]
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: str z5, [x8, #1, mul vl]
+; CHECK-NEXT: str z0, [x8]
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 6951a6c..41a6cdc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -591,21 +591,18 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
; CHECK-NEXT: mov x8, #6 // =0x6
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT: fcvt z2.d, p0/m, z2.h
; CHECK-NEXT: mov x8, #2 // =0x2
-; CHECK-NEXT: fcvt z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvt z2.d, p0/m, z2.h
; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: fcvt z3.d, p0/m, z3.h
+; CHECK-NEXT: fcvt z6.d, p0/m, z6.h
; CHECK-NEXT: fcvt z4.d, p0/m, z4.h
+; CHECK-NEXT: fcvt z5.d, p0/m, z5.h
; CHECK-NEXT: stp q0, q1, [x1, #96]
-; CHECK-NEXT: movprfx z0, z5
-; CHECK-NEXT: fcvt z0.d, p0/m, z5.h
-; CHECK-NEXT: movprfx z1, z6
-; CHECK-NEXT: fcvt z1.d, p0/m, z6.h
+; CHECK-NEXT: fcvt z7.d, p0/m, z7.h
; CHECK-NEXT: stp q2, q3, [x1, #64]
-; CHECK-NEXT: movprfx z2, z7
-; CHECK-NEXT: fcvt z2.d, p0/m, z7.h
-; CHECK-NEXT: stp q4, q0, [x1, #32]
-; CHECK-NEXT: stp q1, q2, [x1]
+; CHECK-NEXT: stp q4, q5, [x1, #32]
+; CHECK-NEXT: stp q6, q7, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 25c98c4d..21a2692 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -603,47 +603,45 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z3.h, z0.h[3]
; CHECK-NEXT: mov z4.h, z0.h[2]
-; CHECK-NEXT: movprfx z5, z1
-; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: movprfx z7, z1
; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.h
+; CHECK-NEXT: movprfx z5, z1
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
+; CHECK-NEXT: movprfx z6, z0
+; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8
; CHECK-NEXT: mov z16.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[2]
; CHECK-NEXT: mov z17.h, z0.h[1]
-; CHECK-NEXT: movprfx z6, z0
-; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8
; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h
; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
-; CHECK-NEXT: fcvtzu z16.d, p0/m, z16.h
; CHECK-NEXT: mov z18.h, z5.h[3]
-; CHECK-NEXT: fcvtzu z17.d, p0/m, z17.h
-; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
; CHECK-NEXT: mov z19.h, z6.h[3]
; CHECK-NEXT: mov z20.h, z6.h[2]
+; CHECK-NEXT: fcvtzu z17.d, p0/m, z17.h
; CHECK-NEXT: mov z21.h, z6.h[1]
+; CHECK-NEXT: fcvtzu z16.d, p0/m, z16.h
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h
; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
; CHECK-NEXT: mov z7.h, z5.h[2]
; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
; CHECK-NEXT: mov z4.h, z5.h[1]
; CHECK-NEXT: fcvtzu z19.d, p0/m, z19.h
-; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h
; CHECK-NEXT: fcvtzu z20.d, p0/m, z20.h
+; CHECK-NEXT: fcvtzu z21.d, p0/m, z21.h
+; CHECK-NEXT: fcvtzu z18.d, p0/m, z18.h
+; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h
; CHECK-NEXT: zip1 z0.d, z0.d, z17.d
-; CHECK-NEXT: movprfx z17, z21
-; CHECK-NEXT: fcvtzu z17.d, p0/m, z21.h
; CHECK-NEXT: zip1 z1.d, z1.d, z16.d
-; CHECK-NEXT: movprfx z16, z18
-; CHECK-NEXT: fcvtzu z16.d, p0/m, z18.h
-; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h
; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h
+; CHECK-NEXT: stp q2, q1, [x1]
; CHECK-NEXT: stp q0, q3, [x1, #64]
; CHECK-NEXT: zip1 z0.d, z20.d, z19.d
-; CHECK-NEXT: zip1 z3.d, z6.d, z17.d
-; CHECK-NEXT: stp q2, q1, [x1]
-; CHECK-NEXT: zip1 z1.d, z7.d, z16.d
+; CHECK-NEXT: zip1 z3.d, z6.d, z21.d
+; CHECK-NEXT: zip1 z1.d, z7.d, z18.d
; CHECK-NEXT: zip1 z2.d, z5.d, z4.d
; CHECK-NEXT: stp q3, q0, [x1, #96]
; CHECK-NEXT: stp q2, q1, [x1, #32]
@@ -2306,47 +2304,45 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
; CHECK-NEXT: mov z2.h, z1.h[1]
; CHECK-NEXT: mov z3.h, z0.h[3]
; CHECK-NEXT: mov z4.h, z0.h[2]
-; CHECK-NEXT: movprfx z5, z1
-; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
; CHECK-NEXT: movprfx z7, z1
; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h
+; CHECK-NEXT: movprfx z5, z1
+; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8
+; CHECK-NEXT: movprfx z6, z0
+; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8
; CHECK-NEXT: mov z16.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[2]
; CHECK-NEXT: mov z17.h, z0.h[1]
-; CHECK-NEXT: movprfx z6, z0
-; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8
; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.h
; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h
; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
-; CHECK-NEXT: fcvtzs z16.d, p0/m, z16.h
; CHECK-NEXT: mov z18.h, z5.h[3]
-; CHECK-NEXT: fcvtzs z17.d, p0/m, z17.h
-; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
; CHECK-NEXT: mov z19.h, z6.h[3]
; CHECK-NEXT: mov z20.h, z6.h[2]
+; CHECK-NEXT: fcvtzs z17.d, p0/m, z17.h
; CHECK-NEXT: mov z21.h, z6.h[1]
+; CHECK-NEXT: fcvtzs z16.d, p0/m, z16.h
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h
; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h
; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
; CHECK-NEXT: mov z7.h, z5.h[2]
; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
; CHECK-NEXT: mov z4.h, z5.h[1]
; CHECK-NEXT: fcvtzs z19.d, p0/m, z19.h
-; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h
; CHECK-NEXT: fcvtzs z20.d, p0/m, z20.h
+; CHECK-NEXT: fcvtzs z21.d, p0/m, z21.h
+; CHECK-NEXT: fcvtzs z18.d, p0/m, z18.h
+; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h
; CHECK-NEXT: zip1 z0.d, z0.d, z17.d
-; CHECK-NEXT: movprfx z17, z21
-; CHECK-NEXT: fcvtzs z17.d, p0/m, z21.h
; CHECK-NEXT: zip1 z1.d, z1.d, z16.d
-; CHECK-NEXT: movprfx z16, z18
-; CHECK-NEXT: fcvtzs z16.d, p0/m, z18.h
-; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h
; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h
+; CHECK-NEXT: stp q2, q1, [x1]
; CHECK-NEXT: stp q0, q3, [x1, #64]
; CHECK-NEXT: zip1 z0.d, z20.d, z19.d
-; CHECK-NEXT: zip1 z3.d, z6.d, z17.d
-; CHECK-NEXT: stp q2, q1, [x1]
-; CHECK-NEXT: zip1 z1.d, z7.d, z16.d
+; CHECK-NEXT: zip1 z3.d, z6.d, z21.d
+; CHECK-NEXT: zip1 z1.d, z7.d, z18.d
; CHECK-NEXT: zip1 z2.d, z5.d, z4.d
; CHECK-NEXT: stp q3, q0, [x1, #96]
; CHECK-NEXT: stp q2, q1, [x1, #32]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 200e462..4379194 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -551,41 +551,39 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: movprfx z4, z0
-; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: movprfx z5, z3
; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8
-; CHECK-NEXT: uunpklo z0.d, z0.s
; CHECK-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEXT: uunpklo z4.d, z4.s
-; CHECK-NEXT: uunpklo z5.d, z5.s
+; CHECK-NEXT: movprfx z4, z0
+; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT: uunpklo z0.d, z0.s
; CHECK-NEXT: movprfx z6, z2
; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8
; CHECK-NEXT: movprfx z7, z1
; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT: uunpklo z5.d, z5.s
+; CHECK-NEXT: uunpklo z4.d, z4.s
; CHECK-NEXT: uunpklo z2.d, z2.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d
; CHECK-NEXT: uunpklo z6.d, z6.s
-; CHECK-NEXT: ucvtf z4.d, p0/m, z4.d
; CHECK-NEXT: uunpklo z7.d, z7.s
; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d
+; CHECK-NEXT: ucvtf z4.d, p0/m, z4.d
; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d
; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT: ucvtf z6.d, p0/m, z6.d
+; CHECK-NEXT: ucvtf z7.d, p0/m, z7.d
; CHECK-NEXT: stp q3, q5, [x1]
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: ucvtf z3.d, p0/m, z7.d
; CHECK-NEXT: stp q0, q4, [x1, #64]
-; CHECK-NEXT: movprfx z0, z6
-; CHECK-NEXT: ucvtf z0.d, p0/m, z6.d
-; CHECK-NEXT: stp q1, q3, [x1, #32]
-; CHECK-NEXT: stp q2, q0, [x1, #96]
+; CHECK-NEXT: stp q1, q7, [x1, #32]
+; CHECK-NEXT: stp q2, q6, [x1, #96]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
@@ -1192,14 +1190,12 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s
; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: movprfx z0, z1
-; CHECK-NEXT: fcvt z0.h, p0/m, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fcvt z1.h, p0/m, z2.s
+; CHECK-NEXT: fcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: fcvt z2.h, p0/m, z2.s
; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h
-; CHECK-NEXT: uzp1 z0.h, z1.h, z1.h
-; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
@@ -1952,41 +1948,39 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEXT: movprfx z4, z0
-; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: movprfx z5, z3
; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8
-; CHECK-NEXT: sunpklo z0.d, z0.s
; CHECK-NEXT: sunpklo z3.d, z3.s
-; CHECK-NEXT: sunpklo z4.d, z4.s
-; CHECK-NEXT: sunpklo z5.d, z5.s
+; CHECK-NEXT: movprfx z4, z0
+; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT: sunpklo z0.d, z0.s
; CHECK-NEXT: movprfx z6, z2
; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8
; CHECK-NEXT: movprfx z7, z1
; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT: sunpklo z5.d, z5.s
+; CHECK-NEXT: sunpklo z4.d, z4.s
; CHECK-NEXT: sunpklo z2.d, z2.s
; CHECK-NEXT: sunpklo z1.d, z1.s
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: scvtf z3.d, p0/m, z3.d
; CHECK-NEXT: sunpklo z6.d, z6.s
-; CHECK-NEXT: scvtf z4.d, p0/m, z4.d
; CHECK-NEXT: sunpklo z7.d, z7.s
; CHECK-NEXT: scvtf z5.d, p0/m, z5.d
+; CHECK-NEXT: scvtf z4.d, p0/m, z4.d
; CHECK-NEXT: scvtf z2.d, p0/m, z2.d
; CHECK-NEXT: scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT: scvtf z6.d, p0/m, z6.d
+; CHECK-NEXT: scvtf z7.d, p0/m, z7.d
; CHECK-NEXT: stp q3, q5, [x1]
-; CHECK-NEXT: movprfx z3, z7
-; CHECK-NEXT: scvtf z3.d, p0/m, z7.d
; CHECK-NEXT: stp q0, q4, [x1, #64]
-; CHECK-NEXT: movprfx z0, z6
-; CHECK-NEXT: scvtf z0.d, p0/m, z6.d
-; CHECK-NEXT: stp q1, q3, [x1, #32]
-; CHECK-NEXT: stp q2, q0, [x1, #96]
+; CHECK-NEXT: stp q1, q7, [x1, #32]
+; CHECK-NEXT: stp q2, q6, [x1, #96]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
@@ -2413,32 +2407,29 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
-; CHECK-NEXT: sunpklo z1.d, z1.s
; CHECK-NEXT: movprfx z6, z3
; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8
; CHECK-NEXT: movprfx z7, z5
; CHECK-NEXT: ext z7.b, z7.b, z5.b, #8
+; CHECK-NEXT: sunpklo z1.d, z1.s
; CHECK-NEXT: sunpklo z3.d, z3.s
; CHECK-NEXT: sunpklo z5.d, z5.s
; CHECK-NEXT: sunpklo z2.d, z2.s
; CHECK-NEXT: sunpklo z4.d, z4.s
-; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: sunpklo z6.d, z6.s
; CHECK-NEXT: sunpklo z7.d, z7.s
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: scvtf z1.d, p0/m, z1.d
; CHECK-NEXT: scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT: scvtf z5.d, p0/m, z5.d
; CHECK-NEXT: scvtf z2.d, p0/m, z2.d
; CHECK-NEXT: scvtf z4.d, p0/m, z4.d
+; CHECK-NEXT: scvtf z6.d, p0/m, z6.d
+; CHECK-NEXT: scvtf z7.d, p0/m, z7.d
; CHECK-NEXT: stp q1, q4, [x1, #64]
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: scvtf z1.d, p0/m, z5.d
+; CHECK-NEXT: stp q5, q7, [x1]
+; CHECK-NEXT: stp q3, q6, [x1, #32]
; CHECK-NEXT: stp q0, q2, [x1, #96]
-; CHECK-NEXT: movprfx z0, z6
-; CHECK-NEXT: scvtf z0.d, p0/m, z6.d
-; CHECK-NEXT: movprfx z2, z7
-; CHECK-NEXT: scvtf z2.d, p0/m, z7.d
-; CHECK-NEXT: stp q1, q2, [x1]
-; CHECK-NEXT: stp q3, q0, [x1, #32]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index b2b4331..48b6dd9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
; fold (fadd (fma x, y, (fpext (fmul u, v))), z) -> (fma x, y, (fma (fpext u), (fpext v), z))
define amdgpu_vs float @test_f16_f32_add_fma_ext_mul(float %x, float %y, float %z, half %u, half %v) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4d603f7..21997e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-FAST-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-FAST-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-FAST-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-FAST-DENORM %s
; fold (fadd fast (fpext (fmul fast x, y)), z) -> (fma (fpext x), (fpext y), z)
; fold (fadd fast x, (fpext (fmul fast y, z))) -> (fma (fpext y), (fpext z), x)
@@ -49,21 +49,26 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s1, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s2, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s3
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s4
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v1
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v2
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v3
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v4
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -94,23 +99,29 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s1, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s2, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s3
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s4
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s5
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v1
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v2
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v3
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v4
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v5
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 6ea0a94..8183a4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s
; fadd (fma a, b, (fmul c, d)), e --> fma a, b, (fma c, d, e)
; fadd e, (fma a, b, (fmul c, d)) --> fma a, b, (fma c, d, e)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 3f6e3d8..1e02f63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll
index 4d6e60c..8879f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
; fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z))
define amdgpu_vs float @test_f16_to_f32_sub_ext_mul(half %x, half %y, float %z) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
index 814a347..df6c8df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
; fold (fsub (fpext (fneg (fmul, x, y))), z) -> (fneg (fma (fpext x), (fpext y), z))
define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 99bdcdd..d046b85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s
; fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 70f961e..c0a828e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
; fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
define float @test_f32_sub_ext_neg_mul(float %x, float %y, float %z) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 0b09cab..067704c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -1,10 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
@@ -25,6 +27,12 @@ define float @v_fma_f32(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -36,6 +44,16 @@ define float @v_fma_f32(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
@@ -62,6 +80,12 @@ define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z)
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f32:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -75,6 +99,17 @@ define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z)
; GFX11-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX11-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z)
ret <2 x float> %fma
}
@@ -102,6 +137,12 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f16:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,6 +161,16 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
@@ -147,6 +198,12 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX9-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f16_fneg_lhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f16_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,6 +221,16 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f16_fneg_lhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
@@ -192,6 +259,12 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX9-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f16_fneg_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f16_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -209,6 +282,16 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f16_fneg_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
@@ -237,6 +320,12 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX9-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f16_fneg_add:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f16_fneg_add:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -254,6 +343,16 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f16_fneg_add:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
@@ -293,6 +392,12 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f16:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -304,6 +409,16 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z)
ret <2 x half> %fma
}
@@ -348,6 +463,12 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f16_fneg_lhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f16_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -359,6 +480,16 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f16_fneg_lhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z)
ret <2 x half> %fma
@@ -404,6 +535,12 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f16_fneg_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f16_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -415,6 +552,16 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f16_fneg_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%y.fneg = fneg <2 x half> %y
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z)
ret <2 x half> %fma
@@ -454,6 +601,12 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f16_fneg_lhs_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f16_fneg_lhs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -465,6 +618,16 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f16_fneg_lhs_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%y.fneg = fneg <2 x half> %y
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg, <2 x half> %z)
@@ -512,6 +675,13 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v3f16:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX90A-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v3f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -525,6 +695,17 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX12-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z)
ret <3 x half> %fma
}
@@ -581,6 +762,13 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v4f16:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX90A-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v4f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -594,6 +782,17 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX12-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z)
ret <4 x half> %fma
}
@@ -617,6 +816,14 @@ define double @v_fma_f64(double %x, double %y, double %z) {
; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f64:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -628,6 +835,16 @@ define double @v_fma_f64(double %x, double %y, double %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call double @llvm.fma.f64(double %x, double %y, double %z)
ret double %fma
}
@@ -651,6 +868,12 @@ define double @v_fma_f64_fneg_all(double %x, double %y, double %z) {
; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f64_fneg_all:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f64_fneg_all:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -662,6 +885,16 @@ define double @v_fma_f64_fneg_all(double %x, double %y, double %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f64_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg double %x
%neg.y = fneg double %y
%neg.z = fneg double %z
@@ -691,6 +924,17 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_v2f64:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fmac_f64_e32 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_fmac_f64_e32 v[10:11], v[2:3], v[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v9
+; GFX90A-NEXT: v_mov_b32_e32 v2, v10
+; GFX90A-NEXT: v_mov_b32_e32 v3, v11
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -704,6 +948,17 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_v2f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX12-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z)
ret <2 x double> %fma
}
@@ -727,6 +982,12 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fabs_lhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fabs_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -738,6 +999,16 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fabs_lhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
ret float %fma
@@ -762,6 +1033,12 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, v0, |v1|, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fabs_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, v0, |v1|, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fabs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -773,6 +1050,16 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, v0, |v1|, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fabs_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.y = call float @llvm.fabs.f32(float %y)
%fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z)
ret float %fma
@@ -797,6 +1084,12 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fabs_lhs_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fabs_lhs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -808,6 +1101,16 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fabs_lhs_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fabs.y = call float @llvm.fabs.f32(float %y)
%fma = call float @llvm.fma.f32(float %fabs.x, float %fabs.y, float %z)
@@ -830,6 +1133,11 @@ define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float
; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX90A-LABEL: v_fma_f32_sgpr_vgpr_vgpr:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
; GFX10-LABEL: v_fma_f32_sgpr_vgpr_vgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1
@@ -839,6 +1147,11 @@ define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float
; GFX11: ; %bb.0:
; GFX11-NEXT: v_fma_f32 v0, s0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: v_fma_f32_sgpr_vgpr_vgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
@@ -859,6 +1172,11 @@ define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float
; GFX9-NEXT: v_fma_f32 v0, v0, s0, v1
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX90A-LABEL: v_fma_f32_vgpr_sgpr_vgpr:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
; GFX10-LABEL: v_fma_f32_vgpr_sgpr_vgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1
@@ -868,6 +1186,11 @@ define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float
; GFX11: ; %bb.0:
; GFX11-NEXT: v_fma_f32 v0, s0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: v_fma_f32_vgpr_sgpr_vgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
@@ -894,6 +1217,13 @@ define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y,
; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX90A-LABEL: v_fma_f32_sgpr_sgpr_sgpr:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
; GFX10-LABEL: v_fma_f32_sgpr_sgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, s2
@@ -905,6 +1235,12 @@ define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y,
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_fma_f32 v0, s1, s0, v0
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: v_fma_f32_sgpr_sgpr_sgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_fmac_f32 s2, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: ; return to shader part epilog
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
@@ -928,6 +1264,12 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, -v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fneg_lhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, -v0, v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -939,6 +1281,16 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, -v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fneg_lhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
ret float %fma
@@ -963,6 +1315,12 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, v0, -v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fneg_rhs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, v0, -v1, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -974,6 +1332,16 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, v0, -v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fneg_rhs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg float %y
%fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z)
ret float %fma
@@ -998,6 +1366,12 @@ define float @v_fma_f32_fneg_z(float %x, float %y, float %z) {
; GFX9-NEXT: v_fma_f32 v0, v0, v1, -v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX90A-LABEL: v_fma_f32_fneg_z:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_fma_f32 v0, v0, v1, -v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: v_fma_f32_fneg_z:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1009,6 +1383,16 @@ define float @v_fma_f32_fneg_z(float %x, float %y, float %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f32 v0, v0, v1, -v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_fma_f32_fneg_z:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg float %z
%fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z)
ret float %fma
@@ -1030,6 +1414,11 @@ define amdgpu_ps float @dont_crash_after_fma_mix_select_attempt(float inreg %x,
; GFX9-NEXT: v_fma_f32 v0, |s0|, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX90A-LABEL: dont_crash_after_fma_mix_select_attempt:
+; GFX90A: ; %bb.0: ; %.entry
+; GFX90A-NEXT: v_fma_f32 v0, |s0|, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
; GFX10-LABEL: dont_crash_after_fma_mix_select_attempt:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_fma_f32 v0, |s0|, v0, v1
@@ -1039,12 +1428,331 @@ define amdgpu_ps float @dont_crash_after_fma_mix_select_attempt(float inreg %x,
; GFX11: ; %bb.0: ; %.entry
; GFX11-NEXT: v_fma_f32 v0, |s0|, v0, v1
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: dont_crash_after_fma_mix_select_attempt:
+; GFX12: ; %bb.0: ; %.entry
+; GFX12-NEXT: v_fma_f32 v0, |s0|, v0, v1
+; GFX12-NEXT: ; return to shader part epilog
.entry:
%fabs.x = call contract float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
ret float %fma
}
+define amdgpu_ps half @fma_s16_uniform(half inreg %a, half inreg %b, half inreg %c) {
+; GFX6-LABEL: fma_s16_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s2
+; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: fma_s16_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_fma_f16 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fma_s16_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_fma_f16 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX90A-LABEL: fma_s16_uniform:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NEXT: v_fma_f16 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_s16_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_fma_f16 v0, s1, s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fma_s16_uniform:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_fmac_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: fma_s16_uniform:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, s1, s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fma_s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_fmac_f16 s2, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: ; return to shader part epilog
+ %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %fma
+}
+
+define amdgpu_ps float @fma_s32_uniform(float inreg %a, float inreg %b, float inreg %c) {
+; GFX6-LABEL: fma_s32_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: fma_s32_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fma_s32_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX90A-LABEL: fma_s32_uniform:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_s32_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_fma_f32 v0, s1, s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: fma_s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_fma_f32 v0, s1, s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fma_s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_fmac_f32 s2, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: ; return to shader part epilog
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ ret float %fma
+}
+
+define amdgpu_ps void @fma_s64_uniform(double inreg %a, double inreg %b, double inreg %c, ptr addrspace(1) %ptr) {
+; GFX6-LABEL: fma_s64_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b64 s[0:1], 0
+; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: fma_s64_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: fma_s64_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX90A-LABEL: fma_s64_uniform:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], s[0:1], v[2:3]
+; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX90A-NEXT: s_endpgm
+;
+; GFX10-LABEL: fma_s64_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s5
+; GFX10-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3]
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_s64_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fma_s64_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3]
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fma = call double @llvm.fma.f64(double %a, double %b, double %c)
+ store double %fma, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b, <2 x half> inreg %c) {
+; GFX6-LABEL: fma_v2s16_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, s1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, s5
+; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: fma_v2s16_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: s_lshr_b32 s4, s1, 16
+; GFX8-NEXT: s_lshr_b32 s5, s2, 16
+; GFX8-NEXT: v_fma_f16 v0, s0, v0, v1
+; GFX8-NEXT: s_lshr_b32 s3, s0, 16
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_fma_f16 v0, s3, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fma_v2s16_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_pk_fma_f16 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX90A-LABEL: fma_v2s16_uniform:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NEXT: v_pk_fma_f16 v0, s0, v0, v1
+; GFX90A-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_v2s16_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_pk_fma_f16 v0, s0, s1, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: fma_v2s16_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_pk_fma_f16 v0, s0, s1, v0
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fma_v2s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s3, s0, 16
+; GFX12-NEXT: s_lshr_b32 s4, s1, 16
+; GFX12-NEXT: s_lshr_b32 s5, s2, 16
+; GFX12-NEXT: s_fmac_f16 s2, s0, s1
+; GFX12-NEXT: s_fmac_f16 s5, s3, s4
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s2, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %fma
+}
+
+define amdgpu_ps <2 x float> @fma_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b, <2 x float> inreg %c) {
+; GFX6-LABEL: fma_v2s32_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: v_fma_f32 v1, s1, v1, v2
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: fma_v2s32_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_fma_f32 v1, s1, v1, v2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fma_v2s32_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_fma_f32 v1, s1, v1, v2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX90A-LABEL: fma_v2s32_uniform:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_fma_f32 v[0:1], s[0:1], v[0:1], v[2:3]
+; GFX90A-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_v2s32_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_fma_f32 v0, s2, s0, v0
+; GFX10-NEXT: v_fma_f32 v1, s3, s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: fma_v2s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_fma_f32 v0, s2, s0, v0
+; GFX11-NEXT: v_fma_f32 v1, s3, s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fma_v2s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_fmac_f32 s4, s0, s2
+; GFX12-NEXT: s_fmac_f32 s5, s1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: ; return to shader part epilog
+ %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <2 x float> %fma
+}
+
declare half @llvm.fma.f16(half, half, half) #0
declare float @llvm.fma.f32(float, float, float) #0
declare double @llvm.fma.f64(double, double, double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll
new file mode 100644
index 0000000..4907ee1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX10 %s
+
+define amdgpu_ps float @fmad_s32_uniform(float inreg %a, float inreg %b, float inreg %c) {
+; GFX8-LABEL: fmad_s32_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mad_f32 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fmad_s32_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mad_f32 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmad_s32_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mad_f32 v0, s1, s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %mul = fmul float %a, %b
+ %result = fadd float %mul, %c
+ ret float %result
+}
+
+define amdgpu_ps float @fmad_s32_div(float %a, float %b, float %c) {
+; GFX8-LABEL: fmad_s32_div:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fmad_s32_div:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmad_s32_div:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX10-NEXT: ; return to shader part epilog
+ %mul = fmul float %a, %b
+ %result = fadd float %mul, %c
+ ret float %result
+}
+
+define amdgpu_ps half @fmad_s16_uniform(half inreg %a, half inreg %b, half inreg %c) {
+; GFX8-LABEL: fmad_s16_uniform:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mad_f16 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fmad_s16_uniform:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mad_legacy_f16 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmad_s16_uniform:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mul_f16_e64 v0, s0, s1
+; GFX10-NEXT: v_add_f16_e32 v0, s2, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %mul = fmul half %a, %b
+ %result = fadd half %mul, %c
+ ret half %result
+}
+
+define amdgpu_ps half @fmad_s16_div(half %a, half %b, half %c) {
+; GFX8-LABEL: fmad_s16_div:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mad_f16 v0, v0, v1, v2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: fmad_s16_div:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmad_s16_div:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX10-NEXT: ; return to shader part epilog
+ %mul = fmul half %a, %b
+ %result = fadd half %mul, %c
+ ret half %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
index dc4545b..cc2a8ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 1220c0e..1e7c7dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -1,10 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
-
-; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support.
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fmul_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll
index 2351bf2..5242532 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s
define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 {
; CHECK-LABEL: test_fmamix_constant_bus_violation_sss:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir
index d63fc07..9dfc770 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s
---
name: fma_sss
@@ -15,10 +14,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]]
+ ; CHECK-NEXT: [[FMA:%[0-9]+]]:sgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = COPY $sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
index b97cd91..1fecc2b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
@@ -4,8 +4,15 @@
define double @rsq_f64(double %x) {
; CHECK-LABEL: define double @rsq_f64(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -16,8 +23,15 @@ define double @rsq_f64(double %x) {
define double @neg_rsq_f64(double %x) {
; CHECK-LABEL: define double @neg_rsq_f64(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP4]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -28,8 +42,15 @@ define double @neg_rsq_f64(double %x) {
define double @rsq_f64_nnan(double %x) {
; CHECK-LABEL: define double @rsq_f64_nnan(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -40,8 +61,15 @@ define double @rsq_f64_nnan(double %x) {
define double @neg_rsq_f64_nnan(double %x) {
; CHECK-LABEL: define double @neg_rsq_f64_nnan(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan contract double [[TMP6]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP4]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -52,8 +80,13 @@ define double @neg_rsq_f64_nnan(double %x) {
define double @rsq_f64_ninf(double %x) {
; CHECK-LABEL: define double @rsq_f64_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -64,8 +97,13 @@ define double @rsq_f64_ninf(double %x) {
define double @neg_rsq_f64_ninf(double %x) {
; CHECK-LABEL: define double @neg_rsq_f64_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv ninf contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul ninf contract double [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP2]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -76,8 +114,13 @@ define double @neg_rsq_f64_ninf(double %x) {
define double @rsq_f64_nnan_ninf(double %x) {
; CHECK-LABEL: define double @rsq_f64_nnan_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -88,8 +131,13 @@ define double @rsq_f64_nnan_ninf(double %x) {
define double @neg_rsq_f64_nnan_ninf(double %x) {
; CHECK-LABEL: define double @neg_rsq_f64_nnan_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan ninf contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP2]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -100,8 +148,15 @@ define double @neg_rsq_f64_nnan_ninf(double %x) {
define double @rsq_f64_sqrt_nnan_ninf(double %x) {
; CHECK-LABEL: define double @rsq_f64_sqrt_nnan_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp nnan ninf contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select nnan ninf contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan ninf contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -112,8 +167,13 @@ define double @rsq_f64_sqrt_nnan_ninf(double %x) {
define double @rsq_f64_fdiv_nnan_ninf(double %x) {
; CHECK-LABEL: define double @rsq_f64_fdiv_nnan_ninf(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -125,7 +185,30 @@ define <2 x double> @rsq_v2f64(<2 x double> %x) {
; CHECK-LABEL: define <2 x double> @rsq_v2f64(
; CHECK-SAME: <2 x double> [[X:%.*]]) {
; CHECK-NEXT: [[SQRT_X:%.*]] = call contract <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract <2 x double> splat (double 1.000000e+00), [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP3]], i32 608)
+; CHECK-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], double [[TMP5]], double [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = fneg contract double [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fmul contract double [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = call contract double @llvm.fma.f64(double [[TMP9]], double [[TMP5]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract double [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = call contract double @llvm.fma.f64(double [[TMP10]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP13:%.*]] = call contract double @llvm.fma.f64(double [[TMP11]], double [[TMP12]], double [[TMP5]])
+; CHECK-NEXT: [[TMP14:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP4]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP4]], i32 608)
+; CHECK-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], double [[TMP14]], double [[TMP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = fneg contract double [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = fmul contract double [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = call contract double @llvm.fma.f64(double [[TMP18]], double [[TMP14]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP20:%.*]] = fmul contract double [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[TMP21:%.*]] = call contract double @llvm.fma.f64(double [[TMP19]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP22:%.*]] = call contract double @llvm.fma.f64(double [[TMP20]], double [[TMP21]], double [[TMP14]])
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i64 0
+; CHECK-NEXT: [[FDIV:%.*]] = insertelement <2 x double> [[TMP23]], double [[TMP22]], i64 1
; CHECK-NEXT: ret <2 x double> [[FDIV]]
;
%sqrt.x = call contract <2 x double> @llvm.sqrt.f64(<2 x double> %x)
@@ -137,7 +220,30 @@ define <2 x double> @neg_rsq_v2f64(<2 x double> %x) {
; CHECK-LABEL: define <2 x double> @neg_rsq_v2f64(
; CHECK-SAME: <2 x double> [[X:%.*]]) {
; CHECK-NEXT: [[SQRT_X:%.*]] = call contract <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract <2 x double> splat (double 1.000000e+00), [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP3]], i32 608)
+; CHECK-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], double [[TMP5]], double [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = fneg contract double [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fmul contract double [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = call contract double @llvm.fma.f64(double [[TMP9]], double [[TMP5]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract double [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = call contract double @llvm.fma.f64(double [[TMP10]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP13:%.*]] = call contract double @llvm.fma.f64(double [[TMP11]], double [[TMP12]], double [[TMP5]])
+; CHECK-NEXT: [[TMP14:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP4]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP4]], i32 608)
+; CHECK-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], double [[TMP14]], double [[TMP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = fneg contract double [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = fmul contract double [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = call contract double @llvm.fma.f64(double [[TMP18]], double [[TMP14]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP20:%.*]] = fmul contract double [[TMP19]], [[TMP14]]
+; CHECK-NEXT: [[TMP21:%.*]] = call contract double @llvm.fma.f64(double [[TMP19]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP22:%.*]] = call contract double @llvm.fma.f64(double [[TMP20]], double [[TMP21]], double [[TMP14]])
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i64 0
+; CHECK-NEXT: [[FDIV:%.*]] = insertelement <2 x double> [[TMP23]], double [[TMP22]], i64 1
; CHECK-NEXT: ret <2 x double> [[FDIV]]
;
%sqrt.x = call contract <2 x double> @llvm.sqrt.f64(<2 x double> %x)
@@ -149,7 +255,30 @@ define <2 x double> @mixed_sign_rsq_v2f64(<2 x double> %x) {
; CHECK-LABEL: define <2 x double> @mixed_sign_rsq_v2f64(
; CHECK-SAME: <2 x double> [[X:%.*]]) {
; CHECK-NEXT: [[SQRT_X:%.*]] = call contract <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract <2 x double> <double 1.000000e+00, double -1.000000e+00>, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP3]], i32 608)
+; CHECK-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], double [[TMP5]], double [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = fneg contract double [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fmul contract double [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = call contract double @llvm.fma.f64(double [[TMP9]], double [[TMP5]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract double [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = call contract double @llvm.fma.f64(double [[TMP10]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP13:%.*]] = call contract double @llvm.fma.f64(double [[TMP11]], double [[TMP12]], double [[TMP5]])
+; CHECK-NEXT: [[TMP14:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP4]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP4]], i32 608)
+; CHECK-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], double [[TMP14]], double [[TMP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = fneg contract double [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = fmul contract double [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = call contract double @llvm.fma.f64(double [[TMP18]], double [[TMP14]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP20:%.*]] = fmul contract double [[TMP19]], [[TMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = call contract double @llvm.fma.f64(double [[TMP19]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP23:%.*]] = call contract double @llvm.fma.f64(double [[TMP20]], double [[TMP21]], double [[TMP17]])
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i64 0
+; CHECK-NEXT: [[FDIV:%.*]] = insertelement <2 x double> [[TMP24]], double [[TMP23]], i64 1
; CHECK-NEXT: ret <2 x double> [[FDIV]]
;
%sqrt.x = call contract <2 x double> @llvm.sqrt.f64(<2 x double> %x)
@@ -161,7 +290,22 @@ define <2 x double> @rsq_some_elements_v2f64(<2 x double> %x) {
; CHECK-LABEL: define <2 x double> @rsq_some_elements_v2f64(
; CHECK-SAME: <2 x double> [[X:%.*]]) {
; CHECK-NEXT: [[SQRT_X:%.*]] = call contract <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract <2 x double> <double 1.000000e+00, double 2.000000e+00>, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[SQRT_X]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[X]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP3]], i32 608)
+; CHECK-NEXT: [[TMP7:%.*]] = select contract i1 [[TMP6]], double [[TMP5]], double [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = fneg contract double [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fmul contract double [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = call contract double @llvm.fma.f64(double [[TMP9]], double [[TMP5]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract double [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = call contract double @llvm.fma.f64(double [[TMP10]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[TMP13:%.*]] = call contract double @llvm.fma.f64(double [[TMP11]], double [[TMP12]], double [[TMP5]])
+; CHECK-NEXT: [[TMP14:%.*]] = fdiv contract double 2.000000e+00, [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i64 0
+; CHECK-NEXT: [[FDIV:%.*]] = insertelement <2 x double> [[TMP15]], double [[TMP14]], i64 1
; CHECK-NEXT: ret <2 x double> [[FDIV]]
;
%sqrt.x = call contract <2 x double> @llvm.sqrt.f64(<2 x double> %x)
@@ -324,8 +468,15 @@ define double @rsq_amdgcn_f64_nnan_ninf(double %x) {
define double @rsq_f64_input_known_not_zero(double nofpclass(zero) %x) {
; CHECK-LABEL: define double @rsq_f64_input_known_not_zero(
; CHECK-SAME: double nofpclass(zero) [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -336,8 +487,15 @@ define double @rsq_f64_input_known_not_zero(double nofpclass(zero) %x) {
define double @rsq_f64_input_known_not_pinf(double nofpclass(pinf) %x) {
; CHECK-LABEL: define double @rsq_f64_input_known_not_pinf(
; CHECK-SAME: double nofpclass(pinf) [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -348,8 +506,13 @@ define double @rsq_f64_input_known_not_pinf(double nofpclass(pinf) %x) {
define double @rsq_f64_input_known_not_pinf_zero(double nofpclass(pinf zero) %x) {
; CHECK-LABEL: define double @rsq_f64_input_known_not_pinf_zero(
; CHECK-SAME: double nofpclass(pinf zero) [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -360,8 +523,15 @@ define double @rsq_f64_input_known_not_pinf_zero(double nofpclass(pinf zero) %x)
define double @rsq_f64_input_known_not_pinf_zero_dynamic_fp(double nofpclass(pinf zero) %x) #0 {
; CHECK-LABEL: define double @rsq_f64_input_known_not_pinf_zero_dynamic_fp(
; CHECK-SAME: double nofpclass(pinf zero) [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -372,8 +542,15 @@ define double @rsq_f64_input_known_not_pinf_zero_dynamic_fp(double nofpclass(pin
define double @rsq_f64_input_known_not_pinf_zero_daz(double nofpclass(pinf zero) %x) #1 {
; CHECK-LABEL: define double @rsq_f64_input_known_not_pinf_zero_daz(
; CHECK-SAME: double nofpclass(pinf zero) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -384,8 +561,13 @@ define double @rsq_f64_input_known_not_pinf_zero_daz(double nofpclass(pinf zero)
define double @rsq_f64_input_known_not_pinf_zero_denorm_daz(double nofpclass(pinf zero sub) %x) #1 {
; CHECK-LABEL: define double @rsq_f64_input_known_not_pinf_zero_denorm_daz(
; CHECK-SAME: double nofpclass(pinf zero sub) [[X:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -396,8 +578,17 @@ define double @rsq_f64_input_known_not_pinf_zero_denorm_daz(double nofpclass(pin
define double @rsq_f64_dynamic_denormal(double %x) #0 {
; CHECK-LABEL: define double @rsq_f64_dynamic_denormal(
; CHECK-SAME: double [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = select contract i1 [[TMP4]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP6:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP9:%.*]] = fmul contract double [[TMP8]], [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = call contract double @llvm.fma.f64(double [[TMP8]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP9]], double [[TMP10]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -408,8 +599,15 @@ define double @rsq_f64_dynamic_denormal(double %x) #0 {
define double @rsq_f64_dynamic_denormal_no_pinf(double nofpclass(pinf) %x) #0 {
; CHECK-LABEL: define double @rsq_f64_dynamic_denormal_no_pinf(
; CHECK-SAME: double nofpclass(pinf) [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -420,8 +618,15 @@ define double @rsq_f64_dynamic_denormal_no_pinf(double nofpclass(pinf) %x) #0 {
define double @rsq_f64_dynamic_denormal_no_zero_no_denorm(double nofpclass(zero sub) %x) #0 {
; CHECK-LABEL: define double @rsq_f64_dynamic_denormal_no_zero_no_denorm(
; CHECK-SAME: double nofpclass(zero sub) [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -432,8 +637,15 @@ define double @rsq_f64_dynamic_denormal_no_zero_no_denorm(double nofpclass(zero
define double @rsq_f64_nnan_sqrt(double %x) {
; CHECK-LABEL: define double @rsq_f64_nnan_sqrt(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -444,8 +656,15 @@ define double @rsq_f64_nnan_sqrt(double %x) {
define double @rsq_f64_nnan_fdiv(double %x) {
; CHECK-LABEL: define double @rsq_f64_nnan_fdiv(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT: [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -456,8 +675,15 @@ define double @rsq_f64_nnan_fdiv(double %x) {
define double @rsq_f64_ninf_sqrt(double %x) {
; CHECK-LABEL: define double @rsq_f64_ninf_sqrt(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp ninf contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = select ninf contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul ninf contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul ninf contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -468,8 +694,13 @@ define double @rsq_f64_ninf_sqrt(double %x) {
define double @rsq_f64_ninf_fdiv(double %x) {
; CHECK-LABEL: define double @rsq_f64_ninf_fdiv(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -480,8 +711,15 @@ define double @rsq_f64_ninf_fdiv(double %x) {
define double @rsq_f64_ninf_sqrt_nnan_fdiv(double %x) {
; CHECK-LABEL: define double @rsq_f64_ninf_sqrt_nnan_fdiv(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv nnan contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT: [[TMP8:%.*]] = select nnan ninf contract i1 [[TMP7]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract double [[TMP8]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -492,8 +730,13 @@ define double @rsq_f64_ninf_sqrt_nnan_fdiv(double %x) {
define double @rsq_f64_nann_sqrt_ninf_fdiv(double %x) {
; CHECK-LABEL: define double @rsq_f64_nann_sqrt_ninf_fdiv(
; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT: [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan ninf contract double [[X]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -506,8 +749,15 @@ define double @rsq_f64_assume_nonzero(double %x) {
; CHECK-SAME: double [[X:%.*]]) {
; CHECK-NEXT: [[NONZERO:%.*]] = fcmp one double [[X]], 0.000000e+00
; CHECK-NEXT: call void @llvm.assume(i1 [[NONZERO]])
-; CHECK-NEXT: [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT: [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT: [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul contract double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul contract double [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT: [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
; CHECK-NEXT: ret double [[FDIV]]
;
%nonzero = fcmp one double %x, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index e34fdd9..43bfe73 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -1,9 +1,16 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SDAG,SI-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GISEL,SI-GISEL %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG,VI-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL,VI-GISEL %s
+; Test amdgpu-codegenprepare implementation of rsq formation
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-IR,SI-SDAG,SI-SDAG-IR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-IR,SI-GISEL,SI-GISEL-IR %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-IR,VI-SDAG,VI-SDAG-IR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-IR,VI-GISEL,VI-GISEL-IR %s
+
+; Test codegen implementation.
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-CG,SI-SDAG,SI-SDAG-CG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-CG,SI-GISEL,SI-GISEL-CG %s
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-CG,VI-SDAG,VI-SDAG-CG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-CG,VI-GISEL,VI-GISEL-CG %s
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane(i32)
@@ -13,173 +20,253 @@ declare double @llvm.amdgcn.sqrt.f64(double)
declare double @llvm.fabs.f64(double)
define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
-; SI-SDAG-LABEL: s_rsq_f64:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
-; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SI-SDAG-NEXT: ; return to shader part epilog
+; SI-SDAG-IR-LABEL: s_rsq_f64:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; SI-GISEL-LABEL: s_rsq_f64:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; SI-GISEL-NEXT: ; return to shader part epilog
+; SI-GISEL-IR-LABEL: s_rsq_f64:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-IR-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_rsq_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
+; VI-SDAG-IR-LABEL: s_rsq_f64:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; VI-GISEL-LABEL: s_rsq_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-GISEL-IR-LABEL: s_rsq_f64:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-IR-NEXT: ; return to shader part epilog
+;
+; SI-SDAG-CG-LABEL: s_rsq_f64:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-CG-NEXT: s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-CG-LABEL: s_rsq_f64:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-CG-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-CG-LABEL: s_rsq_f64:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-CG-LABEL: s_rsq_f64:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-CG-NEXT: ; return to shader part epilog
%rsq = call contract double @llvm.sqrt.f64(double %x)
%result = fdiv contract double 1.0, %rsq
%cast = bitcast double %result to <2 x i32>
@@ -193,173 +280,257 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
-; SI-SDAG-LABEL: s_rsq_f64_fabs:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
-; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SI-SDAG-NEXT: ; return to shader part epilog
+; SI-SDAG-IR-LABEL: s_rsq_f64_fabs:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: v_rsq_f64_e64 v[0:1], |s[0:1]|
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-SDAG-IR-NEXT: s_and_b32 s2, s1, 0x7fffffff
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, |s[0:1]|, v2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; SI-GISEL-LABEL: s_rsq_f64_fabs:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; SI-GISEL-NEXT: ; return to shader part epilog
+; SI-GISEL-IR-LABEL: s_rsq_f64_fabs:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: v_rsq_f64_e64 v[0:1], |s[0:1]|
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-GISEL-IR-NEXT: s_and_b32 s2, s1, 0x7fffffff
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, |s[0:1]|, v2
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s2
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-IR-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_rsq_f64_fabs:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
-; VI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
+; VI-SDAG-IR-LABEL: s_rsq_f64_fabs:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: v_rsq_f64_e64 v[0:1], |s[0:1]|
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, |s[0:1]|, v2
+; VI-SDAG-IR-NEXT: s_and_b32 s2, s1, 0x7fffffff
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s2
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; VI-GISEL-LABEL: s_rsq_f64_fabs:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-GISEL-IR-LABEL: s_rsq_f64_fabs:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: v_rsq_f64_e64 v[0:1], |s[0:1]|
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, |s[0:1]|, v2
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-IR-NEXT: s_and_b32 s0, s1, 0x7fffffff
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-IR-NEXT: ; return to shader part epilog
+;
+; SI-SDAG-CG-LABEL: s_rsq_f64_fabs:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-CG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-CG-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; SI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-CG-NEXT: s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-CG-LABEL: s_rsq_f64_fabs:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-CG-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-CG-LABEL: s_rsq_f64_fabs:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-CG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-SDAG-CG-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; VI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-CG-LABEL: s_rsq_f64_fabs:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-CG-NEXT: ; return to shader part epilog
%fabs.x = call double @llvm.fabs.f64(double %x)
%rsq = call contract double @llvm.sqrt.f64(double %fabs.x)
%result = fdiv contract double 1.0, %rsq
@@ -374,173 +545,253 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
-; SI-SDAG-LABEL: s_neg_rsq_f64:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
-; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SI-SDAG-NEXT: ; return to shader part epilog
+; SI-SDAG-IR-LABEL: s_neg_rsq_f64:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; SI-GISEL-LABEL: s_neg_rsq_f64:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; SI-GISEL-NEXT: ; return to shader part epilog
+; SI-GISEL-IR-LABEL: s_neg_rsq_f64:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-IR-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_neg_rsq_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
+; VI-SDAG-IR-LABEL: s_neg_rsq_f64:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; VI-GISEL-LABEL: s_neg_rsq_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-GISEL-IR-LABEL: s_neg_rsq_f64:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-IR-NEXT: ; return to shader part epilog
+;
+; SI-SDAG-CG-LABEL: s_neg_rsq_f64:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-CG-NEXT: s_mov_b32 s2, 0xbff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-CG-LABEL: s_neg_rsq_f64:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-CG-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-CG-LABEL: s_neg_rsq_f64:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-CG-LABEL: s_neg_rsq_f64:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-CG-NEXT: ; return to shader part epilog
%rsq = call contract double @llvm.sqrt.f64(double %x)
%result = fdiv contract double -1.0, %rsq
%cast = bitcast double %result to <2 x i32>
@@ -554,173 +805,257 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
-; SI-SDAG-LABEL: s_neg_rsq_neg_f64:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
-; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SI-SDAG-NEXT: ; return to shader part epilog
+; SI-SDAG-IR-LABEL: s_neg_rsq_neg_f64:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: v_rsq_f64_e64 v[0:1], -s[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-SDAG-IR-NEXT: s_xor_b32 s2, s1, 0x80000000
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, -s[0:1], v2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; SI-GISEL-LABEL: s_neg_rsq_neg_f64:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; SI-GISEL-NEXT: ; return to shader part epilog
+; SI-GISEL-IR-LABEL: s_neg_rsq_neg_f64:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: v_rsq_f64_e64 v[0:1], -s[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-GISEL-IR-NEXT: s_xor_b32 s2, s1, 0x80000000
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, -s[0:1], v2
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s2
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-IR-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_neg_rsq_neg_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
+; VI-SDAG-IR-LABEL: s_neg_rsq_neg_f64:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: v_rsq_f64_e64 v[0:1], -s[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, -s[0:1], v2
+; VI-SDAG-IR-NEXT: s_xor_b32 s2, s1, 0x80000000
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s2
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; VI-GISEL-LABEL: s_neg_rsq_neg_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-GISEL-IR-LABEL: s_neg_rsq_neg_f64:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: v_rsq_f64_e64 v[0:1], -s[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, -s[0:1], v2
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-IR-NEXT: s_xor_b32 s0, s1, 0x80000000
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-IR-NEXT: ; return to shader part epilog
+;
+; SI-SDAG-CG-LABEL: s_neg_rsq_neg_f64:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 9
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-CG-NEXT: s_mov_b32 s2, 0xbff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-CG-LABEL: s_neg_rsq_neg_f64:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-CG-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-CG-LABEL: s_neg_rsq_neg_f64:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 9
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-CG-LABEL: s_neg_rsq_neg_f64:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-CG-NEXT: ; return to shader part epilog
%x.neg = fneg double %x
%rsq = call contract double @llvm.sqrt.f64(double %x.neg)
%result = fdiv contract double -1.0, %rsq
@@ -735,338 +1070,478 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
}
define double @v_rsq_f64(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract double 1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64_fabs(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64_fabs:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64_fabs:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e64 v[2:3], |v[0:1]|
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v5, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, |v[0:1]|, v5
+; SI-SDAG-IR-NEXT: v_and_b32_e32 v4, 0x7fffffff, v1
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64_fabs:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64_fabs:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e64 v[2:3], |v[0:1]|
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, |v[0:1]|, v5
+; SI-GISEL-IR-NEXT: v_and_b32_e32 v4, 0x7fffffff, v1
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64_fabs:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64_fabs:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e64 v[2:3], |v[0:1]|
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e64 vcc, |v[0:1]|, v4
+; VI-SDAG-IR-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64_fabs:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64_fabs:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e64 v[2:3], |v[0:1]|
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e64 vcc, |v[0:1]|, v4
+; VI-GISEL-IR-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64_fabs:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64_fabs:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64_fabs:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64_fabs:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call double @llvm.fabs.f64(double %x)
%sqrt = call contract double @llvm.sqrt.f64(double %fabs.x)
%rsq = fdiv contract double 1.0, %sqrt
@@ -1412,169 +1887,237 @@ define double @v_rsq_f64_missing_contract1(double %x) {
}
define double @v_neg_rsq_f64(double %x) {
-; SI-SDAG-LABEL: v_neg_rsq_f64:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_neg_rsq_f64:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_neg_rsq_f64:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_neg_rsq_f64:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_neg_rsq_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_neg_rsq_f64:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_neg_rsq_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_neg_rsq_f64:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_neg_rsq_f64:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0xbff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_neg_rsq_f64:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_neg_rsq_f64:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_neg_rsq_f64:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract double -1.0, %sqrt
ret double %rsq
@@ -2710,169 +3253,241 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
}
define double @v_rsq_f64_fneg_fabs(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 9
-; SI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64_fneg_fabs:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e64 v[2:3], -|v[0:1]|
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_brev_b32 s5, 1
+; SI-SDAG-IR-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-IR-NEXT: v_or_b32_e32 v4, 0x80000000, v1
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64_fneg_fabs:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e64 v[2:3], -|v[0:1]|
+; SI-GISEL-IR-NEXT: v_cmp_eq_f64_e64 vcc, -|v[0:1]|, 0
+; SI-GISEL-IR-NEXT: v_or_b32_e32 v4, 0x80000000, v1
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 9
-; VI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64_fneg_fabs:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e64 v[2:3], -|v[0:1]|
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_brev_b32 s5, 1
+; VI-SDAG-IR-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-IR-NEXT: v_or_b32_e32 v4, 0x80000000, v1
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64_fneg_fabs:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e64 v[2:3], -|v[0:1]|
+; VI-GISEL-IR-NEXT: v_cmp_eq_f64_e64 vcc, -|v[0:1]|, 0
+; VI-GISEL-IR-NEXT: v_or_b32_e32 v1, 0x80000000, v1
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64_fneg_fabs:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 9
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64_fneg_fabs:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64_fneg_fabs:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 9
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64_fneg_fabs:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%fabs = call double @llvm.fabs.f64(double %x)
%fneg.fabs = fneg double %fabs
%sqrt = call contract double @llvm.sqrt.f64(double %fneg.fabs)
@@ -2881,1753 +3496,2457 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
}
define double @v_rsq_f64__afn_sqrt(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn_sqrt:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn_sqrt:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn_sqrt:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn_sqrt:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn_sqrt:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn_sqrt:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn_sqrt:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn_sqrt:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract double 1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__afn_fdiv(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn_fdiv:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn_fdiv:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn_fdiv:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn_fdiv:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn_fdiv:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn_fdiv:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn_fdiv:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn_fdiv:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double 1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__afn(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double 1.0, %sqrt
ret double %rsq
}
define double @v_neg_rsq_f64__afn(double %x) {
-; SI-SDAG-LABEL: v_neg_rsq_f64__afn:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_neg_rsq_f64__afn:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_neg_rsq_f64__afn:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_neg_rsq_f64__afn:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_neg_rsq_f64__afn:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_neg_rsq_f64__afn:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_neg_rsq_f64__afn:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_neg_rsq_f64__afn:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_neg_rsq_f64__afn:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_neg_rsq_f64__afn:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double -1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__afn_ninf(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn_ninf:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn_ninf:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn_ninf:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn_ninf:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn_ninf:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn_ninf:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn_ninf:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn_ninf:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn_ninf:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn_ninf:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn ninf double 1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__afn_nnan(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn_nnan:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn_nnan:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn_nnan:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn_nnan:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn_nnan:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn_nnan:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn_nnan:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn_nnan:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn_nnan:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn_nnan:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan double 1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__afn_nnan_ninf(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan ninf double 1.0, %sqrt
ret double %rsq
}
define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
-; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan ninf double -1.0, %sqrt
ret double %rsq
}
define double @v_rsq_f64__nnan_ninf(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64__nnan_ninf:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
-; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64__nnan_ninf:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64__nnan_ninf:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64__nnan_ninf:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64__nnan_ninf:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: s_xor_b64 vcc, s[4:5], vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64__nnan_ninf:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; SI-GISEL-CG-NEXT: v_cmp_eq_u32_e32 vcc, v9, v10
+; SI-GISEL-CG-NEXT: s_xor_b64 vcc, vcc, s[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64__nnan_ninf:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-SDAG-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64__nnan_ninf:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-GISEL-CG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-GISEL-CG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract nnan ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract nnan ninf double 1.0, %sqrt
ret double %rsq
}
define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
-; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-SDAG-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
-; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
-; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9]
-; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
-; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[4:5]
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[6:7]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 1.0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[4:5]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[6:7]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
-; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
-; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v10, 0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v11, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[4:5]
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[6:7]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 1.0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], 0.5
+; SI-GISEL-IR-NEXT: v_mul_f64 v[12:13], v[2:3], v[6:7]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[4:5]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[12:13], v[2:3], v[6:7]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
-; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[5:6], v[0:1]
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[7:8], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8]
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[4:5]
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[6:7]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 1.0
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[4:5]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[6:7]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
-; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v8, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v9, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[4:5]
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[6:7]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 1.0
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[10:11], v[0:1], v[4:5]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], 0.5
+; VI-GISEL-IR-NEXT: v_mul_f64 v[12:13], v[2:3], v[6:7]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[10:11], v[0:1], v[4:5]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[12:13], v[2:3], v[6:7]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v12, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v5, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v6, 8, v12
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v13, 0x260
+; SI-GISEL-CG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-CG-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[5:6], v[0:1]
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[7:8], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v5, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
%rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt
ret <2 x double> %rsq
}
define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) {
-; SI-SDAG-LABEL: s_rsq_f64_unsafe:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SI-SDAG-NEXT: ; return to shader part epilog
+; SI-SDAG-IR-LABEL: s_rsq_f64_unsafe:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-IR-NEXT: ; return to shader part epilog
;
-; SI-GISEL-LABEL: s_rsq_f64_unsafe:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; SI-GISEL-NEXT: ; return to shader part epilog
+; SI-GISEL-IR-LABEL: s_rsq_f64_unsafe:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-IR-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_rsq_f64_unsafe:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
-;
-; VI-GISEL-LABEL: s_rsq_f64_unsafe:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-SDAG-IR-LABEL: s_rsq_f64_unsafe:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-IR-NEXT: s_mov_b32 s0, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s1, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[2:3], v[2:3], s[0:1], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-IR-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-IR-LABEL: s_rsq_f64_unsafe:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v2, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v2
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[2:3], v[2:3], -v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-IR-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-IR-NEXT: ; return to shader part epilog
+;
+; SI-SDAG-CG-LABEL: s_rsq_f64_unsafe:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-CG-LABEL: s_rsq_f64_unsafe:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-CG-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-CG-LABEL: s_rsq_f64_unsafe:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-CG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-CG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-CG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-CG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-CG-LABEL: s_rsq_f64_unsafe:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v0, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-CG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-CG-NEXT: ; return to shader part epilog
%rsq = call contract afn double @llvm.sqrt.f64(double %x)
%result = fdiv contract afn double 1.0, %rsq
%cast = bitcast double %result to <2 x i32>
@@ -4641,145 +5960,213 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) {
}
define double @v_rsq_f64_unsafe(double %x) {
-; SI-SDAG-LABEL: v_rsq_f64_unsafe:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0
-; SI-SDAG-NEXT: s_brev_b32 s5, 8
-; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-IR-LABEL: v_rsq_f64_unsafe:
+; SI-SDAG-IR: ; %bb.0:
+; SI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; SI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; SI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; SI-GISEL-LABEL: v_rsq_f64_unsafe:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-IR-LABEL: v_rsq_f64_unsafe:
+; SI-GISEL-IR: ; %bb.0:
+; SI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; SI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; SI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; SI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; SI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; SI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; SI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_f64_unsafe:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0
-; VI-SDAG-NEXT: s_brev_b32 s5, 8
-; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+; VI-SDAG-IR-LABEL: v_rsq_f64_unsafe:
+; VI-SDAG-IR: ; %bb.0:
+; VI-SDAG-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-IR-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-IR-NEXT: s_mov_b32 s5, 0x3fd80000
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-SDAG-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-SDAG-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-SDAG-IR-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[0:1], s[4:5], 0.5
+; VI-SDAG-IR-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; VI-SDAG-IR-NEXT: s_setpc_b64 s[30:31]
;
-; VI-GISEL-LABEL: v_rsq_f64_unsafe:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-GISEL-IR-LABEL: v_rsq_f64_unsafe:
+; VI-GISEL-IR: ; %bb.0:
+; VI-GISEL-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-IR-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-GISEL-IR-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v4, 0
+; VI-GISEL-IR-NEXT: v_mov_b32_e32 v5, 0x3fd80000
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-GISEL-IR-NEXT: v_mul_f64 v[0:1], v[0:1], -v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
+; VI-GISEL-IR-NEXT: v_mul_f64 v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
+; VI-GISEL-IR-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
+; VI-GISEL-IR-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-SDAG-CG-LABEL: v_rsq_f64_unsafe:
+; SI-SDAG-CG: ; %bb.0:
+; SI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-CG-LABEL: v_rsq_f64_unsafe:
+; SI-GISEL-CG: ; %bb.0:
+; SI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; SI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; SI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-CG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-CG-LABEL: v_rsq_f64_unsafe:
+; VI-SDAG-CG: ; %bb.0:
+; VI-SDAG-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-CG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-CG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-CG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-CG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-CG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-CG-LABEL: v_rsq_f64_unsafe:
+; VI-GISEL-CG: ; %bb.0:
+; VI-GISEL-CG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v2, 0
+; VI-GISEL-CG-NEXT: v_bfrev_b32_e32 v3, 8
+; VI-GISEL-CG-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-CG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-CG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-CG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-CG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-CG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-CG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-CG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-GISEL-CG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-CG-NEXT: s_setpc_b64 s[30:31]
%sqrt = call afn contract double @llvm.sqrt.f64(double %x)
%rsq = fdiv afn contract double 1.0, %sqrt
ret double %rsq
@@ -4828,39 +6215,22 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_rsq_amdgcn_sqrt_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_rsq_amdgcn_sqrt_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-LABEL: v_rsq_amdgcn_sqrt_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x)
%rsq = fdiv contract double 1.0, %sqrt
ret double %rsq
@@ -4909,39 +6279,22 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SDAG-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; VI-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
+; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0
+; VI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x)
%rsq = fdiv contract double -1.0, %sqrt
ret double %rsq
@@ -4992,41 +6345,23 @@ define amdgpu_ps <2 x i32> @s_rsq_amdgcn_sqrt_f64(double inreg %x) {
; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
; SI-GISEL-NEXT: ; return to shader part epilog
;
-; VI-SDAG-LABEL: s_rsq_amdgcn_sqrt_f64:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; VI-SDAG-NEXT: ; return to shader part epilog
-;
-; VI-GISEL-LABEL: s_rsq_amdgcn_sqrt_f64:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
-; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
-; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; VI-GISEL-NEXT: ; return to shader part epilog
+; VI-LABEL: s_rsq_amdgcn_sqrt_f64:
+; VI: ; %bb.0:
+; VI-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; VI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: v_readfirstlane_b32 s1, v1
+; VI-NEXT: ; return to shader part epilog
%rsq = call contract double @llvm.amdgcn.sqrt.f64(double %x)
%result = fdiv contract double 1.0, %rsq
%cast = bitcast double %result to <2 x i32>
@@ -5718,6 +7053,8 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GISEL: {{.*}}
-; SDAG: {{.*}}
+; SI: {{.*}}
+; SI-CG: {{.*}}
+; SI-IR: {{.*}}
+; VI-CG: {{.*}}
+; VI-IR: {{.*}}
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi-def.mir b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi-def.mir
index 6827057..efeb346 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi-def.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi-def.mir
@@ -1,4 +1,3 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -run-pass mir-canonicalizer -mir-vreg-namer-use-stable-hash -verify-machineinstrs %s -o - | FileCheck %s
---
@@ -11,9 +10,9 @@ body: |
; CHECK-LABEL: name: multi_def_renaming
; CHECK: liveins: $vgpr3, $vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %bb0_cb7ce318324a7ba8__1:vreg_64, %bb0_cb7ce318324a7ba8__2:sreg_64 = V_MAD_U64_U32_e64 $vgpr3, $vgpr3, $vgpr4_vgpr5, 0, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit-def %bb0_ac008e8cd8069470__1, implicit-def %bb0_ac008e8cd8069470__2, implicit-def %bb0_ac008e8cd8069470__3
- ; CHECK-NEXT: %bb0_640fe5cc4c57ace5__1:vgpr_32 = COPY %bb0_cb7ce318324a7ba8__2.sub0
+ ; CHECK-NEXT: %bb0_[[V_MAD_DEF_HASH:[[:xdigit:]]+]]__1:vreg_64, %bb0_[[V_MAD_DEF_HASH]]__2:sreg_64 = V_MAD_U64_U32_e64 $vgpr3, $vgpr3, $vgpr4_vgpr5, 0, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def %bb0_[[S_NOP_DEF_HASH:[[:xdigit:]]+]]__1, implicit-def %bb0_[[S_NOP_DEF_HASH]]__2, implicit-def %bb0_[[S_NOP_DEF_HASH]]__3
+ ; CHECK-NEXT: %bb0_[[COPY_DEF_HASH:[[:xdigit:]]+]]__1:vgpr_32 = COPY %bb0_[[V_MAD_DEF_HASH]]__2.sub0
; CHECK-NEXT: S_ENDPGM 0
%0:vreg_64, %1:sreg_64 = V_MAD_U64_U32_e64 $vgpr3, $vgpr3, $vgpr4_vgpr5, 0, implicit $exec
S_NOP 0, implicit-def %2:vreg_256, implicit-def %3:vreg_256, implicit-def %4:vreg_256
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-3.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-3.ll
index a53faf5..27aaba4 100644
--- a/llvm/test/CodeGen/X86/fma-fneg-combine-3.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine-3.ll
@@ -1,52 +1,34 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=FMA4
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=FMA3,AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=FMA3,AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=FMA3
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=FMA3
; PR173172
define void @fnma(ptr %_0, ptr %a, ptr %b, ptr %c) {
; FMA4-LABEL: fnma:
; FMA4: # %bb.0:
-; FMA4-NEXT: vmovups (%rsi), %xmm0
-; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; FMA4-NEXT: vmovups (%rdx), %xmm1
; FMA4-NEXT: vmovups (%rcx), %xmm2
; FMA4-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
; FMA4-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; FMA4-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm1
+; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm1
; FMA4-NEXT: vmovups %ymm0, (%rdi)
; FMA4-NEXT: vzeroupper
; FMA4-NEXT: retq
;
-; AVX2-LABEL: fnma:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX2-NEXT: vxorps (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovups (%rdx), %xmm1
-; AVX2-NEXT: vmovups (%rcx), %xmm2
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm0 * ymm3) + ymm1
-; AVX2-NEXT: vmovups %ymm1, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: fnma:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovups (%rsi), %xmm0
-; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vmovups (%rdx), %xmm1
-; AVX512-NEXT: vmovups (%rcx), %xmm2
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm0 * ymm3) + ymm1
-; AVX512-NEXT: vmovups %ymm1, (%rdi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; FMA3-LABEL: fnma:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; FMA3-NEXT: vmovups (%rdx), %xmm1
+; FMA3-NEXT: vmovups (%rcx), %xmm2
+; FMA3-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
+; FMA3-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; FMA3-NEXT: vfnmadd231ps {{.*#+}} ymm1 = -(ymm0 * ymm3) + ymm1
+; FMA3-NEXT: vmovups %ymm1, (%rdi)
+; FMA3-NEXT: vzeroupper
+; FMA3-NEXT: retq
%i = load <4 x float>, ptr %a, align 4
%i1 = fneg <4 x float> %i
%i2 = shufflevector <4 x float> %i1, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -97,45 +79,27 @@ define void @fnma2(ptr %_0, ptr %a, ptr %b, ptr %c) {
define void @fnma3(ptr %_0, ptr %a, ptr %b, ptr %c) {
; FMA4-LABEL: fnma3:
; FMA4: # %bb.0:
-; FMA4-NEXT: vmovups (%rsi), %xmm0
-; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT: vmovups (%rdx), %xmm1
-; FMA4-NEXT: vmovups (%rcx), %xmm2
-; FMA4-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; FMA4-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; FMA4-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm1
+; FMA4-NEXT: vmovups (%rdx), %xmm0
+; FMA4-NEXT: vmovups (%rcx), %xmm1
+; FMA4-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; FMA4-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; FMA4-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm3) + ymm0
; FMA4-NEXT: vmovups %ymm0, (%rdi)
; FMA4-NEXT: vzeroupper
; FMA4-NEXT: retq
;
-; AVX2-LABEL: fnma3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovups (%rdx), %xmm0
-; AVX2-NEXT: vmovups (%rcx), %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX2-NEXT: vxorps (%rsi), %xmm2, %xmm2
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm2 * ymm3) + ymm0
-; AVX2-NEXT: vmovups %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: fnma3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovups (%rsi), %xmm0
-; AVX512-NEXT: vmovups (%rdx), %xmm1
-; AVX512-NEXT: vmovups (%rcx), %xmm2
-; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm0 * ymm3) + ymm1
-; AVX512-NEXT: vmovups %ymm1, (%rdi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; FMA3-LABEL: fnma3:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovups (%rdx), %xmm0
+; FMA3-NEXT: vmovups (%rcx), %xmm1
+; FMA3-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; FMA3-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; FMA3-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; FMA3-NEXT: vfnmadd231ps {{.*#+}} ymm0 = -(ymm2 * ymm3) + ymm0
+; FMA3-NEXT: vmovups %ymm0, (%rdi)
+; FMA3-NEXT: vzeroupper
+; FMA3-NEXT: retq
%va = load <4 x float>, ptr %a, align 4
%vb = load <4 x float>, ptr %b, align 4
%vc = load <4 x float>, ptr %c, align 4
diff --git a/llvm/test/CodeGen/X86/pr172046.ll b/llvm/test/CodeGen/X86/pr172046.ll
new file mode 100644
index 0000000..850e04b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr172046.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i32 @shl_nuw_zext(i16 zeroext %x) {
+; X86-LABEL: shl_nuw_zext:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $3, %eax
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shl_nuw_zext:
+; X64: # %bb.0:
+; X64-NEXT: shll $3, %edi
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: retq
+ %shl = shl nuw i16 %x, 3
+ %zext = zext i16 %shl to i32
+ ret i32 %zext
+}
+
+define i32 @shl_nsw_zext(i16 %x) {
+; X86-LABEL: shl_nsw_zext:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $256, %eax # imm = 0x100
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shl_nsw_zext:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl $256, %eax # imm = 0x100
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+ %shl = shl nsw i16 256, %x
+ %sext = sext i16 %shl to i32
+ ret i32 %sext
+}
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index a93be22..469b624 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -179,9 +179,9 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X86-LABEL: func4:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shlb $4, %dl
; X86-NEXT: movb %dl, %ch
; X86-NEXT: shlb %cl, %ch
diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index 9768e47..9985ac6 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -156,9 +156,9 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
; X86-LABEL: func4:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shlb $4, %al
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shlb %cl, %dl
diff --git a/llvm/test/Transforms/GlobalMerge/global-merge-comdat.ll b/llvm/test/Transforms/GlobalMerge/global-merge-comdat.ll
new file mode 100644
index 0000000..fed568b
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/global-merge-comdat.ll
@@ -0,0 +1,7 @@
+; RUN: opt -global-merge -global-merge-max-offset=16 -global-merge-group-by-use=false %s -S -o - | FileCheck %s
+; CHECK: @_MergedGlobals = private global <{ i64, i64 }> zeroinitializer, section "__foo", comdat($__foo), align 8
+
+$__foo = comdat nodeduplicate
+
+@__bar = private global i64 0, section "__foo", comdat($__foo), align 8
+@__baz = private global i64 0, section "__foo", comdat($__foo), align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 961662c6..0783a28 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 1abc16d..0cc4d3d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,16 +4,15 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[FMUL:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT: [[SITOFP:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
+; CHECK-NEXT: [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index c1cc3f2..d13a857 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,30 +7,36 @@
define void @main(i1 %arg) {
; CHECK-LABEL: @main(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
; CHECK: cond.true:
; CHECK-NEXT: unreachable
; CHECK: cond.end:
; CHECK-NEXT: br label [[INVOKE_CONT:%.*]]
; CHECK: invoke.cont:
-; CHECK-NEXT: br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT: br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
; CHECK: arrayctor.cont:
; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]]
; CHECK: for.cond36.preheader:
-; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
; CHECK: cond.false51.us:
; CHECK-NEXT: unreachable
; CHECK: cond.true48.us:
-; CHECK-NEXT: br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
; CHECK: cond.false66.us:
-; CHECK-NEXT: store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
-; CHECK-NEXT: store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr undef, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
+; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
; CHECK-NEXT: ret void
; CHECK: cond.true63.us:
; CHECK-NEXT: unreachable
; CHECK: for.body42.lr.ph.us:
-; CHECK-NEXT: br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
; CHECK: _Z5clampd.exit.1:
; CHECK-NEXT: br label [[FOR_COND36_PREHEADER]]
;
@@ -90,7 +96,7 @@ _Z5clampd.exit.1:
define void @test(i1 %arg) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
; CHECK: if.then38:
; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
; CHECK-NEXT: store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index ca65ff8..6d713e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,38 +9,33 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: br label %[[IF_END]]
; CHECK: [[IF_END]]:
-; CHECK-NEXT: [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 91ec61b..6942df5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,7 +25,8 @@ define void @foo(double %i) {
; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
+; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index fd7f0c6..a07e617 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,17 +6,14 @@ define i1 @test(double %circ_radius, ptr %x) {
; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT: [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index a9baede..eb3b183 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,18 +6,17 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
+; CHECK-NEXT: [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index b71dbc4..8c68432 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,11 +8,13 @@ define void @test(ptr %quat, float %call13) {
; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index 6dc9806..f101991 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,7 +4,9 @@
define float @test() {
; CHECK-LABEL: define float @test() {
; CHECK-NEXT: [[LABEL:.*]]:
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
+; CHECK-NEXT: [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -12,12 +14,26 @@ define float @test() {
; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
+; CHECK-NEXT: [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: br label %[[REGION_30:.*]]
; CHECK: [[REGION_30]]:
-; CHECK-NEXT: [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
; CHECK-NEXT: ret float [[TMP27]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index c58c63e..7b29872 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,23 +11,30 @@ define void @test(ptr %this, ptr %0, double %1) {
; CHECK-NEXT: [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
+; CHECK-NEXT: [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
; CHECK-NEXT: [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
; CHECK-NEXT: [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
+; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT: [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = fneg double [[TMP3]]
; CHECK-NEXT: [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
; CHECK-NEXT: [[FNEG381:%.*]] = fneg double [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
+; CHECK-NEXT: [[NEG417:%.*]] = fneg double [[MUL257]]
+; CHECK-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT: [[FNEG418:%.*]] = fneg double [[TMP16]]
+; CHECK-NEXT: [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
; CHECK-NEXT: [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
+; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT: [[FNEG437:%.*]] = fneg double [[TMP17]]
; CHECK-NEXT: [[TMP18:%.*]] = fneg double [[TMP4]]
; CHECK-NEXT: [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
; CHECK-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -35,18 +42,19 @@ define void @test(ptr %this, ptr %0, double %1) {
; CHECK-NEXT: [[FNEG474:%.*]] = fneg double [[TMP20]]
; CHECK-NEXT: [[NEG492:%.*]] = fneg double [[MUL305]]
; CHECK-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
+; CHECK-NEXT: [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
-; CHECK-NEXT: [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
+; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
; CHECK-NEXT: store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index d10d266..2a0e788 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @add0(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0(
@@ -336,12 +336,32 @@ entry:
}
define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @add1f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add1f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -367,9 +387,18 @@ entry:
define void @sub0f(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub0f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -536,9 +565,18 @@ entry:
define void @mulf(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @mulf(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -593,12 +631,32 @@ entry:
}
define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1fn(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @add1fn(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add1fn(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -624,9 +682,18 @@ entry:
define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub0fn(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -713,9 +780,18 @@ entry:
define void @mulfn(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @mulfn(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index b23da5f..125c2dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,12 +52,11 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
; CHECK-SAME: i16 [[INP:%.*]]) {
; CHECK-NEXT: [[ADD:%.*]] = add nsw i16 [[INP]], -10
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
+; CHECK-NEXT: [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
; CHECK-NEXT: ret <2 x float> [[R]]
;
%add = add nsw i16 %inp, -10
diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index c79969d..793d089 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,98 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
define void @exceed(double %0, double %1) {
-; X86-LABEL: @exceed(
-; X86-NEXT: entry:
-; X86-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; X86-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; X86-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; X86-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT: [[IXX0:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX1:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX2:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX3:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX4:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX5:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT: [[IXX10:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX11:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX12:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX13:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX14:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX15:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX20:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX21:%.*]] = fsub double undef, undef
-; X86-NEXT: [[IXX22:%.*]] = fsub double undef, undef
-; X86-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; X86-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; X86-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; X86-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; X86-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; X86-NEXT: [[IXX101:%.*]] = fsub double undef, undef
-; X86-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; X86-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; X86-NEXT: switch i32 undef, label [[BB1:%.*]] [
-; X86-NEXT: i32 0, label [[BB2:%.*]]
-; X86-NEXT: ]
-; X86: bb1:
-; X86-NEXT: br label [[LABEL:%.*]]
-; X86: bb2:
-; X86-NEXT: br label [[LABEL]]
-; X86: label:
-; X86-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; X86-NEXT: ret void
-;
-; AARCH64-LABEL: @exceed(
-; AARCH64-NEXT: entry:
-; AARCH64-NEXT: [[IXX0:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX1:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX2:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX3:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX4:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX5:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX10:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX11:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX12:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX13:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX14:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX15:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX20:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX21:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[IXX22:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; AARCH64-NEXT: [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
-; AARCH64-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; AARCH64-NEXT: [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
-; AARCH64-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; AARCH64-NEXT: [[IX:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT: [[IX1:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
-; AARCH64-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
-; AARCH64-NEXT: [[IXX101:%.*]] = fsub double undef, undef
-; AARCH64-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
-; AARCH64-NEXT: switch i32 undef, label [[BB1:%.*]] [
-; AARCH64-NEXT: i32 0, label [[BB2:%.*]]
-; AARCH64-NEXT: ]
-; AARCH64: bb1:
-; AARCH64-NEXT: br label [[LABEL:%.*]]
-; AARCH64: bb2:
-; AARCH64-NEXT: br label [[LABEL]]
-; AARCH64: label:
-; AARCH64-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
-; AARCH64-NEXT: ret void
+; CHECK-LABEL: @exceed(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX13:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX14:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX15:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB2:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[LABEL:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[LABEL]]
+; CHECK: label:
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; CHECK-NEXT: ret void
;
entry:
%i10 = fdiv fast double %0, %1
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 43994310..32e5969 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,52 +7,56 @@ define i1 @test(float %0, double %1) {
; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; X86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; X86-NEXT: [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; X86-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT: [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; X86-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT: [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; X86-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; X86-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
+; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
; X86-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; X86-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
-; X86-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
-; X86-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
-; X86-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
-; X86-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
-; X86-NEXT: ret i1 [[TMP23]]
+; X86-NEXT: [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; X86-NEXT: [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT: [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT: [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
+; X86-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
+; X86-NEXT: [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
+; X86-NEXT: [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
+; X86-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
+; X86-NEXT: ret i1 [[TMP25]]
;
; AARCH64-LABEL: define i1 @test
; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
; AARCH64-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; AARCH64-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT: [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; AARCH64-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; AARCH64-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
; AARCH64-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
; AARCH64-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
+; AARCH64-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
; AARCH64-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; AARCH64-NEXT: [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT: [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT: [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT: [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
-; AARCH64-NEXT: [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
-; AARCH64-NEXT: [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
-; AARCH64-NEXT: [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
-; AARCH64-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
-; AARCH64-NEXT: ret i1 [[TMP24]]
+; AARCH64-NEXT: [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; AARCH64-NEXT: [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT: [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT: [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
+; AARCH64-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
+; AARCH64-NEXT: [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
+; AARCH64-NEXT: [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
+; AARCH64-NEXT: [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
+; AARCH64-NEXT: ret i1 [[TMP26]]
;
%3 = fpext float %0 to double
%4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index 09e3ef41..eefc99f 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
; X86-LABEL: @test(
; X86-NEXT: entry:
; X86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; X86-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; X86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; X86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; X86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; X86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; X86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; X86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; X86-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; X86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; X86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; X86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
; X86-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
; X86-NEXT: ret <4 x double> [[TMP7]]
;
; AARCH86-LABEL: @test(
; AARCH86-NEXT: entry:
; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
; AARCH86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; AARCH86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; AARCH86-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; AARCH86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; AARCH86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; AARCH86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; AARCH86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; AARCH86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
; AARCH86-NEXT: [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
; AARCH86-NEXT: ret <4 x double> [[I1994]]
;
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index b3de3b8..5373f6c 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -64,7 +64,6 @@ define <4 x float> @fadd_v4f32_mixed_types(<4 x float> %a0) {
ret <4 x float> %post
}
-; Negative test - multiple use of fadd
define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_op(
; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
@@ -101,6 +100,42 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
ret <4 x double> %post
}
+declare void @use_v32i8(<32 x i8>)
+define <32 x i8> @max_expense_multi_use_triggered(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: define <32 x i8> @max_expense_multi_use_triggered(
+; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
+; CHECK-NEXT: call void @use_v32i8(<32 x i8> [[OP]])
+; CHECK-NEXT: [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
+; CHECK-NEXT: ret <32 x i8> [[POST]]
+;
+ %a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %op = add <32 x i8> %a1, %b1
+ call void @use_v32i8(<32 x i8> %op)
+ %post = shufflevector <32 x i8> %op, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <32 x i8> %post
+}
+
+define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[A]], [[TMP1]]
+; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; CHECK-NEXT: ret <4 x double> [[POST]]
+;
+ %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ %op = fadd <4 x double> %a1, %b1
+ %post = shufflevector <4 x double> %op, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ call void @use_v4f64(<4 x double> %a1)
+ ret <4 x double> %post
+}
+
define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: define <4 x i32> @sdiv_v4i32(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
diff --git a/llvm/tools/bugpoint/CMakeLists.txt b/llvm/tools/bugpoint/CMakeLists.txt
index 3c42af1..0329741 100644
--- a/llvm/tools/bugpoint/CMakeLists.txt
+++ b/llvm/tools/bugpoint/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_LINK_COMPONENTS
InstCombine
Instrumentation
Linker
+ Plugins
ObjCARCOpts
ScalarOpts
Support
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index c48703c..7f1510a 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -15,13 +15,13 @@
#include "BugDriver.h"
#include "ToolRunner.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/LegacyPassNameParser.h"
#include "llvm/InitializePasses.h"
#include "llvm/LinkAllIR.h"
#include "llvm/LinkAllPasses.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/AlwaysTrue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
diff --git a/llvm/tools/llc/CMakeLists.txt b/llvm/tools/llc/CMakeLists.txt
index f7a200d..5be7db6 100644
--- a/llvm/tools/llc/CMakeLists.txt
+++ b/llvm/tools/llc/CMakeLists.txt
@@ -8,12 +8,12 @@ set(LLVM_LINK_COMPONENTS
CodeGen
CodeGenTypes
Core
- Extensions
IRPrinter
IRReader
MC
MIRParser
Passes
+ Plugins
Remarks
ScalarOpts
SelectionDAG
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 51c0206..3a23b94 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -41,6 +40,7 @@
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
diff --git a/llvm/tools/llvm-lto2/CMakeLists.txt b/llvm/tools/llvm-lto2/CMakeLists.txt
index 2ddfdfd..60ddce1 100644
--- a/llvm/tools/llvm-lto2/CMakeLists.txt
+++ b/llvm/tools/llvm-lto2/CMakeLists.txt
@@ -6,12 +6,12 @@ set(LLVM_LINK_COMPONENTS
BitReader
CodeGen
Core
- Extensions
Linker
LTO
MC
Object
Passes
+ Plugins
Support
Target
TargetParser
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index ad66410..25f0876 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -18,9 +18,9 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/LTO/LTO.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/Caching.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt
index 6dd74ae1..0c5d356 100644
--- a/llvm/tools/opt/CMakeLists.txt
+++ b/llvm/tools/opt/CMakeLists.txt
@@ -19,6 +19,7 @@ set(LLVM_LINK_COMPONENTS
Instrumentation
MC
ObjCARCOpts
+ Plugins
Remarks
ScalarOpts
Support
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 6c139a6..2f57ae6 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -23,7 +23,6 @@
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/CodeGen/LibcallLoweringInfo.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
@@ -32,6 +31,7 @@
#include "llvm/IRPrinter/IRPrintingPasses.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/ToolOutputFile.h"
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 5cccef7..b0c2733 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/LLVMContext.h"
@@ -40,6 +39,7 @@
#include "llvm/LinkAllPasses.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index 3e411a8..50bf453 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -3,9 +3,9 @@ set(LLVM_LINK_COMPONENTS
AsmParser
CodeGen
Core
- Extensions
Instrumentation
Passes
+ Plugins
Support
TargetParser
TransformUtils
diff --git a/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp b/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
index d96b593..498a772 100644
--- a/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
+++ b/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
@@ -1,8 +1,8 @@
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/unittests/Analysis/InlineOrderPlugin/InlineOrderPlugin.cpp b/llvm/unittests/Analysis/InlineOrderPlugin/InlineOrderPlugin.cpp
index db353d9..6fe54c2 100644
--- a/llvm/unittests/Analysis/InlineOrderPlugin/InlineOrderPlugin.cpp
+++ b/llvm/unittests/Analysis/InlineOrderPlugin/InlineOrderPlugin.cpp
@@ -1,8 +1,8 @@
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
index 9cb4a6a..0218543 100644
--- a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
@@ -1,9 +1,9 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Config/config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Testing/Support/Error.h"
diff --git a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
index ba22170..cb2d23c 100644
--- a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
@@ -1,9 +1,9 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Config/config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Testing/Support/Error.h"
diff --git a/llvm/unittests/AsmParser/AsmParserTest.cpp b/llvm/unittests/AsmParser/AsmParserTest.cpp
index 898a829..f67ed15 100644
--- a/llvm/unittests/AsmParser/AsmParserTest.cpp
+++ b/llvm/unittests/AsmParser/AsmParserTest.cpp
@@ -514,6 +514,8 @@ TEST(AsmParserTest, ParserObjectLocations) {
auto MainLoc = MaybeMainLoc.value();
auto ExpectedMainLoc = FileLocRange(FileLoc{0, 0}, FileLoc{4, 1});
ASSERT_EQ_LOC(MainLoc, ExpectedMainLoc);
+ ASSERT_EQ(ParserContext.getFunctionAtLocation(MainLoc.Start),
+ ParserContext.getFunctionAtLocation(MainLoc));
auto &EntryBB = MainFn->getEntryBlock();
auto MaybeEntryBBLoc = ParserContext.getBlockLocation(&EntryBB);
@@ -521,6 +523,8 @@ TEST(AsmParserTest, ParserObjectLocations) {
auto EntryBBLoc = MaybeEntryBBLoc.value();
auto ExpectedEntryBBLoc = FileLocRange(FileLoc{1, 0}, FileLoc{3, 14});
ASSERT_EQ_LOC(EntryBBLoc, ExpectedEntryBBLoc);
+ ASSERT_EQ(ParserContext.getBlockAtLocation(MaybeEntryBBLoc->Start),
+ ParserContext.getBlockAtLocation(*MaybeEntryBBLoc));
SmallVector<FileLocRange> InstructionLocations = {
FileLocRange(FileLoc{2, 4}, FileLoc{2, 21}),
@@ -531,6 +535,8 @@ TEST(AsmParserTest, ParserObjectLocations) {
ASSERT_TRUE(MaybeMainLoc.has_value());
auto InstLoc = MaybeInstLoc.value();
ASSERT_EQ_LOC(InstLoc, ExpectedLoc);
+ ASSERT_EQ(ParserContext.getInstructionAtLocation(MaybeInstLoc->Start),
+ ParserContext.getInstructionAtLocation(*MaybeInstLoc));
}
}
diff --git a/llvm/unittests/Passes/Plugins/CMakeLists.txt b/llvm/unittests/Passes/Plugins/CMakeLists.txt
index f3c6b90..709b000 100644
--- a/llvm/unittests/Passes/Plugins/CMakeLists.txt
+++ b/llvm/unittests/Passes/Plugins/CMakeLists.txt
@@ -3,7 +3,7 @@
# work with DLLs on Windows (where a shared library can't have undefined
# references), so just skip this testcase on Windows.
if (NOT WIN32 AND NOT CYGWIN)
- set(LLVM_LINK_COMPONENTS Support Extensions Passes Core AsmParser)
+ set(LLVM_LINK_COMPONENTS Support Passes Plugins Core AsmParser)
add_llvm_unittest(PluginsTests
PluginsTest.cpp
diff --git a/llvm/unittests/Passes/Plugins/DoublerPlugin/DoublerPlugin.cpp b/llvm/unittests/Passes/Plugins/DoublerPlugin/DoublerPlugin.cpp
index 42667ff..23e66cf 100644
--- a/llvm/unittests/Passes/Plugins/DoublerPlugin/DoublerPlugin.cpp
+++ b/llvm/unittests/Passes/Plugins/DoublerPlugin/DoublerPlugin.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
using namespace llvm;
diff --git a/llvm/unittests/Passes/Plugins/PluginsTest.cpp b/llvm/unittests/Passes/Plugins/PluginsTest.cpp
index bb11991..d4d519e 100644
--- a/llvm/unittests/Passes/Plugins/PluginsTest.cpp
+++ b/llvm/unittests/Passes/Plugins/PluginsTest.cpp
@@ -9,11 +9,11 @@
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Config/config.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
diff --git a/llvm/unittests/Passes/Plugins/TestPlugin/TestPlugin.cpp b/llvm/unittests/Passes/Plugins/TestPlugin/TestPlugin.cpp
index 15db0c5..5a0bcdb 100644
--- a/llvm/unittests/Passes/Plugins/TestPlugin/TestPlugin.cpp
+++ b/llvm/unittests/Passes/Plugins/TestPlugin/TestPlugin.cpp
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "../TestPlugin.h"
diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
index d0e28e9..7427876 100644
--- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
@@ -18,6 +18,7 @@ static_library("CodeGen") {
"//llvm/lib/Bitcode/Reader",
"//llvm/lib/CodeGen",
"//llvm/lib/Demangle",
+ "//llvm/lib/Extensions",
"//llvm/lib/Frontend/Driver",
"//llvm/lib/Frontend/HLSL",
"//llvm/lib/Frontend/Offloading",
diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
index cdf39d6..2af873e 100644
--- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Frontend") {
"//clang/lib/Serialization",
"//llvm/include/llvm/Config:llvm-config",
"//llvm/lib/Bitcode/Reader",
+ "//llvm/lib/Extensions",
"//llvm/lib/Option",
"//llvm/lib/ProfileData",
"//llvm/lib/Support",
diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn
index 5c58903..432bfa8 100644
--- a/llvm/utils/gn/secondary/clang/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn
@@ -62,6 +62,7 @@ write_lit_config("lit_site_cfg") {
"ENABLE_SHARED=0",
"LLVM_EXTERNAL_LIT=",
"LLVM_HOST_TRIPLE=$llvm_current_triple",
+ "LLVM_INCLUDE_EXAMPLES=0",
"LLVM_INCLUDE_SPIRV_TOOLS_TESTS=0",
"LLVM_LIT_TOOLS_DIR=", # Intentionally empty, matches cmake build.
"LLVM_TOOL_LLVM_DRIVER_BUILD=0", # FIXME: Add actual support for this.
diff --git a/llvm/utils/gn/secondary/clang/tools/clang-linker-wrapper/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-linker-wrapper/BUILD.gn
index 6fb51e5..045c060 100644
--- a/llvm/utils/gn/secondary/clang/tools/clang-linker-wrapper/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/clang-linker-wrapper/BUILD.gn
@@ -14,6 +14,7 @@ executable("clang-linker-wrapper") {
"//llvm/lib/BinaryFormat",
"//llvm/lib/Bitcode/Writer",
"//llvm/lib/CodeGen",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/IRReader",
"//llvm/lib/LTO",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Extensions/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Extensions/BUILD.gn
index e580187..82fb722 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Extensions/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Extensions/BUILD.gn
@@ -1,5 +1,8 @@
static_library("Extensions") {
output_name = "LLVMExtensions"
- sources = [ "Extensions.cpp" ]
+ sources = [
+ "Extensions.cpp",
+ "PassPlugin.cpp",
+ ]
deps = [ "//llvm/lib/Support" ]
}
diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
index 4230c55..d75a0d1 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
@@ -26,7 +26,6 @@ static_library("Passes") {
"PassBuilder.cpp",
"PassBuilderBindings.cpp",
"PassBuilderPipelines.cpp",
- "PassPlugin.cpp",
"StandardInstrumentations.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/llvm/tools/bugpoint/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/bugpoint/BUILD.gn
index e6a5d8e..7eac98e 100644
--- a/llvm/utils/gn/secondary/llvm/tools/bugpoint/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/bugpoint/BUILD.gn
@@ -5,6 +5,7 @@ executable("bugpoint") {
"//llvm/lib/Analysis",
"//llvm/lib/Bitcode/Writer",
"//llvm/lib/CodeGen",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/IRReader",
"//llvm/lib/Linker",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
index 8756ee5..5113cf4 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
@@ -5,6 +5,7 @@ executable("llc") {
"//llvm/lib/CodeGen/AsmPrinter",
"//llvm/lib/CodeGen/MIRParser",
"//llvm/lib/CodeGen/SelectionDAG",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/IRPrinter",
"//llvm/lib/IRReader",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-lto2/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-lto2/BUILD.gn
index c0a14d7..652971c 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-lto2/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-lto2/BUILD.gn
@@ -1,6 +1,7 @@
executable("llvm-lto2") {
deps = [
"//llvm/lib/Bitcode/Reader",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/LTO",
"//llvm/lib/Linker",
diff --git a/llvm/utils/gn/secondary/llvm/tools/opt/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/opt/BUILD.gn
index b044c92..f19e4ed 100644
--- a/llvm/utils/gn/secondary/llvm/tools/opt/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/opt/BUILD.gn
@@ -5,6 +5,7 @@ static_library("lib") {
"//llvm/lib/Analysis",
"//llvm/lib/Bitcode/Writer",
"//llvm/lib/CodeGen",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/IRPrinter",
"//llvm/lib/MC",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
index 82411b8..b92e707 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
@@ -4,6 +4,7 @@ unittest("AnalysisTests") {
deps = [
"//llvm/lib/Analysis",
"//llvm/lib/AsmParser",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/Passes",
"//llvm/lib/Support",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Passes/Plugins/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Passes/Plugins/BUILD.gn
index c6567e1..62af4b4 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Passes/Plugins/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Passes/Plugins/BUILD.gn
@@ -2,6 +2,7 @@ import("//third-party/unittest/unittest.gni")
unittest("PluginsTests") {
deps = [
"//llvm/include/llvm/Config:config",
+ "//llvm/lib/Extensions",
"//llvm/lib/IR",
"//llvm/lib/Passes",
"//llvm/lib/Support",
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index e364570..efce718 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -70,11 +70,9 @@ struct ProgramPoint : public StorageUniquer::BaseStorage {
ProgramPoint() {}
/// Create a new program point from the given program point.
- ProgramPoint(const ProgramPoint &point) {
- this->block = point.getBlock();
- this->point = point.getPoint();
- this->op = point.getOperation();
- }
+ ProgramPoint(const ProgramPoint &point)
+ : block(point.getBlock()), point(point.getPoint()),
+ op(point.getOperation()) {}
static ProgramPoint *construct(StorageUniquer::StorageAllocator &alloc,
KeyTy &&key) {
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 620cc97..eb321bb 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -3675,7 +3675,8 @@ OpFoldResult ViewOp::fold(FoldAdaptor adaptor) {
MemRefType sourceMemrefType = getSource().getType();
MemRefType resultMemrefType = getResult().getType();
- if (resultMemrefType == sourceMemrefType && resultMemrefType.hasStaticShape())
+ if (resultMemrefType == sourceMemrefType &&
+ resultMemrefType.hasStaticShape() && isZeroInteger(getByteShift()))
return getViewSource();
return {};
@@ -3684,7 +3685,7 @@ OpFoldResult ViewOp::fold(FoldAdaptor adaptor) {
namespace {
struct ViewOpShapeFolder : public OpRewritePattern<ViewOp> {
- using OpRewritePattern<ViewOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(ViewOp viewOp,
PatternRewriter &rewriter) const override {
@@ -3751,26 +3752,22 @@ struct ViewOpShapeFolder : public OpRewritePattern<ViewOp> {
}
};
+/// view(memref.cast(%source)) -> view(%source).
struct ViewOpMemrefCastFolder : public OpRewritePattern<ViewOp> {
- using OpRewritePattern<ViewOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(ViewOp viewOp,
PatternRewriter &rewriter) const override {
- Value memrefOperand = viewOp.getOperand(0);
- CastOp memrefCastOp = memrefOperand.getDefiningOp<CastOp>();
+ auto memrefCastOp = viewOp.getSource().getDefiningOp<CastOp>();
if (!memrefCastOp)
return failure();
- Value allocOperand = memrefCastOp.getOperand();
- AllocOp allocOp = allocOperand.getDefiningOp<AllocOp>();
- if (!allocOp)
- return failure();
- rewriter.replaceOpWithNewOp<ViewOp>(viewOp, viewOp.getType(), allocOperand,
- viewOp.getByteShift(),
- viewOp.getSizes());
+
+ rewriter.replaceOpWithNewOp<ViewOp>(
+ viewOp, viewOp.getType(), memrefCastOp.getSource(),
+ viewOp.getByteShift(), viewOp.getSizes());
return success();
}
};
-
} // namespace
void ViewOp::getCanonicalizationPatterns(RewritePatternSet &results,
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index 6031130..7b4dea6 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -1336,21 +1336,52 @@ func.func @fold_assume_alignment_chain(%0: memref<128xf32>) -> memref<128xf32> {
// -----
+// CHECK-LABEL: func @fold_view_cast
+// CHECK-SAME: (%[[ARG:.*]]: memref<128xi8>)
+func.func @fold_view_cast(%0: memref<128xi8>) -> memref<i32> {
+ %c0 = arith.constant 0 : index
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[RES:.*]] = memref.view %[[ARG]][%[[C0]]][] : memref<128xi8> to memref<i32>
+ // CHECK: return %[[RES]]
+ %1 = memref.cast %0 : memref<128xi8> to memref<?xi8>
+ %res = memref.view %1[%c0][] : memref<?xi8> to memref<i32>
+ return %res : memref<i32>
+}
+
+// -----
+
// CHECK-LABEL: func @fold_view_same_source_result_types
+// CHECK-SAME: (%[[ARG:.*]]: memref<128xi8>)
func.func @fold_view_same_source_result_types(%0: memref<128xi8>) -> memref<128xi8> {
- %c0 = arith.constant 0: index
+ %c0 = arith.constant 0 : index
// CHECK-NOT: memref.view
+ // CHECK: return %[[ARG]]
%res = memref.view %0[%c0][] : memref<128xi8> to memref<128xi8>
return %res : memref<128xi8>
}
// -----
-// CHECK-LABEL: func @non_fold_view_same_source_res_types
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]
-func.func @non_fold_view_same_source_res_types(%0: memref<?xi8>, %arg0 : index) -> memref<?xi8> {
+// CHECK-LABEL: func @non_fold_view_non_zero_offset
+// CHECK-SAME: (%[[ARG:.*]]: memref<128xi8>)
+func.func @non_fold_view_non_zero_offset(%0: memref<128xi8>) -> memref<128xi8> {
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: %[[RES:.*]] = memref.view %[[ARG]][%[[C1]]][] : memref<128xi8> to memref<128xi8>
+ // CHECK: return %[[RES]]
+ %res = memref.view %0[%c1][] : memref<128xi8> to memref<128xi8>
+ return %res : memref<128xi8>
+}
+
+// -----
+
+// CHECK-LABEL: func @non_fold_view_same_source_dynamic_size
+// CHECK-SAME: (%[[ARG:.*]]: memref<?xi8>, %[[SIZE:.*]]: index)
+func.func @non_fold_view_same_source_dynamic_size(%0: memref<?xi8>, %arg0 : index) -> memref<?xi8> {
%c0 = arith.constant 0: index
- // CHECK: memref.view
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[RES:.*]] = memref.view %[[ARG]][%[[C0]]][%[[SIZE]]] : memref<?xi8> to memref<?xi8>
+ // CHECK: return %[[RES]]
%res = memref.view %0[%c0][%arg0] : memref<?xi8> to memref<?xi8>
return %res : memref<?xi8>
}
diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
index efffcaa..5b40946 100644
--- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
@@ -2,7 +2,7 @@
// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s
-// CHECK-LABEL: Generated by LLVM NVPTX Back-End
+// CHECK-LABEL: {{Generated by (LLVM NVPTX Back-End|NVIDIA NVVM Compiler)}}
// CHECK: .visible .func kernel_a()
// CHECK: ret;
gpu.module @bar {
@@ -12,7 +12,7 @@ gpu.module @bar {
}
}
-// CHECK-LABEL: Generated by LLVM NVPTX Back-End
+// CHECK-LABEL: {{Generated by (LLVM NVPTX Back-End|NVIDIA NVVM Compiler)}}
// CHECK: .visible .func ({{.+}}) fma(
// CHECK: fma.rn.f32
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index a1e2729..6ba9c163 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -19,10 +19,10 @@
// Basic PTX check to make sure we are generating the right instructions.
// CHECK-PTX: mbarrier.init.shared.b64
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
+// CHECK-PTX-DAG: mbarrier.arrive.expect_tx.shared.b64
+// CHECK-PTX-DAG: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
+// CHECK-PTX-DAG: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
+// CHECK-PTX-DAG: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: mbarrier.try_wait.parity.shared.b64
// RUN: mlir-opt %s \
diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt
index 5a99d26..41c3b97 100644
--- a/polly/lib/CMakeLists.txt
+++ b/polly/lib/CMakeLists.txt
@@ -23,8 +23,8 @@ set(POLLY_COMPONENTS
Analysis
ipo
MC
- Extensions
Passes
+ Plugins
Linker
IRReader
Analysis
diff --git a/polly/lib/Plugin/Polly.cpp b/polly/lib/Plugin/Polly.cpp
index 6be19ae..8dadb7d 100644
--- a/polly/lib/Plugin/Polly.cpp
+++ b/polly/lib/Plugin/Polly.cpp
@@ -9,8 +9,8 @@
//===----------------------------------------------------------------------===//
#include "polly/RegisterPasses.h"
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/PassRegistry.h"
+#include "llvm/Plugins/PassPlugin.h"
// Pass Plugin Entrypoints
diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index edd8c1c..ca96fee 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -42,11 +42,11 @@
#include "polly/Support/DumpModulePass.h"
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING
-#include "llvm/Extensions/PassPlugin.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/TargetSelect.h"