author    Matt Arsenault <Matthew.Arsenault@amd.com>  2022-11-20 08:40:25 -0800
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2023-07-25 07:54:11 -0400
commit    e3fd8f83a801b1918508c7c0a71cc31bc95ad4d2 (patch)
tree      aa3422ab947d7251f1720ceb1a68e651a054cdb9
parent    47b3ada432f8afee9723a4b3d27b3efbef34dedf (diff)
AMDGPU: Correctly expand f64 sqrt intrinsic
rocm-device-libs and llpc were avoiding using f64 sqrt intrinsics in favor
of their own expansions. Port the expansion into the backend. Both of these
users should be updated to call the intrinsic instead.

The library and llpc expansions are slightly different. llpc uses an ldexp
to do the scale; the library uses a multiply.

Use ldexp to do the scale instead of the multiply. I believe v_ldexp_f64
and v_mul_f64 are always the same number of cycles, but it's cheaper to
materialize the 32-bit integer constant than the 64-bit double constant.

The libraries have another fast version of sqrt which will be handled
separately. I am tempted to do this in an IR expansion instead. In the IR
we could take advantage of computeKnownFPClass to avoid the 0-or-inf
argument check.
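For intuition on that tradeoff, a minimal host-side sketch of the two
scaling styles (the constants mirror the patch; the helper names are
illustrative, not from the source):

  #include <cmath>

  // ldexp-style scale: only the 32-bit integer constant 256 to materialize.
  double scale_ldexp(double x) { return std::ldexp(x, 256); }

  // multiply-style scale: needs the 64-bit double constant 2^256.
  double scale_mul(double x) { return x * 0x1.0p+256; }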
 llvm/docs/AMDGPUUsage.rst                               |    5
 llvm/docs/ReleaseNotes.rst                              |    3
 llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h |    7
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp          |   96
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h            |    3
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp               |   88
 llvm/lib/Target/AMDGPU/SIISelLowering.h                 |    1
 llvm/lib/Target/AMDGPU/VOP1Instructions.td              |    2
 llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll         |   16
 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir  |  279
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll                   | 1837
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll             |   45
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll                     | 4431
 13 files changed, 5932 insertions(+), 881 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index bec7b2f..d90c83f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -965,6 +965,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
========================================= ==========================================================
LLVM Intrinsic Description
========================================= ==========================================================
+ llvm.amdgcn.sqrt Provides direct access to v_sqrt_f64, v_sqrt_f32 and v_sqrt_f16
+ (on targets with half support). Performs sqrt function.
+
llvm.amdgcn.log Provides direct access to v_log_f32 and v_log_f16
(on targets with half support). Performs log2 function.
@@ -980,6 +983,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
inputs. Backend will optimize out denormal scaling if
marked with the :ref:`afn <fastmath_afn>` flag.
+ :ref:`llvm.sqrt <int_sqrt>` Implemented for double, float and half (and vectors).
+
:ref:`llvm.log <int_log>` Implemented for float and half (and vectors).
:ref:`llvm.exp <int_exp>` Implemented for float and half (and vectors).
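As a usage illustration of the two entry points documented above (a
sketch: the IRBuilder calls are standard LLVM API, but the wrapper
function is hypothetical):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/IntrinsicsAMDGPU.h"

  using namespace llvm;

  // Hypothetical helper: emit both flavors of sqrt for a value X.
  static void emitSqrts(IRBuilder<> &B, Value *X) {
    // llvm.sqrt: now lowered correctly by the backend.
    Value *Precise = B.CreateUnaryIntrinsic(Intrinsic::sqrt, X);
    // llvm.amdgcn.sqrt: raw v_sqrt_* access, no correction sequence.
    Value *Raw = B.CreateIntrinsic(Intrinsic::amdgcn_sqrt,
                                   {X->getType()}, {X});
    (void)Precise;
    (void)Raw;
  }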
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index a04cbd2..a1ca57e 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -173,6 +173,9 @@ Changes to the AMDGPU Backend
* Implemented new 2ulp IEEE lowering strategy for float
reciprocal. This is used by default for OpenCL on gfx9+.
+* `llvm.sqrt.f64` is now lowered correctly. Use `llvm.amdgcn.sqrt.f64`
+ for raw instruction access.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index a1ff764..5341b57 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1181,6 +1181,13 @@ public:
const SrcOp &Op0, const SrcOp &Op1,
std::optional<unsigned> Flags = std::nullopt);
+ /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
+ MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
+ unsigned Mask) {
+ return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res},
+ {Src, SrcOp(static_cast<int64_t>(Mask))});
+ }
+
/// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
///
/// \pre setBasicBlock or setMI must have been called.
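A sketch of how the new helper gets used (it mirrors legalizeFSQRT below;
the wrapper function is hypothetical, and the mask constants come from
llvm/ADT/FloatingPointMode.h):

  // Classify Src as +/-0 or +inf, yielding an s1 value.
  static Register buildIsZeroOrPosInf(MachineIRBuilder &B, Register Src) {
    return B.buildIsFPClass(LLT::scalar(1), Src, fcZero | fcPosInf).getReg(0);
  }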
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 923a549..120c00b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -907,7 +907,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S16, S64);
if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S16})
+ .customFor({S64})
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+ getActionDefinitionsBuilder(G_FFLOOR)
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
@@ -925,7 +930,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S32, S64})
+ .legalFor({S32})
+ .customFor({S64})
.scalarize(0)
.clampScalar(0, S32, S64);
@@ -1996,6 +2002,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFDIV(MI, MRI, B);
case TargetOpcode::G_FFREXP:
return legalizeFFREXP(MI, MRI, B);
+ case TargetOpcode::G_FSQRT:
+ return legalizeFSQRT(MI, MRI, B);
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
@@ -4829,6 +4837,90 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // For double type, the SQRT and RSQ instructions don't have the required
+ // precision, so we apply Goldschmidt's algorithm to improve the result:
+ //
+ // y0 = rsq(x)
+ // g0 = x * y0
+ // h0 = 0.5 * y0
+ //
+ // r0 = 0.5 - h0 * g0
+ // g1 = g0 * r0 + g0
+ // h1 = h0 * r0 + h0
+ //
+ // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+ // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
+ // h2 = h1 * r1 + h1
+ //
+ // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+ // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
+ //
+ // sqrt(x) = g3
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ const LLT F64 = LLT::scalar(64);
+
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
+
+ Register X = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+
+ auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
+
+ auto ZeroInt = B.buildConstant(S32, 0);
+ auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+
+ // Scale up input if it is too small.
+ auto ScaleUpFactor = B.buildConstant(S32, 256);
+ auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+ auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
+
+ auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
+ .addReg(SqrtX.getReg(0));
+
+ auto Half = B.buildFConstant(F64, 0.5);
+ auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
+ auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
+
+ auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
+ auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
+
+ auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
+ auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
+
+ auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
+ auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
+
+ auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
+
+ auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
+ auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
+
+ auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
+
+ // Scale down the result.
+ auto ScaleDownFactor = B.buildConstant(S32, -128);
+ auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+ SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
+
+ // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+ // with finite only or nsz because rsq(+/-0) = +/-inf
+
+ // TODO: Check for DAZ and expand to subnormals
+ auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+
+ // If x is +INF, +0, or -0, use its original value
+ B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
+
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
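The scale factors follow from exponent arithmetic: sqrt(2^256 * x) =
2^128 * sqrt(x), so an input scaled up with ldexp(x, 256) needs its result
scaled back down with ldexp(r, -128). For intuition, a host-side C++ model
of the expansion above (a sketch only: 1.0/std::sqrt stands in for the
approximate v_rsq_f64 seed, and std::fma for v_fma_f64):

  #include <cmath>

  double goldschmidt_sqrt(double x) {
    bool scale = x < 0x1.0p-767;        // input too small for rsq?
    double sx = std::ldexp(x, scale ? 256 : 0);

    double y0 = 1.0 / std::sqrt(sx);    // stand-in for the rsq seed
    double g = sx * y0;                 // g0 ~= sqrt(sx)
    double h = 0.5 * y0;                // h0 ~= 1/(2*sqrt(sx))

    double r = std::fma(-h, g, 0.5);    // r0 = 0.5 - h0*g0
    g = std::fma(g, r, g);              // g1
    h = std::fma(h, r, h);              // h1

    double d = std::fma(-g, g, sx);     // d0 = sx - g1*g1
    g = std::fma(d, h, g);              // g2 = d0*h1 + g1

    d = std::fma(-g, g, sx);            // d1 = sx - g2*g2
    g = std::fma(d, h, g);              // g3 = d1*h1 + g2

    double res = std::ldexp(g, scale ? -128 : 0);

    // rsq(+/-0) = +/-inf, so +/-0 and +inf must bypass the expansion.
    return (sx == 0.0 || sx == INFINITY) ? sx : res;
  }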
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1a91be1..04773f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -157,6 +157,9 @@ public:
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 97de41c..3148f49 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -219,6 +219,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
+ setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+
setOperationAction(ISD::SELECT_CC,
{MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
@@ -4924,7 +4926,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
"Load should return a value and a chain");
return Result;
}
-
+ case ISD::FSQRT:
+ if (Op.getValueType() == MVT::f64)
+ return lowerFSQRTF64(Op, DAG);
+ return SDValue();
case ISD::FSIN:
case ISD::FCOS:
return LowerTrig(Op, DAG);
@@ -9749,6 +9754,87 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
+ // For double type, the SQRT and RSQ instructions don't have the required
+ // precision, so we apply Goldschmidt's algorithm to improve the result:
+ //
+ // y0 = rsq(x)
+ // g0 = x * y0
+ // h0 = 0.5 * y0
+ //
+ // r0 = 0.5 - h0 * g0
+ // g1 = g0 * r0 + g0
+ // h1 = h0 * r0 + h0
+ //
+ // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+ // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
+ // h2 = h1 * r1 + h1
+ //
+ // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+ // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
+ //
+ // sqrt(x) = g3
+
+ SDNodeFlags Flags = Op->getFlags();
+
+ SDLoc DL(Op);
+
+ SDValue X = Op.getOperand(0);
+ SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
+
+ SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
+
+ SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
+
+ // Scale up input if it is too small.
+ SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
+ SDValue ScaleUp =
+ DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
+ SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
+
+ SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
+
+ SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
+
+ SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
+ SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
+
+ SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
+ SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
+
+ SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
+
+ SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
+
+ SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
+ SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
+
+ SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
+
+ SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
+ SDValue SqrtD1 =
+ DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
+
+ SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
+
+ SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
+ SDValue ScaleDown =
+ DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
+ SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
+
+ // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+ // with finite only or nsz because rsq(+/-0) = +/-inf
+
+ // TODO: Check for DAZ and expand to subnormals
+ SDValue IsZeroOrInf =
+ DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+ DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
+ // If x is +INF, +0, or -0, use its original value
+ return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
+ Flags);
+}
+
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
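The class mask written as fcZero | fcPosInf shows up downstream as the
immediate 608 in the MIR checks and as 0x260 in the v_cmp_class_f64
operand. A quick consistency check (bit values copied from
llvm/ADT/FloatingPointMode.h):

  enum FPClassTest : unsigned {
    fcNegZero = 0x0020,
    fcPosZero = 0x0040,
    fcPosInf  = 0x0200,
    fcZero    = fcNegZero | fcPosZero,  // 0x0060
  };

  static_assert((fcZero | fcPosInf) == 0x260, "608 decimal");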
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4e62a05..1745c0b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -109,6 +109,7 @@ private:
SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index ec38d22..1a8efc6 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -332,7 +332,7 @@ defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans64] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
} // End TRANS = 1, SchedRW = [WriteTrans64]
let TRANS = 1, SchedRW = [WriteTrans32] in {
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
index a601768..f493cc5 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
@@ -52,10 +52,10 @@ define i32 @fsqrt(i32 %arg) {
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
;
; ALL-SIZE-LABEL: 'fsqrt'
@@ -63,10 +63,10 @@ define i32 @fsqrt(i32 %arg) {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%F32 = call float @llvm.sqrt.f32(float undef)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
index 145f11d..7f97419 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
@@ -1,9 +1,9 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=SI,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefixes=VI,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN %s
---
name: test_fsqrt_s32
@@ -11,24 +11,12 @@ body: |
bb.0:
liveins: $vgpr0
- ; SI-LABEL: name: test_fsqrt_s32
- ; SI: liveins: $vgpr0
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
- ; SI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
- ; VI-LABEL: name: test_fsqrt_s32
- ; VI: liveins: $vgpr0
- ; VI-NEXT: {{ $}}
- ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
- ; VI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
- ; GFX9-LABEL: name: test_fsqrt_s32
- ; GFX9: liveins: $vgpr0
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
- ; GFX9-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
+ ; GCN-LABEL: name: test_fsqrt_s32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
+ ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = G_FSQRT %0
$vgpr0 = COPY %1
@@ -40,29 +28,83 @@ body: |
bb.0:
liveins: $vgpr0
- ; SI-LABEL: name: test_fsqrt_s64
- ; SI: liveins: $vgpr0
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
- ; SI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
- ; VI-LABEL: name: test_fsqrt_s64
- ; VI: liveins: $vgpr0
- ; VI-NEXT: {{ $}}
- ; VI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
- ; VI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
- ; GFX9-LABEL: name: test_fsqrt_s64
- ; GFX9: liveins: $vgpr0
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
+ ; GCN-LABEL: name: test_fsqrt_s64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+ ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+ ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+ ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[COPY]], [[SELECT]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+ ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+ ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+ ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+ ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+ ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+ ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+ ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+ ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+ ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+ ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+ ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+ ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+ ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+ ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+ ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_FSQRT %0
$vgpr0_vgpr1 = COPY %1
...
+
+---
+name: test_fsqrt_s64_ninf
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: test_fsqrt_s64_ninf
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+ ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+ ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+ ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[COPY]], [[SELECT]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+ ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+ ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+ ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+ ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+ ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+ ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+ ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+ ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+ ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+ ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+ ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+ ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+ ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+ ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+ ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = ninf G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = ninf G_FSQRT %0
+ $vgpr0_vgpr1 = COPY %1
+
+...
---
name: test_fsqrt_s16
body: |
@@ -108,33 +150,15 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1
- ; SI-LABEL: name: test_fsqrt_v2s32
- ; SI: liveins: $vgpr0_vgpr1
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
- ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
- ; VI-LABEL: name: test_fsqrt_v2s32
- ; VI: liveins: $vgpr0_vgpr1
- ; VI-NEXT: {{ $}}
- ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
- ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
- ; GFX9-LABEL: name: test_fsqrt_v2s32
- ; GFX9: liveins: $vgpr0_vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
- ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; GCN-LABEL: name: test_fsqrt_v2s32
+ ; GCN: liveins: $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+ ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = G_FSQRT %0
$vgpr0_vgpr1 = COPY %1
@@ -146,36 +170,16 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2
- ; SI-LABEL: name: test_fsqrt_v3s32
- ; SI: liveins: $vgpr0_vgpr1_vgpr2
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
- ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
- ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
- ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
- ; VI-LABEL: name: test_fsqrt_v3s32
- ; VI: liveins: $vgpr0_vgpr1_vgpr2
- ; VI-NEXT: {{ $}}
- ; VI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
- ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
- ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; VI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
- ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
- ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
- ; GFX9-LABEL: name: test_fsqrt_v3s32
- ; GFX9: liveins: $vgpr0_vgpr1_vgpr2
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
- ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
- ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
- ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
- ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ; GCN-LABEL: name: test_fsqrt_v3s32
+ ; GCN: liveins: $vgpr0_vgpr1_vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+ ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+ ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = G_FSQRT %0
$vgpr0_vgpr1_vgpr2 = COPY %1
@@ -187,33 +191,58 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; SI-LABEL: name: test_fsqrt_v2s64
- ; SI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; SI-NEXT: {{ $}}
- ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; SI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
- ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
- ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; VI-LABEL: name: test_fsqrt_v2s64
- ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; VI-NEXT: {{ $}}
- ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; VI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
- ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
- ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
- ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
- ; GFX9-LABEL: name: test_fsqrt_v2s64
- ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
- ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
- ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ; GCN-LABEL: name: test_fsqrt_v2s64
+ ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+ ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s64), [[C]]
+ ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+ ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UV]], [[SELECT]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+ ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+ ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+ ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+ ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+ ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+ ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+ ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+ ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+ ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+ ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+ ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+ ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+ ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+ ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+ ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+ ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+ ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s64), [[C]]
+ ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP2:%[0-9]+]]:_(s64) = G_FLDEXP [[UV1]], [[SELECT3]](s32)
+ ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP2]](s64)
+ ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[INT1]], [[C3]]
+ ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP2]], [[INT1]]
+ ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s64) = G_FNEG [[FMUL2]]
+ ; GCN-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG3]], [[FMUL3]], [[C3]]
+ ; GCN-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMUL3]], [[FMA7]], [[FMUL3]]
+ ; GCN-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMUL2]], [[FMA7]], [[FMUL2]]
+ ; GCN-NEXT: [[FNEG4:%[0-9]+]]:_(s64) = G_FNEG [[FMA8]]
+ ; GCN-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG4]], [[FMA8]], [[FLDEXP2]]
+ ; GCN-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMA8]]
+ ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s64) = G_FNEG [[FMA11]]
+ ; GCN-NEXT: [[FMA12:%[0-9]+]]:_(s64) = G_FMA [[FNEG5]], [[FMA11]], [[FLDEXP2]]
+ ; GCN-NEXT: [[FMA13:%[0-9]+]]:_(s64) = G_FMA [[FMA12]], [[FMA9]], [[FMA11]]
+ ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C1]]
+ ; GCN-NEXT: [[FLDEXP3:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA13]], [[SELECT4]](s32)
+ ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP2]](s64), 608
+ ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS1]](s1), [[FLDEXP2]], [[FLDEXP3]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT2]](s64), [[SELECT5]](s64)
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s64>) = G_FSQRT %0
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
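Note the G_FCONSTANT printed above as raw bits 0x1000000000000000 is the
legalizer's 0x1.0p-767 threshold: sign 0, biased exponent 0x100 = 256,
mantissa 0, i.e. 2^(256-1023). A one-line check (C++20 for the constexpr
bit_cast):

  #include <bit>
  #include <cstdint>

  static_assert(std::bit_cast<double>(UINT64_C(0x1000000000000000)) ==
                0x1.0p-767);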
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 6239393..8bb8f6c 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,48 +1,248 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
define double @v_sqrt_f64(double %x) {
-; GCN-LABEL: v_sqrt_f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_fneg(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 9
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%x.neg = fneg double %x
%result = call double @llvm.sqrt.f64(double %x.neg)
ret double %result
}
define double @v_sqrt_f64_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call double @llvm.fabs.f64(double %x)
%result = call double @llvm.sqrt.f64(double %x.fabs)
ret double %result
}
define double @v_sqrt_f64_fneg_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg_fabs:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]|
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 9
+; SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call double @llvm.fabs.f64(double %x)
%x.fabs.neg = fneg double %x.fabs
%result = call double @llvm.sqrt.f64(double %x.fabs.neg)
@@ -50,42 +250,245 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
}
define double @v_sqrt_f64_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
-; GCN-LABEL: v_sqrt_f64_no_infs_attribute:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan double @llvm.sqrt.f64(double %x)
ret double %result
}

define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, 0
+; GISEL-NEXT: s_brev_b32 s3, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GISEL-NEXT: ; return to shader part epilog
%result = call double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -98,12 +501,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
}

define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, 0
+; GISEL-NEXT: s_brev_b32 s3, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GISEL-NEXT: ; return to shader part epilog
%result = call ninf double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -116,12 +572,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
}

define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, 0
+; GISEL-NEXT: s_brev_b32 s3, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GISEL-NEXT: ; return to shader part epilog
%result = call afn double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -134,12 +643,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
}

define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, 0
+; GISEL-NEXT: s_brev_b32 s3, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GISEL-NEXT: ; return to shader part epilog
%result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -152,167 +714,1147 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
}

define double @v_sqrt_f64_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nsz:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nsz:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan ninf double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan ninf nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64_afn(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64_afn_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nsz:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nsz:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}

define double @v_sqrt_f64_afn_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%fabs = call double @llvm.fabs.f64(double %x)
%result = call afn ninf double @llvm.sqrt.f64(double %fabs)
ret double %result
}

define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
ret double %result
}

define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}

define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
-; GCN-LABEL: v_sqrt_f64__approx_func_fp_math:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
-; GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
-; GCN-LABEL: v_sqrt_f64__unsafe_attr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}

define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0
+; SDAG-NEXT: s_brev_b32 s5, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: s_brev_b32 s5, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}
define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
-; GCN-LABEL: v_sqrt_v3f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: v_sqrt_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v3f64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s6, 0
+; SDAG-NEXT: s_brev_b32 s7, 8
+; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
+; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5]
+; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7]
+; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9]
+; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
+; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11]
+; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5
+; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5
+; SDAG-NEXT: v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5
+; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SDAG-NEXT: v_mov_b32_e32 v13, 0x260
+; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14
+; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15
+; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v3f64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s6, 0
+; GISEL-NEXT: s_brev_b32 s7, 8
+; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; GISEL-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[6:7]
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v7
+; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
+; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5]
+; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5
+; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7]
+; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5
+; GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[8:9]
+; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5
+; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
+; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
+; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
+; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; GISEL-NEXT: v_mov_b32_e32 v13, 0x260
+; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
+; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14
+; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15
+; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x)
ret <3 x double> %result
}
@@ -329,5 +1871,4 @@ attributes #2 = { "approx-func-fp-math"="true" }
attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
attributes #4 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GISEL: {{.*}}
-; SDAG: {{.*}}
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 74d3404..0f2eb38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -3,6 +3,7 @@
declare float @llvm.amdgcn.rcp.f32(float) #0
declare double @llvm.amdgcn.rcp.f64(double) #0
+declare double @llvm.amdgcn.sqrt.f64(double) #0
declare double @llvm.sqrt.f64(double) #0
declare float @llvm.sqrt.f32(float) #0
@@ -124,7 +125,15 @@ define amdgpu_kernel void @unsafe_rcp_pat_f64(ptr addrspace(1) %out, double %src
; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
; SI-NOT: v_rsq_f64_e32
-; SI: v_sqrt_f64
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
; SI: v_rcp_f64
define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
%sqrt = call double @llvm.sqrt.f64(double %src)
@@ -133,12 +142,42 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %s
ret void
}
+; FUNC-LABEL: {{^}}safe_amdgcn_sqrt_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
+ %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, ptr addrspace(1) %out, align 8
+ ret void
+}
+
; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_rcp_f64
+; SI: buffer_store_dwordx2
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+ %sqrt = call double @llvm.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_amdgcn_sqrt_rsq_rcp_pat_f64:
; SI: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
- %sqrt = call double @llvm.sqrt.f64(double %src)
+define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+ %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
store double %rcp, ptr addrspace(1) %out, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index a20aaac..9caea1b 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -15,8 +15,30 @@ declare double @llvm.fabs.f64(double)
define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
; SI-SDAG-LABEL: s_rsq_f64:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -37,8 +59,32 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
;
; SI-GISEL-LABEL: s_rsq_f64:
; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s2, 0
+; SI-GISEL-NEXT: s_brev_b32 s3, 8
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -59,7 +105,29 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
;
; VI-SDAG-LABEL: s_rsq_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -77,7 +145,31 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
;
; VI-GISEL-LABEL: s_rsq_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s2, 0
+; VI-GISEL-NEXT: s_brev_b32 s3, 8
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -107,8 +199,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
; SI-SDAG-LABEL: s_rsq_f64_fabs:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -129,8 +243,32 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
;
; SI-GISEL-LABEL: s_rsq_f64_fabs:
; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-GISEL-NEXT: s_mov_b32 s2, 0
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT: s_brev_b32 s3, 8
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -151,7 +289,29 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
;
; VI-SDAG-LABEL: s_rsq_f64_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -169,7 +329,31 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
;
; VI-GISEL-LABEL: s_rsq_f64_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-GISEL-NEXT: s_mov_b32 s2, 0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: s_brev_b32 s3, 8
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -200,8 +384,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
; SI-SDAG-LABEL: s_neg_rsq_f64:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -222,8 +428,32 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
;
; SI-GISEL-LABEL: s_neg_rsq_f64:
; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s2, 0
+; SI-GISEL-NEXT: s_brev_b32 s3, 8
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -244,7 +474,29 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
;
; VI-SDAG-LABEL: s_neg_rsq_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -262,7 +514,31 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
;
; VI-GISEL-LABEL: s_neg_rsq_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s2, 0
+; VI-GISEL-NEXT: s_brev_b32 s3, 8
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -292,8 +568,30 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
; SI-SDAG-LABEL: s_neg_rsq_neg_f64:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -314,8 +612,32 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
;
; SI-GISEL-LABEL: s_neg_rsq_neg_f64:
; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s2, 0
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT: s_brev_b32 s3, 8
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -336,7 +658,29 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
;
; VI-SDAG-LABEL: s_neg_rsq_neg_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -354,7 +698,31 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
;
; VI-GISEL-LABEL: s_neg_rsq_neg_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s2, 0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: s_brev_b32 s3, 8
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -386,8 +754,30 @@ define double @v_rsq_f64(double %x) {
; SI-SDAG-LABEL: v_rsq_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -407,8 +797,30 @@ define double @v_rsq_f64(double %x) {
; SI-GISEL-LABEL: v_rsq_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -428,7 +840,29 @@ define double @v_rsq_f64(double %x) {
; VI-SDAG-LABEL: v_rsq_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -445,7 +879,29 @@ define double @v_rsq_f64(double %x) {
; VI-GISEL-LABEL: v_rsq_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -467,8 +923,30 @@ define double @v_rsq_f64_fabs(double %x) {
; SI-SDAG-LABEL: v_rsq_f64_fabs:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -488,8 +966,30 @@ define double @v_rsq_f64_fabs(double %x) {
; SI-GISEL-LABEL: v_rsq_f64_fabs:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -509,7 +1009,29 @@ define double @v_rsq_f64_fabs(double %x) {
; VI-SDAG-LABEL: v_rsq_f64_fabs:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -526,7 +1048,29 @@ define double @v_rsq_f64_fabs(double %x) {
; VI-GISEL-LABEL: v_rsq_f64_fabs:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -549,8 +1093,30 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; SI-SDAG-LABEL: v_rsq_f64_missing_contract0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -570,8 +1136,30 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; SI-GISEL-LABEL: v_rsq_f64_missing_contract0:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -591,7 +1179,29 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; VI-SDAG-LABEL: v_rsq_f64_missing_contract0:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -608,7 +1218,29 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; VI-GISEL-LABEL: v_rsq_f64_missing_contract0:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -630,8 +1262,30 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; SI-SDAG-LABEL: v_rsq_f64_missing_contract1:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -651,8 +1305,30 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; SI-GISEL-LABEL: v_rsq_f64_missing_contract1:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -672,7 +1348,29 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; VI-SDAG-LABEL: v_rsq_f64_missing_contract1:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -689,7 +1387,29 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; VI-GISEL-LABEL: v_rsq_f64_missing_contract1:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -711,8 +1431,30 @@ define double @v_neg_rsq_f64(double %x) {
; SI-SDAG-LABEL: v_neg_rsq_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -732,8 +1474,30 @@ define double @v_neg_rsq_f64(double %x) {
; SI-GISEL-LABEL: v_neg_rsq_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -753,7 +1517,29 @@ define double @v_neg_rsq_f64(double %x) {
; VI-SDAG-LABEL: v_neg_rsq_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -770,7 +1556,29 @@ define double @v_neg_rsq_f64(double %x) {
; VI-GISEL-LABEL: v_neg_rsq_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -792,101 +1600,222 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-LABEL: v_rsq_v2f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
-; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT: s_nop 0
+; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_rsq_v2f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000
-; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; SI-GISEL-NEXT: s_nop 0
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_rsq_v2f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -894,9 +1823,48 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-LABEL: v_rsq_v2f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
@@ -929,101 +1897,222 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-LABEL: v_neg_rsq_v2f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
-; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT: s_nop 0
+; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_neg_rsq_v2f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000
-; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0
+; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT: s_nop 0
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_neg_rsq_v2f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1031,9 +2120,48 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-LABEL: v_neg_rsq_v2f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1066,8 +2194,30 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1089,43 +2239,105 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19
; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], s[4:5]
+; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT: s_nop 0
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
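The <2 x double> tests in this stretch unroll the scalar expansion once per element: each register pair runs the same scale, rsq seed, refinement, unscale, and class-select sequence with renumbered registers, which is why each hunk roughly doubles in size. A hedged C model of the per-element sequence follows the v_rsq_f64__afn_sqrt checks further down.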
; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1144,9 +2356,48 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5]
; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5]
; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1179,105 +2430,224 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7]
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3ff00000
-; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
-; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
+; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11
; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v19
; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc
; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
; SI-SDAG-NEXT: s_nop 0
-; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7]
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000
+; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v8, v19
+; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT: s_nop 1
-; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT: s_nop 0
+; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT: v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT: v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT: v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1285,9 +2655,48 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1320,8 +2729,30 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 9
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1341,8 +2772,30 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1362,7 +2815,29 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 9
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1379,7 +2854,29 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1403,8 +2900,30 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1424,8 +2943,30 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1445,7 +2986,29 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1462,7 +3025,29 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1481,226 +3066,1033 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
}
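The check blocks above all repeat one scalar expansion pattern, so a compact model helps when reading them. The sketch below is illustrative only: it follows the SDAG instruction order, and the name sqrt_expanded plus the use of 1.0/sqrt() as the seed are stand-ins for v_rsq_f64, not backend code.

    #include <math.h>

    /* Hedged model of the expanded f64 sqrt sequence in the checks.
       1.0/sqrt(v) stands in for the v_rsq_f64 seed. */
    static double sqrt_expanded(double x) {
        /* s_brev_b32 s5, 8 builds the 2^-767 threshold; inputs below
           it are scaled by 2^256 (the 0x100 ldexp operand). */
        int scaled = x < 0x1.0p-767;
        double v = scaled ? ldexp(x, 256) : x;

        double y = 1.0 / sqrt(v);   /* rsq seed */
        double g = v * y;           /* g ~= sqrt(v)     */
        double h = y * 0.5;         /* h ~= 0.5/sqrt(v) */

        /* One Goldschmidt-style correction of both g and h ... */
        double r = fma(-h, g, 0.5);
        g = fma(g, r, g);
        h = fma(h, r, h);
        /* ... then two Newton refinements of g against v. */
        r = fma(-g, g, v);
        g = fma(r, h, g);
        r = fma(-g, g, v);
        g = fma(r, h, g);

        /* Undo the scale: sqrt(x * 2^256) == sqrt(x) * 2^128, hence
           the ldexp by 0xffffff80 (-128) in the checks. */
        if (scaled)
            g = ldexp(g, -128);

        /* v_cmp_class_f64 with mask 0x260 (-0 | +0 | +inf): those
           inputs pass through unchanged. */
        if (v == 0.0 || (isinf(v) && v > 0.0))
            g = v;
        return g;
    }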
define double @v_rsq_f64__afn_fdiv(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_fdiv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_fdiv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double 1.0, %sqrt
ret double %rsq
}
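The fdiv half of v_rsq_f64__afn_fdiv is the usual fma-based Newton refinement of the v_rcp_f64 seed. Again a hedged sketch, following the GISEL shape with its explicit numerator multiply; afn_recip and the 1.0/x seed are assumptions for illustration only:

    #include <math.h>

    /* Illustrative model of the afn 1.0/x refinement in the checks:
       two Newton steps on the rcp seed, then a final step that folds
       in the numerator. 1.0/x stands in for v_rcp_f64. */
    static double afn_recip(double x) {
        double q = 1.0 / x;          /* rcp seed */
        double e = fma(-x, q, 1.0);  /* residual 1 - x*q */
        q = fma(e, q, q);            /* q += e*q (Newton step) */
        e = fma(-x, q, 1.0);
        q = fma(e, q, q);
        double m = q * 1.0;          /* explicit numerator multiply,
                                        as in the GISEL checks */
        e = fma(-x, m, 1.0);
        return fma(e, q, m);
    }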
define double @v_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double 1.0, %sqrt
ret double %rsq
}
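In the v_neg_rsq_f64__afn checks that follow, the only change is the numerator: the explicit multiply becomes a multiply by -1.0 (v_mul_f64 v[4:5], v[2:3], -1.0 in SDAG, v_mul_f64 v[4:5], -1.0, v[2:3] in GISEL) and the final residual fma uses -1.0, so the negation is folded into the last refinement step rather than applied to the finished quotient.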
define double @v_neg_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn double -1.0, %sqrt
ret double %rsq
}
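
The reciprocal that follows each sqrt in these tests is the usual two-step Newton refinement of v_rcp_f64, with the fdiv numerator folded into the last three instructions. A sketch under the same assumptions as above (hypothetical helper name, library division as the estimate):

  #include <math.h>

  static double expand_recip_f64(double x, double num) { /* num is 1.0 or -1.0 */
    double r = 1.0 / x;            /* stand-in for the v_rcp_f64 estimate */
    double e = fma(-x, r, 1.0);    /* residual of the estimate */
    r = fma(e, r, r);              /* first Newton refinement */
    e = fma(-x, r, 1.0);
    r = fma(e, r, r);              /* second Newton refinement */
    double q = num * r;            /* the v_mul_f64 with the +-1.0 numerator */
    double d = fma(-x, q, num);    /* residual against the numerator */
    return fma(d, r, q);           /* final correction */
  }

Passing num = -1.0 reproduces the v_neg_rsq variants, which is why those check lines differ from the plain rsq tests only in the two -1.0 operands.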
define double @v_rsq_f64__afn_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn ninf double 1.0, %sqrt
ret double %rsq
}
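
Written out, the sqrt refinement in these checks is one coupled Goldschmidt step followed by two Newton corrections. With r_0 the v_rsq_f64 estimate:

  \begin{aligned}
  g_0 &= x\,r_0, & h_0 &= \tfrac{1}{2}r_0, & e &= \tfrac{1}{2} - g_0 h_0,\\
  g_1 &= g_0 + g_0 e, & h_1 &= h_0 + h_0 e, &&\\
  d_k &= x - g_k^2, & g_{k+1} &= g_k + d_k h_1, & k &= 1, 2.
  \end{aligned}

Here g_k converges quadratically toward \sqrt{x} while 2h_1 approximates 1/\sqrt{x}, and each residual d_k is formed by a single FMA, so only one rounding separates it from the exact remainder.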
define double @v_rsq_f64__afn_nnan(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan double 1.0, %sqrt
ret double %rsq
}
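
Every variant uses the same pair of exponent constants, 0x100 on the way in and 0xffffff80 on the way out. They are consistent because the square root halves the exponent of the scaled input:

  \sqrt{2^{256}\,x} = 2^{128}\sqrt{x}
  \quad\Longrightarrow\quad
  \sqrt{x} = \mathrm{ldexp}\!\left(\sqrt{2^{256}x},\,-128\right),
  \qquad 0\mathrm{xffffff80} = -128.

The scale is applied only to inputs below the 2^{-767} threshold that s_mov_b32/s_brev_b32 build in s[4:5].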
define double @v_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan ninf double 1.0, %sqrt
ret double %rsq
}
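
The final selects key off v_cmp_class_f64 with an inline 0x260 mask. Decomposed with the llvm.amdgcn.class bit assignments (a sketch; the enum names are invented):

  /* Bits of the 0x260 mask used by the expansion's final select. */
  enum {
    CLASS_NEG_ZERO = 1 << 5,   /* 0x020 */
    CLASS_POS_ZERO = 1 << 6,   /* 0x040 */
    CLASS_POS_INF  = 1 << 9,   /* 0x200 */
  };
  _Static_assert((CLASS_NEG_ZERO | CLASS_POS_ZERO | CLASS_POS_INF) == 0x260,
                 "mask operand of v_cmp_class_f64 in the checks above");

These are exactly the inputs whose square root is the input itself, so the v_cndmask pair returns the (scaled) operand unchanged instead of the iteration result.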
define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
%rsq = fdiv contract afn nnan ninf double -1.0, %sqrt
ret double %rsq
@@ -1710,8 +4102,30 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1731,8 +4145,30 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1752,7 +4188,29 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1769,7 +4227,29 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1788,71 +4268,250 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
}
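
The <2 x double> test that follows is the scalar recipe applied per element, with the two dependency chains interleaved by the scheduler; in terms of the hypothetical helpers sketched earlier:

  void expand_rsq_v2f64(const double x[2], double out[2]) {
    for (int i = 0; i < 2; ++i)  /* per element; hardware interleaves the chains */
      out[i] = expand_recip_f64(expand_sqrt_f64(x[i]), 1.0);
  }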
define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
-; SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
-; SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9]
+; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3]
-; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v10, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[5:6], v[0:1]
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[7:8], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1]
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3]
-; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1]
-; VI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3]
-; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
%rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt
@@ -1860,34 +4519,155 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
}
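All of the rsq check blocks above encode the same per-element sequence: one coupled Goldschmidt-style iteration on the v_rsq_f64 seed, then two Newton steps on the square root itself. A minimal scalar sketch of that refinement in IR, with @approx_rsq as a hypothetical stand-in for v_rsq_f64 (an illustrative name, not a real intrinsic):

declare double @llvm.fma.f64(double, double, double)
declare double @approx_rsq(double)   ; hypothetical stand-in for v_rsq_f64

define double @sqrt_refine_sketch(double %x) {
  %r   = call double @approx_rsq(double %x)        ; seed: ~1/sqrt(x)
  %g0  = fmul double %x, %r                        ; g ~ sqrt(x)
  %h0  = fmul double %r, 5.000000e-01              ; h ~ 0.5/sqrt(x)
  %nh  = fneg double %h0
  %t   = call double @llvm.fma.f64(double %nh, double %g0, double 5.000000e-01)
  %g1  = call double @llvm.fma.f64(double %g0, double %t, double %g0)
  %h1  = call double @llvm.fma.f64(double %h0, double %t, double %h0)
  %ng1 = fneg double %g1
  %u0  = call double @llvm.fma.f64(double %ng1, double %g1, double %x)  ; x - g*g
  %g2  = call double @llvm.fma.f64(double %u0, double %h1, double %g1)
  %ng2 = fneg double %g2
  %u1  = call double @llvm.fma.f64(double %ng2, double %g2, double %x)
  %g3  = call double @llvm.fma.f64(double %u1, double %h1, double %g2)
  ret double %g3                                   ; refined sqrt, before un-scaling
}

The SDAG and GISEL outputs differ only in scheduling and operand order; the arithmetic is identical.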
define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
-; SDAG-LABEL: s_rsq_f64_unsafe:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: s_rsq_f64_unsafe:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], s[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
-; GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: ; return to shader part epilog
+; SI-SDAG-LABEL: s_rsq_f64_unsafe:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: ; return to shader part epilog
+;
+; SI-GISEL-LABEL: s_rsq_f64_unsafe:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_mov_b32 s2, 0
+; SI-GISEL-NEXT: s_brev_b32 s3, 8
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; SI-GISEL-NEXT: ; return to shader part epilog
+;
+; VI-SDAG-LABEL: s_rsq_f64_unsafe:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; VI-SDAG-NEXT: ; return to shader part epilog
+;
+; VI-GISEL-LABEL: s_rsq_f64_unsafe:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_mov_b32 s2, 0
+; VI-GISEL-NEXT: s_brev_b32 s3, 8
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; VI-GISEL-NEXT: ; return to shader part epilog
%rsq = call contract double @llvm.sqrt.f64(double %x)
%result = fdiv contract double 1.0, %rsq
%cast = bitcast double %result to <2 x i32>
@@ -1901,32 +4681,147 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
}
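The scaling constants in the blocks above decode as follows: s_brev_b32 s5, 8 materializes 0x10000000 as the high word, i.e. the double 0x1000000000000000 = 2^-767. Inputs below that threshold are pre-scaled by 2^+256 (the 0x100 immediate, or the compare bit shifted left by 8) and the refined result is re-scaled by 2^-128 (0xffffff80 = -128), since sqrt(x * 2^256) = sqrt(x) * 2^128. A sketch of just the scaling logic, assuming the llvm.ldexp intrinsic is available in this tree (the backend performs this during lowering, not in IR):

declare double @llvm.ldexp.f64.i32(double, i32)
declare double @llvm.sqrt.f64(double)

define double @scaled_sqrt_sketch(double %x) {
  ; 0x1000000000000000 = 2^-767; below this the rsq seed loses accuracy
  %small = fcmp olt double %x, 0x1000000000000000
  %up    = select i1 %small, i32 256, i32 0
  %xs    = call double @llvm.ldexp.f64.i32(double %x, i32 %up)
  %s     = call double @llvm.sqrt.f64(double %xs)   ; refined as sketched earlier
  %down  = select i1 %small, i32 -128, i32 0
  %r     = call double @llvm.ldexp.f64.i32(double %s, i32 %down)
  ret double %r
}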
define double @v_rsq_f64_unsafe(double %x) #0 {
-; SDAG-LABEL: v_rsq_f64_unsafe:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64_unsafe:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64_unsafe:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64_unsafe:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64_unsafe:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64_unsafe:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
%sqrt = call double @llvm.sqrt.f64(double %x)
%rsq = fdiv double 1.0, %sqrt
ret double %rsq
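With the unsafe/afn attributes the trailing 1.0/sqrt divide skips the v_div_scale/v_div_fixup machinery and instead refines a v_rcp_f64 seed with three Newton iterations (e = 1 - d*r; r = r + e*r), which is the fma pattern at the tail of every block above; GISEL additionally multiplies the 1.0 numerator through explicitly. A hedged scalar sketch, with @approx_rcp as an illustrative stand-in for v_rcp_f64:

declare double @llvm.fma.f64(double, double, double)
declare double @approx_rcp(double)   ; hypothetical stand-in for v_rcp_f64

define double @rcp_refine_sketch(double %d) {
  %r0 = call double @approx_rcp(double %d)
  %nd = fneg double %d
  %e0 = call double @llvm.fma.f64(double %nd, double %r0, double 1.000000e+00)
  %r1 = call double @llvm.fma.f64(double %e0, double %r0, double %r0)
  %e1 = call double @llvm.fma.f64(double %nd, double %r1, double 1.000000e+00)
  %r2 = call double @llvm.fma.f64(double %e1, double %r1, double %r1)
  %e2 = call double @llvm.fma.f64(double %nd, double %r2, double 1.000000e+00)
  %r3 = call double @llvm.fma.f64(double %e2, double %r2, double %r2)
  ret double %r3                     ; ~1/d to f64 precision
}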
@@ -2190,7 +5085,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; SI-SDAG-LABEL: v_div_contract_sqrt_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2210,7 +5127,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; SI-GISEL-LABEL: v_div_contract_sqrt_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2230,7 +5169,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; VI-SDAG-LABEL: v_div_contract_sqrt_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2247,7 +5208,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; VI-GISEL-LABEL: v_div_contract_sqrt_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
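Note that contract alone does not enable the fast reciprocal path: this test and the arcp and contract+arcp variants that follow still lower the divide through the correctly rounded v_div_scale / v_rcp / v_div_fixup sequence; only the sqrt operand changes to the expanded form. The IR bodies sit outside the visible hunks; a hedged reconstruction inferred from the labels and operand order would look like:

define double @v_div_contract_sqrt_f64(double %x, double %y) {
  %sqrt = call contract double @llvm.sqrt.f64(double %y)
  %div = fdiv contract double %x, %sqrt
  ret double %div
}
declare double @llvm.sqrt.f64(double)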
@@ -2269,7 +5252,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; SI-SDAG-LABEL: v_div_arcp_sqrt_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2289,7 +5294,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; SI-GISEL-LABEL: v_div_arcp_sqrt_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2309,7 +5336,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; VI-SDAG-LABEL: v_div_arcp_sqrt_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2326,7 +5375,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; VI-GISEL-LABEL: v_div_arcp_sqrt_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2348,7 +5419,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s4, 0
+; SI-SDAG-NEXT: s_brev_b32 s5, 8
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2368,7 +5461,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT: s_mov_b32 s4, 0
+; SI-GISEL-NEXT: s_brev_b32 s5, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2388,7 +5503,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2405,7 +5542,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
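The 0x260 operand every expansion feeds to v_cmp_class_f64 decodes as (1<<5)|(1<<6)|(1<<9): -0, +0, and +inf — exactly the zero-or-inf inputs for which sqrt must return its argument unchanged, so the scaled result is discarded and the input selected instead. The llvm.is.fpclass intrinsic shares this bit encoding; a small sketch:

declare i1 @llvm.is.fpclass.f64(double, i32 immarg)

define i1 @is_zero_or_inf(double %x) {
  ; 608 = 0x260 = (1<<5)|(1<<6)|(1<<9): -0, +0, +inf
  %c = call i1 @llvm.is.fpclass.f64(double %x, i32 608)
  ret i1 %c
}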
@@ -2427,9 +5586,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT: s_brev_b32 s7, 8
; SI-SDAG-NEXT: s_mov_b32 s6, 0
+; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000
+; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
@@ -2449,10 +5629,31 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
; SI-GISEL-NEXT: s_mov_b32 s6, 0
+; SI-GISEL-NEXT: s_brev_b32 s7, 8
+; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7]
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
@@ -2472,9 +5673,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: s_mov_b32 s4, 0
+; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -2491,9 +5713,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_mov_b32 s4, 0
+; VI-GISEL-NEXT: s_brev_b32 s5, 8
+; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
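In this last test the scalar pair loaded with 0x40700000 as its high word is the bit pattern of the double 256.0, so the checks cover a constant numerator divided by the expanded sqrt. A hedged reconstruction of the test body, inferred from the label and constants:

define double @v_div_const_contract_sqrt_f64(double %x) {
  %sqrt = call contract double @llvm.sqrt.f64(double %x)
  %div = fdiv contract double 2.560000e+02, %sqrt
  ret double %div
}
declare double @llvm.sqrt.f64(double)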
@@ -2514,3 +5757,5 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
attributes #0 = { "unsafe-fp-math"="true" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GISEL: {{.*}}
+; SDAG: {{.*}}