diff options
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTX.h | 9 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 36 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 9 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 127 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/div.ll | 140 |
8 files changed, 245 insertions, 95 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 6c0d0e3..8464028 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -253,7 +253,14 @@ enum PrmtMode { RC16, }; } -} + +enum class DivPrecisionLevel : unsigned { + Approx = 0, + Full = 1, + IEEE754 = 2, +}; + +} // namespace NVPTX void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 2247ae3..b05a471 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -66,8 +66,9 @@ bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return SelectionDAGISel::runOnMachineFunction(MF); } -int NVPTXDAGToDAGISel::getDivF32Level() const { - return Subtarget->getTargetLowering()->getDivF32Level(); +NVPTX::DivPrecisionLevel +NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const { + return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N); } bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 92efabc..648e8e2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -43,7 +43,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { // If true, generate mul.wide from sext and mul bool doMulWide; - int getDivF32Level() const; + NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const; bool usePrecSqrtF32() const; bool useF32FTZ() const; bool allowFMA() const; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 82d00ef..68a2935 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -85,11 +85,16 @@ static cl::opt<unsigned> FMAContractLevelOpt( " 1: do it 2: do it aggressively"), cl::init(2)); -static cl::opt<int> UsePrecDivF32( +static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32( "nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), - cl::init(2)); + cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", + "Use div.approx"), + clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), + clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", + "Use IEEE Compliant F32 div.rnd if available")), + cl::init(NVPTX::DivPrecisionLevel::IEEE754)); static cl::opt<bool> UsePrecSqrtF32( "nvptx-prec-sqrtf32", cl::Hidden, @@ -109,17 +114,22 @@ static cl::opt<bool> ForceMinByValParamAlign( " params of device functions."), cl::init(false)); -int NVPTXTargetLowering::getDivF32Level() const { - if (UsePrecDivF32.getNumOccurrences() > 0) { - // If nvptx-prec-div32=N is used on the command-line, always honor it +NVPTX::DivPrecisionLevel +NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, + const SDNode &N) const { + // If nvptx-prec-div32=N is used on the command-line, always honor it + if (UsePrecDivF32.getNumOccurrences() > 0) return UsePrecDivF32; - } else { - // Otherwise, use div.approx if fast math is enabled - if (getTargetMachine().Options.UnsafeFPMath) - return 0; - else - return 2; - } + + // Otherwise, use div.approx if fast math is enabled + if (allowUnsafeFPMath(MF)) + return NVPTX::DivPrecisionLevel::Approx; + + const SDNodeFlags Flags = N.getFlags(); + if (Flags.hasApproximateFuncs()) + return NVPTX::DivPrecisionLevel::Approx; + + return NVPTX::DivPrecisionLevel::IEEE754; } bool NVPTXTargetLowering::usePrecSqrtF32() const { @@ -4975,7 +4985,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, return allowUnsafeFPMath(MF); } -bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { +bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const { // Honor TargetOptions flags that explicitly say unsafe math is okay. if (MF.getTarget().Options.UnsafeFPMath) return true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 3dff83d..0a40718 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -216,11 +216,8 @@ public: // Get the degree of precision we want from 32-bit floating point division // operations. - // - // 0 - Use ptx div.approx - // 1 - Use ptx.div.full (approximate, but less so than div.approx) - // 2 - Use IEEE-compliant div instructions, if available. - int getDivF32Level() const; + NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, + const SDNode &N) const; // Get whether we should use a precise or approximate 32-bit floating point // sqrt instruction. @@ -237,7 +234,7 @@ public: unsigned combineRepeatedFPDivisors() const override { return 2; } bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const; - bool allowUnsafeFPMath(MachineFunction &MF) const; + bool allowUnsafeFPMath(const MachineFunction &MF) const; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT) const override { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 2c65ee6..444d35b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -151,9 +151,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">; def doMulWide : Predicate<"doMulWide">; -def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; -def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; - def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; @@ -1119,26 +1116,19 @@ def INEG64 : //----------------------------------- // Constant 1.0f -def FloatConst1 : PatLeaf<(fpimm), [{ - return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() && - N->getValueAPF().convertToFloat() == 1.0f; +def f32imm_1 : FPImmLeaf<f32, [{ + return &Imm.getSemantics() == &llvm::APFloat::IEEEsingle() && + Imm.convertToFloat() == 1.0f; }]>; // Constant 1.0 (double) -def DoubleConst1 : PatLeaf<(fpimm), [{ - return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && - N->getValueAPF().convertToDouble() == 1.0; +def f64imm_1 : FPImmLeaf<f64, [{ + return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && + Imm.convertToDouble() == 1.0; }]>; // Constant -1.0 (double) -def DoubleConstNeg1 : PatLeaf<(fpimm), [{ - return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && - N->getValueAPF().convertToDouble() == -1.0; -}]>; - - -// Constant -X -> X (double) -def NegDoubleConst : SDNodeXForm<fpimm, [{ - return CurDAG->getTargetConstantFP(-(N->getValueAPF()), - SDLoc(N), MVT::f64); +def f64imm_neg1 : FPImmLeaf<f64, [{ + return &Imm.getSemantics() == &llvm::APFloat::IEEEdouble() && + Imm.convertToDouble() == -1.0; }]>; defm FADD : F3_fma_component<"add", fadd>; @@ -1189,11 +1179,11 @@ def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>; // // F64 division // -def FDIV641r : +def FRCP64r : NVPTXInst<(outs Float64Regs:$dst), - (ins f64imm:$a, Float64Regs:$b), + (ins Float64Regs:$b), "rcp.rn.f64 \t$dst, $b;", - [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>; + [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>; def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b), @@ -1207,24 +1197,31 @@ def FDIV64ri : // fdiv will be converted to rcp // fneg (fdiv 1.0, X) => fneg (rcp.rn X) -def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b), - (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>; +def : Pat<(fdiv f64imm_neg1, f64:$b), + (FNEGf64 (FRCP64r $b))>; // // F32 Approximate reciprocal // -def FDIV321r_ftz : + +def fdiv_approx : PatFrag<(ops node:$a, node:$b), + (fdiv node:$a, node:$b), [{ + return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Approx; +}]>; + + +def FRCP32_approx_r_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), + (ins Float32Regs:$b), "rcp.approx.ftz.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV321r : + [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>, + Requires<[doF32FTZ]>; +def FRCP32_approx_r : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), + (ins Float32Regs:$b), "rcp.approx.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, - Requires<[do_DIVF32_APPROX]>; + [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>; + // // F32 Approximate division // @@ -1232,43 +1229,43 @@ def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; + [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>, + Requires<[doF32FTZ]>; def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; + [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.approx.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, - Requires<[do_DIVF32_APPROX]>; + [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>; def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.approx.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX]>; + [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>; // // F32 Semi-accurate reciprocal // // rcp.approx gives the same result as div.full(1.0f, a) and is faster. // -def FDIV321r_approx_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.ftz.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV321r_approx : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, - Requires<[do_DIVF32_FULL]>; + +def fdiv_full : PatFrag<(ops node:$a, node:$b), + (fdiv node:$a, node:$b), [{ + return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Full; +}]>; + + +def : Pat<(fdiv_full f32imm_1, f32:$b), + (FRCP32_approx_r_ftz $b)>, + Requires<[doF32FTZ]>; + +def : Pat<(fdiv_full f32imm_1, f32:$b), + (FRCP32_approx_r $b)>; + // // F32 Semi-accurate division // @@ -1276,40 +1273,38 @@ def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.full.ftz.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; + [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>, + Requires<[doF32FTZ]>; def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.full.ftz.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; + [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.full.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, - Requires<[do_DIVF32_FULL]>; + [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>; def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.full.f32 \t$dst, $a, $b;", - [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL]>; + [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>; // // F32 Accurate reciprocal // -def FDIV321r_prec_ftz : +def FRCP32r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), + (ins Float32Regs:$b), "rcp.rn.ftz.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, + [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>, Requires<[doF32FTZ]>; -def FDIV321r_prec : +def FRCP32r_prec : NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), + (ins Float32Regs:$b), "rcp.rn.f32 \t$dst, $b;", - [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>; + [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>; // // F32 Accurate division // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 4f8a798..64a0344 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1615,24 +1615,24 @@ def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;", F64RT, F64RT, int_nvvm_rsqrt_approx_d>; // 1.0f / sqrt_approx -> rsqrt_approx -def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)), +def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)), (INT_NVVM_RSQRT_APPROX_F $a)>, Requires<[doRsqrtOpt]>; -def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f f32:$a)), +def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)), (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>, Requires<[doRsqrtOpt]>; // same for int_nvvm_sqrt_f when non-precision sqrt is requested -def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)), +def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)), (INT_NVVM_RSQRT_APPROX_F $a)>, Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>; -def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)), +def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)), (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>, Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>; -def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)), +def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)), (INT_NVVM_RSQRT_APPROX_F $a)>, Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>; -def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)), +def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)), (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>, Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>; // diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll index bd8d9a3..4c2cb75 100644 --- a/llvm/test/CodeGen/NVPTX/div.ll +++ b/llvm/test/CodeGen/NVPTX/div.ll @@ -24,3 +24,143 @@ define float @div_full(float %a, float %b) { %4 = call float @llvm.nvvm.div.full.ftz(float %3, float 4.0) ret float %4 } + +define float @div_fast_rr(float %a, float %b) { +; CHECK-LABEL: div_fast_rr( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_rr_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [div_fast_rr_param_1]; +; CHECK-NEXT: div.approx.f32 %f3, %f1, %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; +; CHECK-NEXT: ret; + %t1 = fdiv afn float %a, %b + ret float %t1 +} + +define float @div_fast_rr_ftz(float %a, float %b) #0 { +; CHECK-LABEL: div_fast_rr_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_rr_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [div_fast_rr_ftz_param_1]; +; CHECK-NEXT: div.approx.ftz.f32 %f3, %f1, %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; +; CHECK-NEXT: ret; + %t1 = fdiv afn float %a, %b + ret float %t1 +} + +define float @div_fast_ri(float %a, float %b) { +; CHECK-LABEL: div_fast_ri( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_ri_param_0]; +; CHECK-NEXT: mul.rn.f32 %f2, %f1, 0f3F000000; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %t1 = fdiv afn float %a, 2.0 + ret float %t1 +} + +define float @div_fast_ri_ftz(float %a, float %b) #0 { +; CHECK-LABEL: div_fast_ri_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_ri_ftz_param_0]; +; CHECK-NEXT: mul.rn.ftz.f32 %f2, %f1, 0f3F000000; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %t1 = fdiv afn float %a, 2.0 + ret float %t1 +} + +define float @rcp_fast(float %a) { +; CHECK-LABEL: rcp_fast( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [rcp_fast_param_0]; +; CHECK-NEXT: rcp.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %t1 = fdiv afn float 1.0, %a + ret float %t1 +} + +define float @rcp_fast_ftz(float %a) #0 { +; CHECK-LABEL: rcp_fast_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [rcp_fast_ftz_param_0]; +; CHECK-NEXT: rcp.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %t1 = fdiv afn float 1.0, %a + ret float %t1 +} + +define float @div_fast_vec(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: div_fast_vec( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_vec_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [div_fast_vec_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [div_fast_vec_param_2]; +; CHECK-NEXT: ld.param.b32 %f4, [div_fast_vec_param_3]; +; CHECK-NEXT: div.approx.f32 %f5, %f2, %f4; +; CHECK-NEXT: div.approx.f32 %f6, %f1, %f3; +; CHECK-NEXT: add.rn.f32 %f7, %f6, %f5; +; CHECK-NEXT: st.param.b32 [func_retval0], %f7; +; CHECK-NEXT: ret; + %ins_a0 = insertelement <2 x float> poison, float %a, i32 0 + %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1 + %ins_b0 = insertelement <2 x float> poison, float %c, i32 0 + %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1 + %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1 + %ext0 = extractelement <2 x float> %fdiv, i32 0 + %ext1 = extractelement <2 x float> %fdiv, i32 1 + %fadd = fadd float %ext0, %ext1 + ret float %fadd +} + +define float @div_fast_vec_ftz(float %a, float %b, float %c, float %d) #0 { +; CHECK-LABEL: div_fast_vec_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [div_fast_vec_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [div_fast_vec_ftz_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [div_fast_vec_ftz_param_2]; +; CHECK-NEXT: ld.param.b32 %f4, [div_fast_vec_ftz_param_3]; +; CHECK-NEXT: div.approx.ftz.f32 %f5, %f2, %f4; +; CHECK-NEXT: div.approx.ftz.f32 %f6, %f1, %f3; +; CHECK-NEXT: add.rn.ftz.f32 %f7, %f6, %f5; +; CHECK-NEXT: st.param.b32 [func_retval0], %f7; +; CHECK-NEXT: ret; + %ins_a0 = insertelement <2 x float> poison, float %a, i32 0 + %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1 + %ins_b0 = insertelement <2 x float> poison, float %c, i32 0 + %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1 + %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1 + %ext0 = extractelement <2 x float> %fdiv, i32 0 + %ext1 = extractelement <2 x float> %fdiv, i32 1 + %fadd = fadd float %ext0, %ext1 + ret float %fadd +} + +attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" } |