8 files changed, 245 insertions, 95 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 6c0d0e3..8464028 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -253,7 +253,14 @@ enum PrmtMode {
   RC16,
 };
 }
-}
+
+enum class DivPrecisionLevel : unsigned {
+  Approx = 0,  // Use PTX div.approx.
+  Full = 1,    // Use PTX div.full (approximate, but less so than div.approx).
+  IEEE754 = 2, // Use IEEE-compliant div instructions, if available.
+};
+
+} // namespace NVPTX
 void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &);
 } // namespace llvm
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2247ae3..b05a471 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -66,8 +66,9 @@ bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
-int NVPTXDAGToDAGISel::getDivF32Level() const {
-  return Subtarget->getTargetLowering()->getDivF32Level();
+NVPTX::DivPrecisionLevel
+NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
+  return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
 }
 
 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 92efabc..648e8e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -43,7 +43,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
 
-  int getDivF32Level() const;
+  NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
   bool usePrecSqrtF32() const;
   bool useF32FTZ() const;
   bool allowFMA() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 82d00ef..68a2935 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -85,11 +85,16 @@ static cl::opt<unsigned> FMAContractLevelOpt(
              " 1: do it  2: do it aggressively"),
     cl::init(2));
 
-static cl::opt<int> UsePrecDivF32(
+static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32(
     "nvptx-prec-divf32", cl::Hidden,
     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
              " IEEE Compliant F32 div.rnd if available."),
-    cl::init(2));
+    cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0",
+                          "Use div.approx"),
+               clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
+               clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2",
+                          "Use IEEE Compliant F32 div.rnd if available")),
+    cl::init(NVPTX::DivPrecisionLevel::IEEE754));
 
 static cl::opt<bool> UsePrecSqrtF32(
     "nvptx-prec-sqrtf32", cl::Hidden,
@@ -109,17 +114,22 @@ static cl::opt<bool> ForceMinByValParamAlign(
              " params of device functions."),
     cl::init(false));
 
-int NVPTXTargetLowering::getDivF32Level() const {
-  if (UsePrecDivF32.getNumOccurrences() > 0) {
-    // If nvptx-prec-div32=N is used on the command-line, always honor it
+NVPTX::DivPrecisionLevel
+NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
+                                    const SDNode &N) const {
+  // If nvptx-prec-divf32=N is used on the command-line, always honor it
+  if (UsePrecDivF32.getNumOccurrences() > 0)
     return UsePrecDivF32;
-  } else {
-    // Otherwise, use div.approx if fast math is enabled
-    if (getTargetMachine().Options.UnsafeFPMath)
-      return 0;
-    else
-      return 2;
-  }
+
+  // Otherwise, use div.approx if unsafe FP math is allowed for this function
+  if (allowUnsafeFPMath(MF))
+    return NVPTX::DivPrecisionLevel::Approx;
+
+  const SDNodeFlags Flags = N.getFlags();
+  if (Flags.hasApproximateFuncs()) // per-node flag, e.g. `fdiv afn`
+    return NVPTX::DivPrecisionLevel::Approx;
+
+  return NVPTX::DivPrecisionLevel::IEEE754;
 }
 
 bool NVPTXTargetLowering::usePrecSqrtF32() const {
@@ -4975,7 +4985,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
   return allowUnsafeFPMath(MF);
 }
 
-bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const {
   // Honor TargetOptions flags that explicitly say unsafe math is okay.
   if (MF.getTarget().Options.UnsafeFPMath)
     return true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 3dff83d..0a40718 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -216,11 +216,8 @@ public:
 
   // Get the degree of precision we want from 32-bit floating point division
   // operations.
-  //
-  //  0 - Use ptx div.approx
-  //  1 - Use ptx.div.full (approximate, but less so than div.approx)
-  //  2 - Use IEEE-compliant div instructions, if available.
-  int getDivF32Level() const;
+  NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF,
+                                          const SDNode &N) const;
 
   // Get whether we should use a precise or approximate 32-bit floating point
   // sqrt instruction.
@@ -237,7 +234,7 @@ public:
   unsigned combineRepeatedFPDivisors() const override { return 2; }
 
   bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const;
-  bool allowUnsafeFPMath(MachineFunction &MF) const;
+  bool allowUnsafeFPMath(const MachineFunction &MF) const;
 
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT) const override {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2c65ee6..444d35b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -151,9 +151,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
 
 def doMulWide      : Predicate<"doMulWide">;
 
-def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
-def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
-
 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
 
@@ -1119,26 +1116,19 @@ def INEG64 :
 //-----------------------------------
 
 // Constant 1.0f
-def FloatConst1 : PatLeaf<(fpimm), [{
-  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
-         N->getValueAPF().convertToFloat() == 1.0f;
+def f32imm_1 : FPImmLeaf<f32, [{
+  // The f32 type constraint already pins the semantics to IEEEsingle.
+  return Imm.isExactlyValue(1.0);
 }]>;
 // Constant 1.0 (double)
-def DoubleConst1 : PatLeaf<(fpimm), [{
-  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
-         N->getValueAPF().convertToDouble() == 1.0;
+def f64imm_1 : FPImmLeaf<f64, [{
+  // The f64 type constraint already pins the semantics to IEEEdouble.
+  return Imm.isExactlyValue(1.0);
 }]>;
 // Constant -1.0 (double)
-def DoubleConstNeg1 : PatLeaf<(fpimm), [{
-  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
-         N->getValueAPF().convertToDouble() == -1.0;
-}]>;
-
-
-// Constant -X -> X (double)
-def NegDoubleConst : SDNodeXForm<fpimm, [{
-  return CurDAG->getTargetConstantFP(-(N->getValueAPF()),
-                                     SDLoc(N), MVT::f64);
+def f64imm_neg1 : FPImmLeaf<f64, [{
+  // The f64 type constraint already pins the semantics to IEEEdouble.
+  return Imm.isExactlyValue(-1.0);
 }]>;
 
 defm FADD : F3_fma_component<"add", fadd>;
@@ -1189,11 +1179,11 @@ def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
 //
 // F64 division
 //
-def FDIV641r :
+def FRCP64r :
   NVPTXInst<(outs Float64Regs:$dst),
-            (ins f64imm:$a, Float64Regs:$b),
+            (ins Float64Regs:$b),
             "rcp.rn.f64 \t$dst, $b;",
-            [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>;
+            [(set f64:$dst, (fdiv f64imm_1, f64:$b))]>;
 def FDIV64rr :
   NVPTXInst<(outs Float64Regs:$dst),
             (ins Float64Regs:$a, Float64Regs:$b),
@@ -1207,24 +1197,31 @@ def FDIV64ri :
 
 // fdiv will be converted to rcp
 // fneg (fdiv 1.0, X) => fneg (rcp.rn X)
-def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
-          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
+def : Pat<(fdiv f64imm_neg1, f64:$b),
+          (FNEGf64 (FRCP64r $b))>;
 
 //
 // F32 Approximate reciprocal
 //
-def FDIV321r_ftz :
+// fdiv restricted to nodes for which getDivF32Level() selects Approx.
+def fdiv_approx : PatFrag<(ops node:$a, node:$b),
+                          (fdiv node:$a, node:$b), [{
+  return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Approx;
+}]>;
+
+
+def FRCP32_approx_r_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
+            (ins Float32Regs:$b),
             "rcp.approx.ftz.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
-            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r :
+            [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>,
+            Requires<[doF32FTZ]>;
+def FRCP32_approx_r :
   NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
+            (ins Float32Regs:$b),
             "rcp.approx.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
-            Requires<[do_DIVF32_APPROX]>;
+            [(set f32:$dst, (fdiv_approx f32imm_1, f32:$b))]>;
+
 //
 // F32 Approximate division
 //
@@ -1232,43 +1229,43 @@ def FDIV32approxrr_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, Float32Regs:$b),
             "div.approx.ftz.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
-            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+            [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>,
+            Requires<[doF32FTZ]>;
 def FDIV32approxri_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, f32imm:$b),
             "div.approx.ftz.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
-            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+            [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>,
+            Requires<[doF32FTZ]>;
 def FDIV32approxrr :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, Float32Regs:$b),
             "div.approx.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
-            Requires<[do_DIVF32_APPROX]>;
+            [(set f32:$dst, (fdiv_approx f32:$a, f32:$b))]>;
 def FDIV32approxri :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, f32imm:$b),
             "div.approx.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
-            Requires<[do_DIVF32_APPROX]>;
+            [(set f32:$dst, (fdiv_approx f32:$a, fpimm:$b))]>;
 //
 // F32 Semi-accurate reciprocal
 //
 // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
 //
-def FDIV321r_approx_ftz :
-  NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
-            "rcp.approx.ftz.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
-            Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx :
-  NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
-            "rcp.approx.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
-            Requires<[do_DIVF32_FULL]>;
+// fdiv restricted to nodes for which getDivF32Level() selects Full.
+def fdiv_full : PatFrag<(ops node:$a, node:$b),
+                        (fdiv node:$a, node:$b), [{
+  return getDivF32Level(N) == NVPTX::DivPrecisionLevel::Full;
+}]>;
+
+
+def : Pat<(fdiv_full f32imm_1, f32:$b),
+          (FRCP32_approx_r_ftz $b)>,
+      Requires<[doF32FTZ]>;
+
+def : Pat<(fdiv_full f32imm_1, f32:$b),
+          (FRCP32_approx_r $b)>;
+
 //
 // F32 Semi-accurate division
 //
@@ -1276,40 +1273,38 @@ def FDIV32rr_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, Float32Regs:$b),
             "div.full.ftz.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>,
-            Requires<[do_DIVF32_FULL, doF32FTZ]>;
+            [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>,
+            Requires<[doF32FTZ]>;
 def FDIV32ri_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, f32imm:$b),
             "div.full.ftz.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
-            Requires<[do_DIVF32_FULL, doF32FTZ]>;
+            [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>,
+            Requires<[doF32FTZ]>;
 def FDIV32rr :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, Float32Regs:$b),
             "div.full.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
-            Requires<[do_DIVF32_FULL]>;
+            [(set f32:$dst, (fdiv_full f32:$a, f32:$b))]>;
 def FDIV32ri :
   NVPTXInst<(outs Float32Regs:$dst),
             (ins Float32Regs:$a, f32imm:$b),
             "div.full.f32 \t$dst, $a, $b;",
-            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
-            Requires<[do_DIVF32_FULL]>;
+            [(set f32:$dst, (fdiv_full f32:$a, fpimm:$b))]>;
 //
 // F32 Accurate reciprocal
 //
-def FDIV321r_prec_ftz :
+def FRCP32r_prec_ftz :
   NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
+            (ins Float32Regs:$b),
             "rcp.rn.ftz.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
+            [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>,
             Requires<[doF32FTZ]>;
-def FDIV321r_prec :
+def FRCP32r_prec :
   NVPTXInst<(outs Float32Regs:$dst),
-            (ins f32imm:$a, Float32Regs:$b),
+            (ins Float32Regs:$b),
             "rcp.rn.f32 \t$dst, $b;",
-            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>;
+            [(set f32:$dst, (fdiv f32imm_1, f32:$b))]>;
 //
 // F32 Accurate division
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4f8a798..64a0344 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1615,24 +1615,24 @@ def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
   F64RT, F64RT, int_nvvm_rsqrt_approx_d>;
 
 // 1.0f / sqrt_approx -> rsqrt_approx
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt]>;
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt]>;
 // same for int_nvvm_sqrt_f when non-precision sqrt is requested
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
+def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 
-def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
+def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
          (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
+def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 //
diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll
index bd8d9a3..4c2cb75 100644
--- a/llvm/test/CodeGen/NVPTX/div.ll
+++ b/llvm/test/CodeGen/NVPTX/div.ll
@@ -24,3 +24,143 @@ define float @div_full(float %a, float %b) {
   %4 = call float @llvm.nvvm.div.full.ftz(float %3, float 4.0)
   ret float %4
 }
+
+define float @div_fast_rr(float %a, float %b) {
+; CHECK-LABEL: div_fast_rr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_rr_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [div_fast_rr_param_1];
+; CHECK-NEXT:    div.approx.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float %a, %b
+  ret float %t1
+}
+
+define float @div_fast_rr_ftz(float %a, float %b) #0 {
+; CHECK-LABEL: div_fast_rr_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_rr_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [div_fast_rr_ftz_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float %a, %b
+  ret float %t1
+}
+
+define float @div_fast_ri(float %a, float %b) {
+; CHECK-LABEL: div_fast_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_ri_param_0];
+; CHECK-NEXT:    mul.rn.f32 %f2, %f1, 0f3F000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float %a, 2.0
+  ret float %t1
+}
+
+define float @div_fast_ri_ftz(float %a, float %b) #0 {
+; CHECK-LABEL: div_fast_ri_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_ri_ftz_param_0];
+; CHECK-NEXT:    mul.rn.ftz.f32 %f2, %f1, 0f3F000000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float %a, 2.0
+  ret float %t1
+}
+
+define float @rcp_fast(float %a) {
+; CHECK-LABEL: rcp_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [rcp_fast_param_0];
+; CHECK-NEXT:    rcp.approx.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float 1.0, %a
+  ret float %t1
+}
+
+define float @rcp_fast_ftz(float %a) #0 {
+; CHECK-LABEL: rcp_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [rcp_fast_ftz_param_0];
+; CHECK-NEXT:    rcp.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %t1 = fdiv afn float 1.0, %a
+  ret float %t1
+}
+
+define float @div_fast_vec(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: div_fast_vec(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_vec_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [div_fast_vec_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [div_fast_vec_param_2];
+; CHECK-NEXT:    ld.param.b32 %f4, [div_fast_vec_param_3];
+; CHECK-NEXT:    div.approx.f32 %f5, %f2, %f4;
+; CHECK-NEXT:    div.approx.f32 %f6, %f1, %f3;
+; CHECK-NEXT:    add.rn.f32 %f7, %f6, %f5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NEXT:    ret;
+  %ins_a0 = insertelement <2 x float> poison, float %a, i32 0
+  %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1
+  %ins_b0 = insertelement <2 x float> poison, float %c, i32 0
+  %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1
+  %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1
+  %ext0 = extractelement <2 x float> %fdiv, i32 0
+  %ext1 = extractelement <2 x float> %fdiv, i32 1
+  %fadd = fadd float %ext0, %ext1
+  ret float %fadd
+}
+
+define float @div_fast_vec_ftz(float %a, float %b, float %c, float %d) #0 {
+; CHECK-LABEL: div_fast_vec_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %f<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %f1, [div_fast_vec_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [div_fast_vec_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [div_fast_vec_ftz_param_2];
+; CHECK-NEXT:    ld.param.b32 %f4, [div_fast_vec_ftz_param_3];
+; CHECK-NEXT:    div.approx.ftz.f32 %f5, %f2, %f4;
+; CHECK-NEXT:    div.approx.ftz.f32 %f6, %f1, %f3;
+; CHECK-NEXT:    add.rn.ftz.f32 %f7, %f6, %f5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f7;
+; CHECK-NEXT:    ret;
+  %ins_a0 = insertelement <2 x float> poison, float %a, i32 0
+  %ins_a1 = insertelement <2 x float> %ins_a0, float %b, i32 1
+  %ins_b0 = insertelement <2 x float> poison, float %c, i32 0
+  %ins_b1 = insertelement <2 x float> %ins_b0, float %d, i32 1
+  %fdiv = fdiv fast <2 x float> %ins_a1, %ins_b1
+  %ext0 = extractelement <2 x float> %fdiv, i32 0
+  %ext1 = extractelement <2 x float> %fdiv, i32 1
+  %fadd = fadd float %ext0, %ext1
+  ret float %fadd
+}
+
+attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }