aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2020-05-21 21:04:06 -0400
committerMatt Arsenault <Matthew.Arsenault@amd.com>2020-05-23 13:28:36 -0400
commit27fe841aa650a24fd98da2fb6c6eb2fca806a63f (patch)
tree9a44f6de2a4dde6b48fd60a54393f019a565f579
parent1d96dca9491e3d75c11c3cd1acff5fcda8c2f613 (diff)
downloadllvm-27fe841aa650a24fd98da2fb6c6eb2fca806a63f.zip
llvm-27fe841aa650a24fd98da2fb6c6eb2fca806a63f.tar.gz
llvm-27fe841aa650a24fd98da2fb6c6eb2fca806a63f.tar.bz2
AMDGPU: Refine rcp/rsq intrinsic folding for modern FP rules
We have to assume undef could be an snan, which would need quieting, so returning qnan is safer than undef. Also consider strictfp, and don't care whether the result was rounded.
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp31
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll27
2 files changed, 38 insertions, 20 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 89ffe065..118013a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3500,18 +3500,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src = II->getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
- if (isa<UndefValue>(Src))
- return replaceInstUsesWith(CI, Src);
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II->getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return replaceInstUsesWith(CI, QNaN);
+ }
+
+ if (II->isStrictFP())
+ break;
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1);
- APFloat::opStatus Status = Val.divide(ArgVal,
- APFloat::rmNearestTiesToEven);
- // Only do this if it was exact and therefore not dependent on the
- // rounding mode.
- if (Status == APFloat::opOK)
- return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
+ Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
+
+ // This is more precise than the instruction may give.
+ //
+ // TODO: The instruction always flushes denormal results (except for f16),
+ // should this also?
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
}
break;
@@ -3520,8 +3527,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src = II->getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
- if (isa<UndefValue>(Src))
- return replaceInstUsesWith(CI, Src);
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II->getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return replaceInstUsesWith(CI, QNaN);
+ }
+
break;
}
case Intrinsic::amdgcn_frexp_mant:
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 032ca08..a3a99e5 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -10,7 +10,7 @@ declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone
define float @test_constant_fold_rcp_f32_undef() nounwind {
; CHECK-LABEL: @test_constant_fold_rcp_f32_undef(
-; CHECK-NEXT: ret float undef
+; CHECK-NEXT: ret float 0x7FF8000000000000
;
%val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone
ret float %val
@@ -50,8 +50,7 @@ define double @test_constant_fold_rcp_f64_half() nounwind {
define float @test_constant_fold_rcp_f32_43() nounwind {
; CHECK-LABEL: @test_constant_fold_rcp_f32_43(
-; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01)
-; CHECK-NEXT: ret float [[VAL]]
+; CHECK-NEXT: ret float 0x3F97D05F40000000
;
%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone
ret float %val
@@ -59,13 +58,21 @@ define float @test_constant_fold_rcp_f32_43() nounwind {
define double @test_constant_fold_rcp_f64_43() nounwind {
; CHECK-LABEL: @test_constant_fold_rcp_f64_43(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01)
-; CHECK-NEXT: ret double [[VAL]]
+; CHECK-NEXT: ret double 0x3F97D05F417D05F4
;
%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone
ret double %val
}
+define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
+; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
+; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #7
+; CHECK-NEXT: ret float [[VAL]]
+;
+ %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
+ ret float %val
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.rsq
; --------------------------------------------------------------------
@@ -74,7 +81,7 @@ declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone
define float @test_constant_fold_rsq_f32_undef() nounwind {
; CHECK-LABEL: @test_constant_fold_rsq_f32_undef(
-; CHECK-NEXT: ret float undef
+; CHECK-NEXT: ret float 0x7FF8000000000000
;
%val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone
ret float %val
@@ -2387,8 +2394,8 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent
define i64 @ballot_nocombine_64(i1 %i) {
; CHECK-LABEL: @ballot_nocombine_64(
-; CHECK-NEXT: %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)
-; CHECK-NEXT: ret i64 %b
+; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]])
+; CHECK-NEXT: ret i64 [[B]]
;
%b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)
ret i64 %b
@@ -2413,8 +2420,8 @@ define i64 @ballot_one_64() {
define i32 @ballot_nocombine_32(i1 %i) {
; CHECK-LABEL: @ballot_nocombine_32(
-; CHECK-NEXT: %b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
-; CHECK-NEXT: ret i32 %b
+; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]])
+; CHECK-NEXT: ret i32 [[B]]
;
%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
ret i32 %b