diff options
author | Yingwei Zheng <dtcxzyw2333@gmail.com> | 2024-08-01 00:14:29 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-01 00:14:29 +0800 |
commit | b455edbc4566dca5a367122eadd0bae9058fbd7b (patch) | |
tree | a2d476c356ad7f010355b32ec092ea9467005830 | |
parent | 8b2688bd173e79392927bcaed91855e7c4db8eaa (diff) | |
download | llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.zip llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.gz llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.bz2 |
[InstCombine] Recognize copysign idioms (#101324)
This patch folds `(bitcast (or (and (bitcast X to int), signmask), nneg
Y) to fp)` into `copysign((bitcast Y to fp), X)`. I found that this pattern
exists in some graphics applications/math libraries.
Alive2: https://alive2.llvm.org/ce/z/ggQZV2
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 19 | ||||
-rw-r--r-- | llvm/test/Transforms/InstCombine/bitcast.ll | 168 |
4 files changed, 200 insertions, 19 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 4323635..97ee845 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2666,6 +2666,27 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI, return RetVal; } +/// Fold (bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp) to +/// copysign((bitcast Y to fp), X) +static Value *foldCopySignIdioms(BitCastInst &CI, + InstCombiner::BuilderTy &Builder, + const SimplifyQuery &SQ) { + Value *X, *Y; + Type *FTy = CI.getType(); + if (!FTy->isFPOrFPVectorTy()) + return nullptr; + if (!match(&CI, m_ElementWiseBitCast(m_c_Or( + m_And(m_ElementWiseBitCast(m_Value(X)), m_SignMask()), + m_Value(Y))))) + return nullptr; + if (X->getType() != FTy) + return nullptr; + if (!isKnownNonNegative(Y, SQ)) + return nullptr; + + return Builder.CreateCopySign(Builder.CreateBitCast(Y, FTy), X); +} + Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { // If the operands are integer typed then apply the integer transforms, // otherwise just apply the common ones. 
@@ -2807,6 +2828,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (Instruction *I = foldBitCastSelect(CI, Builder)) return I; + if (Value *V = foldCopySignIdioms(CI, Builder, SQ.getWithInstruction(&CI))) + return replaceInstUsesWith(CI, V); + return commonCastTransforms(CI); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 6eb7a4a..ebbab5c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -634,11 +634,11 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) { ; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: v_log_f16_e64 v2, |v0| -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x7fff ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1 ; CHECK-NEXT: v_exp_f16_e32 v1, v1 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) @@ -669,9 +669,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { ; CHECK-NEXT: v_exp_f32_e32 v1, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; CHECK-NEXT: s_brev_b32 s4, 1 +; CHECK-NEXT: s_brev_b32 s4, -2 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 -; CHECK-NEXT: v_and_or_b32 v0, v0, s4, v1 +; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index ba1caf3..bf21ed6 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -375,11 +375,8 @@ declare float 
@_Z4pownfi(float, i32) ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs) ; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) -; GCN: %[[r0:.*]] = bitcast float %tmp to i32 -; GCN: %__pow_sign = and i32 %[[r0]], -2147483648 -; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]] -; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4 +; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp) +; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -435,11 +432,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>) ; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs) ; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx) -; GCN: %1 = bitcast half %x to i16 -; GCN: %__pow_sign = and i16 %1, -32768 -; GCN: %2 = bitcast half %__exp2 to i16 -; GCN: %3 = or disjoint i16 %__pow_sign, %2 -; GCN: %4 = bitcast i16 %3 to half +; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x) define half @test_pow_fast_f16__y_13(half %x) { %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0) ret half %powr @@ -450,11 +443,7 @@ define half @test_pow_fast_f16__y_13(half %x) { ; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs) ; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80> ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx) -; GCN: %1 = bitcast <2 x half> %x to <2 x i16> -; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768> -; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16> -; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2 -; GCN: %4 = bitcast <2 x i16> %3 to <2 x half> +; GCN: %1 = tail call <2 x 
half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %x) define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) { %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>) ret <2 x half> %powr diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 5599604..26047f2 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -711,3 +711,171 @@ define ptr @select_bitcast_unsized_pointer(i1 %c) { %s = select i1 %c, ptr @f1, ptr @f2 ret ptr %s } + +define float @copysign_idiom_constant(float %x) { +; CHECK-LABEL: @copysign_idiom_constant( +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, 1065353216 + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom(float %x, i32 %mag) { +; CHECK-LABEL: @copysign_idiom( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_commuted(float %x, i32 %magx) { +; CHECK-LABEL: @copysign_idiom_commuted( +; CHECK-NEXT: [[MAG:%.*]] = add i32 [[MAGX:%.*]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]]) +; 
CHECK-NEXT: ret float [[Y]] +; + %mag = add i32 %magx, -1 ; thwart complexity-based canonicalization + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %mag, %sign + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_abs(float %x, float %mag) { +; CHECK-LABEL: @copysign_idiom_abs( +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[MAG:%.*]], float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %abs = call float @llvm.fabs.f32(float %mag) + %absbits = bitcast float %abs to i32 + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %absbits + %y = bitcast i32 %res to float + ret float %y +} + +define double @copysign_idiom_f64(double %x, i64 %mag) { +; CHECK-LABEL: @copysign_idiom_f64( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MAG]] to double +; CHECK-NEXT: [[Y:%.*]] = call double @llvm.copysign.f64(double [[TMP1]], double [[X:%.*]]) +; CHECK-NEXT: ret double [[Y]] +; + %cond = icmp sgt i64 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast double %x to i64 + %sign = and i64 %bits, -9223372036854775808 + %res = or i64 %sign, %mag + %y = bitcast i64 %res to double + ret double %y +} + +define <2 x float> @copysign_idiom_vec(<2 x float> %x) { +; CHECK-LABEL: @copysign_idiom_vec( +; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> [[X:%.*]]) +; CHECK-NEXT: ret <2 x float> [[Y]] +; + %bits = bitcast <2 x float> %x to <2 x i32> + %sign = and <2 x i32> %bits, splat(i32 -2147483648) + %res = or <2 x i32> %sign, splat(i32 1065353216) + %y = bitcast <2 x i32> %res to <2 x float> + ret <2 x float> %y +} + +; negative tests + +define float @copysign_idiom_without_nneg(float %x, i32 %mag) { +; 
CHECK-LABEL: @copysign_idiom_without_nneg( +; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32 +; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483648 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG:%.*]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float +; CHECK-NEXT: ret float [[Y]] +; + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_not_signmask(float %x, i32 %mag) { +; CHECK-LABEL: @copysign_idiom_not_signmask( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32 +; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483647 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float +; CHECK-NEXT: ret float [[Y]] +; + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483647 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_constant_wrong_type1(<1 x i32> %x) { +; CHECK-LABEL: @copysign_idiom_constant_wrong_type1( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[X:%.*]], i64 0 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: ret float 1.000000e+00 +; + %bits = bitcast <1 x i32> %x to i32 + %cond = icmp sgt i32 %bits, -1 + call void @llvm.assume(i1 %cond) + + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, 1065353216 + %y = bitcast i32 %res to float + ret float %y +} + +define half @copysign_idiom_constant_wrong_type2(bfloat %x, i16 %mag) { +; CHECK-LABEL: @copysign_idiom_constant_wrong_type2( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i16 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: 
[[BITS:%.*]] = bitcast bfloat [[X:%.*]] to i16 +; CHECK-NEXT: [[SIGN:%.*]] = and i16 [[BITS]], -32768 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i16 [[SIGN]], [[MAG]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i16 [[RES]] to half +; CHECK-NEXT: ret half [[Y]] +; + %cond = icmp sgt i16 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast bfloat %x to i16 + %sign = and i16 %bits, -32768 + %res = or i16 %sign, %mag + %y = bitcast i16 %res to half + ret half %y +} |