aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYingwei Zheng <dtcxzyw2333@gmail.com>2024-08-01 00:14:29 +0800
committerGitHub <noreply@github.com>2024-08-01 00:14:29 +0800
commitb455edbc4566dca5a367122eadd0bae9058fbd7b (patch)
treea2d476c356ad7f010355b32ec092ea9467005830
parent8b2688bd173e79392927bcaed91855e7c4db8eaa (diff)
downloadllvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.zip
llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.gz
llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.bz2
[InstCombine] Recognize copysign idioms (#101324)
This patch folds `(bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp)` into `copysign((bitcast Y to fp), X)`. I found that this pattern exists in some graphics applications/math libraries. Alive2: https://alive2.llvm.org/ce/z/ggQZV2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp24
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll19
-rw-r--r--llvm/test/Transforms/InstCombine/bitcast.ll168
4 files changed, 200 insertions, 19 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4323635..97ee845 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2666,6 +2666,27 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
return RetVal;
}
+/// Fold (bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp) to
+/// copysign((bitcast Y to fp), X)
+static Value *foldCopySignIdioms(BitCastInst &CI,
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery &SQ) {
+ Value *X, *Y;
+ Type *FTy = CI.getType();
+ if (!FTy->isFPOrFPVectorTy())
+ return nullptr;
+ if (!match(&CI, m_ElementWiseBitCast(m_c_Or(
+ m_And(m_ElementWiseBitCast(m_Value(X)), m_SignMask()),
+ m_Value(Y)))))
+ return nullptr;
+ if (X->getType() != FTy)
+ return nullptr;
+ if (!isKnownNonNegative(Y, SQ))
+ return nullptr;
+
+ return Builder.CreateCopySign(Builder.CreateBitCast(Y, FTy), X);
+}
+
Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
// If the operands are integer typed then apply the integer transforms,
// otherwise just apply the common ones.
@@ -2807,6 +2828,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
if (Instruction *I = foldBitCastSelect(CI, Builder))
return I;
+ if (Value *V = foldCopySignIdioms(CI, Builder, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
+
return commonCastTransforms(CI);
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 6eb7a4a..ebbab5c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -634,11 +634,11 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v2, |v0|
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1
; CHECK-NEXT: v_exp_f16_e32 v1, v1
-; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -669,9 +669,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; CHECK-NEXT: s_brev_b32 s4, 1
+; CHECK-NEXT: s_brev_b32 s4, -2
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
-; CHECK-NEXT: v_and_or_b32 v0, v0, s4, v1
+; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index ba1caf3..bf21ed6 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -375,11 +375,8 @@ declare float @_Z4pownfi(float, i32)
; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
-; GCN: %[[r0:.*]] = bitcast float %tmp to i32
-; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp)
+; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -435,11 +432,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
-; GCN: %1 = bitcast half %x to i16
-; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x)
define half @test_pow_fast_f16__y_13(half %x) {
%powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
ret half %powr
@@ -450,11 +443,7 @@ define half @test_pow_fast_f16__y_13(half %x) {
; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
-; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
-; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %1 = tail call <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %x)
define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
%powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
ret <2 x half> %powr
diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll
index 5599604..26047f2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast.ll
@@ -711,3 +711,171 @@ define ptr @select_bitcast_unsized_pointer(i1 %c) {
%s = select i1 %c, ptr @f1, ptr @f2
ret ptr %s
}
+
+define float @copysign_idiom_constant(float %x) {
+; CHECK-LABEL: @copysign_idiom_constant(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, 1065353216
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_commuted(float %x, i32 %magx) {
+; CHECK-LABEL: @copysign_idiom_commuted(
+; CHECK-NEXT: [[MAG:%.*]] = add i32 [[MAGX:%.*]], -1
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %mag = add i32 %magx, -1 ; thwart complexity-based canonicalization
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %mag, %sign
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_abs(float %x, float %mag) {
+; CHECK-LABEL: @copysign_idiom_abs(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[MAG:%.*]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %abs = call float @llvm.fabs.f32(float %mag)
+ %absbits = bitcast float %abs to i32
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %absbits
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define double @copysign_idiom_f64(double %x, i64 %mag) {
+; CHECK-LABEL: @copysign_idiom_f64(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MAG]] to double
+; CHECK-NEXT: [[Y:%.*]] = call double @llvm.copysign.f64(double [[TMP1]], double [[X:%.*]])
+; CHECK-NEXT: ret double [[Y]]
+;
+ %cond = icmp sgt i64 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast double %x to i64
+ %sign = and i64 %bits, -9223372036854775808
+ %res = or i64 %sign, %mag
+ %y = bitcast i64 %res to double
+ ret double %y
+}
+
+define <2 x float> @copysign_idiom_vec(<2 x float> %x) {
+; CHECK-LABEL: @copysign_idiom_vec(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> [[X:%.*]])
+; CHECK-NEXT: ret <2 x float> [[Y]]
+;
+ %bits = bitcast <2 x float> %x to <2 x i32>
+ %sign = and <2 x i32> %bits, splat(i32 -2147483648)
+ %res = or <2 x i32> %sign, splat(i32 1065353216)
+ %y = bitcast <2 x i32> %res to <2 x float>
+ ret <2 x float> %y
+}
+
+; negative tests
+
+define float @copysign_idiom_without_nneg(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_without_nneg(
+; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483648
+; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT: ret float [[Y]]
+;
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_not_signmask(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_not_signmask(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483647
+; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT: ret float [[Y]]
+;
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483647
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_constant_wrong_type1(<1 x i32> %x) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type1(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[X:%.*]], i64 0
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %bits = bitcast <1 x i32> %x to i32
+ %cond = icmp sgt i32 %bits, -1
+ call void @llvm.assume(i1 %cond)
+
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, 1065353216
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define half @copysign_idiom_constant_wrong_type2(bfloat %x, i16 %mag) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type2(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i16 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[BITS:%.*]] = bitcast bfloat [[X:%.*]] to i16
+; CHECK-NEXT: [[SIGN:%.*]] = and i16 [[BITS]], -32768
+; CHECK-NEXT: [[RES:%.*]] = or disjoint i16 [[SIGN]], [[MAG]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i16 [[RES]] to half
+; CHECK-NEXT: ret half [[Y]]
+;
+ %cond = icmp sgt i16 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast bfloat %x to i16
+ %sign = and i16 %bits, -32768
+ %res = or i16 %sign, %mag
+ %y = bitcast i16 %res to half
+ ret half %y
+}