diff options
author | Yingwei Zheng <dtcxzyw2333@gmail.com> | 2024-08-01 00:14:29 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-01 00:14:29 +0800 |
commit | b455edbc4566dca5a367122eadd0bae9058fbd7b (patch) | |
tree | a2d476c356ad7f010355b32ec092ea9467005830 | |
parent | 8b2688bd173e79392927bcaed91855e7c4db8eaa (diff) | |
download | llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.zip llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.gz llvm-b455edbc4566dca5a367122eadd0bae9058fbd7b.tar.bz2 |
[InstCombine] Recognize copysign idioms (#101324)
This patch folds `(bitcast (or (and (bitcast X to int), signmask), nneg
Y) to fp)` into `copysign((bitcast Y to fp), X)`. I found that this pattern
exists in some graphics applications/math libraries.
Alive2: https://alive2.llvm.org/ce/z/ggQZV2
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 19 | ||||
-rw-r--r-- | llvm/test/Transforms/InstCombine/bitcast.ll | 168 |
4 files changed, 200 insertions, 19 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 4323635..97ee845 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2666,6 +2666,27 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI, return RetVal; } +/// Fold (bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp) to +/// copysign((bitcast Y to fp), X) +static Value *foldCopySignIdioms(BitCastInst &CI, + InstCombiner::BuilderTy &Builder, + const SimplifyQuery &SQ) { + Value *X, *Y; + Type *FTy = CI.getType(); + if (!FTy->isFPOrFPVectorTy()) + return nullptr; + if (!match(&CI, m_ElementWiseBitCast(m_c_Or( + m_And(m_ElementWiseBitCast(m_Value(X)), m_SignMask()), + m_Value(Y))))) + return nullptr; + if (X->getType() != FTy) + return nullptr; + if (!isKnownNonNegative(Y, SQ)) + return nullptr; + + return Builder.CreateCopySign(Builder.CreateBitCast(Y, FTy), X); +} + Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { // If the operands are integer typed then apply the integer transforms, // otherwise just apply the common ones. 
@@ -2807,6 +2828,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (Instruction *I = foldBitCastSelect(CI, Builder)) return I; + if (Value *V = foldCopySignIdioms(CI, Builder, SQ.getWithInstruction(&CI))) + return replaceInstUsesWith(CI, V); + return commonCastTransforms(CI); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 6eb7a4a..ebbab5c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -634,11 +634,11 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) { ; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: v_log_f16_e64 v2, |v0| -; CHECK-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x7fff ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1 ; CHECK-NEXT: v_exp_f16_e32 v1, v1 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) @@ -669,9 +669,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { ; CHECK-NEXT: v_exp_f32_e32 v1, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; CHECK-NEXT: s_brev_b32 s4, 1 +; CHECK-NEXT: s_brev_b32 s4, -2 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 -; CHECK-NEXT: v_and_or_b32 v0, v0, s4, v1 +; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index ba1caf3..bf21ed6 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -375,11 +375,8 @@ declare float 
@_Z4pownfi(float, i32) ; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs) ; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx) -; GCN: %[[r0:.*]] = bitcast float %tmp to i32 -; GCN: %__pow_sign = and i32 %[[r0]], -2147483648 -; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]] -; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4 +; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp) +; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -435,11 +432,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>) ; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs) ; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx) -; GCN: %1 = bitcast half %x to i16 -; GCN: %__pow_sign = and i16 %1, -32768 -; GCN: %2 = bitcast half %__exp2 to i16 -; GCN: %3 = or disjoint i16 %__pow_sign, %2 -; GCN: %4 = bitcast i16 %3 to half +; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x) define half @test_pow_fast_f16__y_13(half %x) { %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0) ret half %powr @@ -450,11 +443,7 @@ define half @test_pow_fast_f16__y_13(half %x) { ; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs) ; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80> ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx) -; GCN: %1 = bitcast <2 x half> %x to <2 x i16> -; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768> -; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16> -; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2 -; GCN: %4 = bitcast <2 x i16> %3 to <2 x half> +; GCN: %1 = tail call <2 x 
half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %x) define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) { %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>) ret <2 x half> %powr diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 5599604..26047f2 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -711,3 +711,171 @@ define ptr @select_bitcast_unsized_pointer(i1 %c) { %s = select i1 %c, ptr @f1, ptr @f2 ret ptr %s } + +define float @copysign_idiom_constant(float %x) { +; CHECK-LABEL: @copysign_idiom_constant( +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, 1065353216 + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom(float %x, i32 %mag) { +; CHECK-LABEL: @copysign_idiom( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_commuted(float %x, i32 %magx) { +; CHECK-LABEL: @copysign_idiom_commuted( +; CHECK-NEXT: [[MAG:%.*]] = add i32 [[MAGX:%.*]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]]) +; 
CHECK-NEXT: ret float [[Y]] +; + %mag = add i32 %magx, -1 ; thwart complexity-based canonicalization + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %mag, %sign + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_abs(float %x, float %mag) { +; CHECK-LABEL: @copysign_idiom_abs( +; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[MAG:%.*]], float [[X:%.*]]) +; CHECK-NEXT: ret float [[Y]] +; + %abs = call float @llvm.fabs.f32(float %mag) + %absbits = bitcast float %abs to i32 + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %absbits + %y = bitcast i32 %res to float + ret float %y +} + +define double @copysign_idiom_f64(double %x, i64 %mag) { +; CHECK-LABEL: @copysign_idiom_f64( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MAG]] to double +; CHECK-NEXT: [[Y:%.*]] = call double @llvm.copysign.f64(double [[TMP1]], double [[X:%.*]]) +; CHECK-NEXT: ret double [[Y]] +; + %cond = icmp sgt i64 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast double %x to i64 + %sign = and i64 %bits, -9223372036854775808 + %res = or i64 %sign, %mag + %y = bitcast i64 %res to double + ret double %y +} + +define <2 x float> @copysign_idiom_vec(<2 x float> %x) { +; CHECK-LABEL: @copysign_idiom_vec( +; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> [[X:%.*]]) +; CHECK-NEXT: ret <2 x float> [[Y]] +; + %bits = bitcast <2 x float> %x to <2 x i32> + %sign = and <2 x i32> %bits, splat(i32 -2147483648) + %res = or <2 x i32> %sign, splat(i32 1065353216) + %y = bitcast <2 x i32> %res to <2 x float> + ret <2 x float> %y +} + +; negative tests + +define float @copysign_idiom_without_nneg(float %x, i32 %mag) { +; 
CHECK-LABEL: @copysign_idiom_without_nneg( +; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32 +; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483648 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG:%.*]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float +; CHECK-NEXT: ret float [[Y]] +; + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_not_signmask(float %x, i32 %mag) { +; CHECK-LABEL: @copysign_idiom_not_signmask( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32 +; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483647 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float +; CHECK-NEXT: ret float [[Y]] +; + %cond = icmp sgt i32 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast float %x to i32 + %sign = and i32 %bits, -2147483647 + %res = or i32 %sign, %mag + %y = bitcast i32 %res to float + ret float %y +} + +define float @copysign_idiom_constant_wrong_type1(<1 x i32> %x) { +; CHECK-LABEL: @copysign_idiom_constant_wrong_type1( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[X:%.*]], i64 0 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: ret float 1.000000e+00 +; + %bits = bitcast <1 x i32> %x to i32 + %cond = icmp sgt i32 %bits, -1 + call void @llvm.assume(i1 %cond) + + %sign = and i32 %bits, -2147483648 + %res = or i32 %sign, 1065353216 + %y = bitcast i32 %res to float + ret float %y +} + +define half @copysign_idiom_constant_wrong_type2(bfloat %x, i16 %mag) { +; CHECK-LABEL: @copysign_idiom_constant_wrong_type2( +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i16 [[MAG:%.*]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: 
[[BITS:%.*]] = bitcast bfloat [[X:%.*]] to i16 +; CHECK-NEXT: [[SIGN:%.*]] = and i16 [[BITS]], -32768 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i16 [[SIGN]], [[MAG]] +; CHECK-NEXT: [[Y:%.*]] = bitcast i16 [[RES]] to half +; CHECK-NEXT: ret half [[Y]] +; + %cond = icmp sgt i16 %mag, -1 + call void @llvm.assume(i1 %cond) + + %bits = bitcast bfloat %x to i16 + %sign = and i16 %bits, -32768 + %res = or i16 %sign, %mag + %y = bitcast i16 %res to half + ret half %y +} |