author     Simon Pilgrim <RKSimon@users.noreply.github.com>    2024-02-19 11:32:23 +0000
committer  GitHub <noreply@github.com>                         2024-02-19 11:32:23 +0000
commit     769c22f25b81b74e4da7871d4e552584605caef1 (patch)
tree       9aa10ee22644a752acead10c640a15d2ca1fadf9
parent     21431e0f94c95703bd76e9ec5d2f0b39b8509680 (diff)
[VectorCombine] Fold reduce(trunc(x)) -> trunc(reduce(x)) iff cost effective (#81852)
Vector truncations can be pretty expensive, especially on X86, whilst scalar truncations are often free.
If performing the add/mul/and/or/xor reduction on the pre-truncated type is cheap enough, avoid the vector truncation entirely.
Fixes https://github.com/llvm/llvm-project/issues/81469
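
For illustration, here is a minimal before/after sketch of the fold in LLVM IR, mirroring the reduce_add_trunc_v8i64_i32 case from the test file added below (the @before/@after function names are placeholders, not part of the patch):

; Before the fold: the whole <8 x i64> vector is truncated, then reduced.
define i32 @before(<8 x i64> %a0) {
  %tr  = trunc <8 x i64> %a0 to <8 x i32>
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr)
  ret i32 %red
}
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

; After the fold (when the wide reduction is the cheaper option): reduce the
; original <8 x i64> first, then truncate the single scalar result.
define i32 @after(<8 x i64> %a0) {
  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0)
  %tr  = trunc i64 %red to i32
  ret i32 %tr
}
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)

As the test's check prefixes show, generic x86-64 targets take the folded form, while AVX512 (x86-64-v4) keeps the original code because its v8i64 -> v8i32 truncation is already cheap.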
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp                      |  57
-rw-r--r--  llvm/test/Transforms/VectorCombine/X86/reduction-of-truncations.ll   | 123
2 files changed, 180 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f18711b..dc669e3 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <numeric>
 #include <queue>
 
@@ -111,6 +112,7 @@ private:
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldTruncFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
 
   void replaceValue(Value &Old, Value &New) {
@@ -1526,6 +1528,60 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+/// Determine if its more efficient to fold:
+/// reduce(trunc(x)) -> trunc(reduce(x)).
+bool VectorCombine::foldTruncFromReductions(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  if (!II)
+    return false;
+
+  Intrinsic::ID IID = II->getIntrinsicID();
+  switch (IID) {
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
+  Value *ReductionSrc = I.getOperand(0);
+
+  Value *TruncSrc;
+  if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(TruncSrc)))))
+    return false;
+
+  auto *Trunc = cast<CastInst>(ReductionSrc);
+  auto *TruncSrcTy = cast<VectorType>(TruncSrc->getType());
+  auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
+  Type *ResultTy = I.getType();
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getCastInstrCost(Instruction::Trunc, ReductionSrcTy, TruncSrcTy,
+                           TTI::CastContextHint::None, CostKind, Trunc) +
+      TTI.getArithmeticReductionCost(ReductionOpc, ReductionSrcTy, std::nullopt,
+                                     CostKind);
+  InstructionCost NewCost =
+      TTI.getArithmeticReductionCost(ReductionOpc, TruncSrcTy, std::nullopt,
+                                     CostKind) +
+      TTI.getCastInstrCost(Instruction::Trunc, ResultTy,
+                           ReductionSrcTy->getScalarType(),
+                           TTI::CastContextHint::None, CostKind);
+
+  if (OldCost <= NewCost || !NewCost.isValid())
+    return false;
+
+  Value *NewReduction = Builder.CreateIntrinsic(
+      TruncSrcTy->getScalarType(), II->getIntrinsicID(), {TruncSrc});
+  Value *NewTruncation = Builder.CreateTrunc(NewReduction, ResultTy);
+  replaceValue(I, *NewTruncation);
+  return true;
+}
+
 /// This method looks for groups of shuffles acting on binops, of the form:
 ///  %x = shuffle ...
 ///  %y = shuffle ...
@@ -1917,6 +1973,7 @@ bool VectorCombine::run() {
     switch (Opcode) {
     case Instruction::Call:
       MadeChange |= foldShuffleFromReductions(I);
+      MadeChange |= foldTruncFromReductions(I);
       break;
     case Instruction::ICmp:
     case Instruction::FCmp:
diff --git a/llvm/test/Transforms/VectorCombine/X86/reduction-of-truncations.ll b/llvm/test/Transforms/VectorCombine/X86/reduction-of-truncations.ll
new file mode 100644
index 0000000..d5dd4cf
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/reduction-of-truncations.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S --passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: opt < %s -S --passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: opt < %s -S --passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: opt < %s -S --passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+
+;
+; Fold reduce(trunc(X)) -> trunc(reduce(X)) if more cost efficient
+;
+
+; Cheap AVX512 v8i64 -> v8i32 truncation
+define i32 @reduce_add_trunc_v8i64_i32(<8 x i64> %a0) {
+; X64-LABEL: @reduce_add_trunc_v8i64_i32(
+; X64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]])
+; X64-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i32
+; X64-NEXT:    ret i32 [[RED]]
+;
+; AVX512-LABEL: @reduce_add_trunc_v8i64_i32(
+; AVX512-NEXT:    [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TR]])
+; AVX512-NEXT:    ret i32 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i32>
+  %red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr)
+  ret i32 %red
+}
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+
+; No legal vXi8 multiplication so vXi16 is always cheaper
+define i8 @reduce_mul_trunc_v16i16_i8(<16 x i16> %a0) {
+; CHECK-LABEL: @reduce_mul_trunc_v16i16_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i16 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <16 x i16> %a0 to <16 x i8>
+  %red = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %tr)
+  ret i8 %red
+}
+declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
+
+define i8 @reduce_or_trunc_v8i32_i8(<8 x i32> %a0) {
+; CHECK-LABEL: @reduce_or_trunc_v8i32_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i32 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <8 x i32> %a0 to <8 x i8>
+  %red = tail call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %tr)
+  ret i8 %red
+}
+declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
+
+define i8 @reduce_xor_trunc_v16i64_i8(<16 x i64> %a0) {
+; CHECK-LABEL: @reduce_xor_trunc_v16i64_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <16 x i64> %a0 to <16 x i8>
+  %red = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %tr)
+  ret i8 %red
+}
+declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
+
+; Truncation source has other uses - OK to truncate reduction
+define i16 @reduce_and_trunc_v16i64_i16(<16 x i64> %a0) {
+; CHECK-LABEL: @reduce_and_trunc_v16i64_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> [[A0:%.*]])
+; CHECK-NEXT:    [[RED:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    call void @use_v16i64(<16 x i64> [[A0]])
+; CHECK-NEXT:    ret i16 [[RED]]
+;
+  %tr = trunc <16 x i64> %a0 to <16 x i16>
+  %red = tail call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %tr)
+  call void @use_v16i64(<16 x i64> %a0)
+  ret i16 %red
+}
+declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
+
+; Negative Test: vXi16 multiply is much cheaper than vXi64
+define i16 @reduce_mul_trunc_v8i64_i16(<8 x i64> %a0) {
+; CHECK-LABEL: @reduce_mul_trunc_v8i64_i16(
+; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RED:%.*]] = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[TR]])
+; CHECK-NEXT:    ret i16 [[RED]]
+;
+  %tr = trunc <8 x i64> %a0 to <8 x i16>
+  %red = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %tr)
+  ret i16 %red
+}
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+
+; Negative Test: min/max reductions can't use pre-truncated types.
+define i8 @reduce_smin_trunc_v16i16_i8(<16 x i16> %a0) {
+; CHECK-LABEL: @reduce_smin_trunc_v16i16_i8(
+; CHECK-NEXT:    [[TR:%.*]] = trunc <16 x i16> [[A0:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[RED:%.*]] = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[TR]])
+; CHECK-NEXT:    ret i8 [[RED]]
+;
+  %tr = trunc <16 x i16> %a0 to <16 x i8>
+  %red = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %tr)
+  ret i8 %red
+}
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+
+; Negative Test: Truncation has other uses.
+define i16 @reduce_and_trunc_v16i64_i16_multiuse(<16 x i64> %a0) {
+; CHECK-LABEL: @reduce_and_trunc_v16i64_i16_multiuse(
+; CHECK-NEXT:    [[TR:%.*]] = trunc <16 x i64> [[A0:%.*]] to <16 x i16>
+; CHECK-NEXT:    [[RED:%.*]] = tail call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> [[TR]])
+; CHECK-NEXT:    call void @use_v16i16(<16 x i16> [[TR]])
+; CHECK-NEXT:    ret i16 [[RED]]
+;
+  %tr = trunc <16 x i64> %a0 to <16 x i16>
+  %red = tail call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %tr)
+  call void @use_v16i16(<16 x i16> %tr)
+  ret i16 %red
+}
+
+declare void @use_v16i64(<16 x i64>)
+declare void @use_v16i16(<16 x i16>)