diff options
author | Sanjay Patel <spatel@rotateright.com> | 2020-02-13 16:08:15 -0500 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2020-02-13 17:23:27 -0500 |
commit | 19b62b79db1bb154b40e8baba9a28ab8aa935b6b (patch) | |
tree | 539aacfb0a34477e489d5342113738c6531471e3 /llvm/lib/Transforms | |
parent | fe36127982e0a5889cc0653718e62ba6acccf7c4 (diff) | |
download | llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.zip llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.gz llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.bz2 |
[VectorCombine] try to form vector binop to eliminate an extract element
binop (extelt X, C), (extelt Y, C) --> extelt (binop X, Y), C
This is a transform that has been considered for canonicalization (instcombine)
in the past because it reduces instruction count. But as shown in the x86 tests,
it's impossible to know if it's profitable without a cost model. There are many
potential target constraints to consider.
We have implemented similar transforms in the backend (DAGCombiner and
target-specific), but I don't think we have this exact fold there either (and if
we did it in SDAG, it wouldn't work across blocks).
Note: this patch was intended to handle the more general case where the extract
indexes do not match, but it got too big, so I scaled it back to this pattern
for now.
Differential Revision: https://reviews.llvm.org/D74495
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 83 |
1 files changed, 82 insertions, 1 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 2c69878..a70d962 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -30,6 +31,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "vector-combine" STATISTIC(NumVecCmp, "Number of vector compares formed"); +STATISTIC(NumVecBO, "Number of vector binops formed"); static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) { // Match a cmp with extracted vector operands. @@ -76,6 +78,85 @@ static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) { return true; } +/// Try to reduce extract element costs by converting scalar binops to vector +/// binops followed by extract. +static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) { + // It is not safe to transform things like div, urem, etc. because we may + // create undefined behavior when executing those on unknown vector elements. + if (!isSafeToSpeculativelyExecute(&I)) + return false; + + // Match a scalar binop with extracted vector operands: + // bo (extelt X, C0), (extelt Y, C1) + Instruction *Ext0, *Ext1; + if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1)))) + return false; + + Value *X, *Y; + uint64_t C0, C1; + if (!match(Ext0, m_ExtractElement(m_Value(X), m_ConstantInt(C0))) || + !match(Ext1, m_ExtractElement(m_Value(Y), m_ConstantInt(C1))) || + X->getType() != Y->getType()) + return false; + + // Check if using a vector binop would be cheaper. + Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode(); + Type *ScalarTy = I.getType(); + Type *VecTy = X->getType(); + int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy); + int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy); + int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, + VecTy, C0); + int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, + VecTy, C1); + + // Handle a special case - if the extract indexes are the same, the + // replacement sequence does not require a shuffle. Unless the vector binop is + // much more expensive than the scalar binop, this eliminates an extract. + // Extra uses of the extracts mean that we include those costs in the + // vector total because those instructions will not be eliminated. + if (C0 == C1) { + assert(Extract0Cost == Extract1Cost && "Different costs for same extract?"); + int ExtractCost = Extract0Cost; + if (X != Y) { + int ScalarCost = ExtractCost + ExtractCost + ScalarBOCost; + int VecCost = VecBOCost + ExtractCost + + !Ext0->hasOneUse() * ExtractCost + + !Ext1->hasOneUse() * ExtractCost; + if (ScalarCost <= VecCost) + return false; + } else { + // Handle an extra-special case. If the 2 binop operands are identical, + // adjust the formulas to account for that: + // bo (extelt X, C), (extelt X, C) --> extelt (bo X, X), C + // The extra use charge allows for either the CSE'd pattern or an + // unoptimized form with identical values. + bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) + : !Ext0->hasOneUse() || !Ext1->hasOneUse(); + int ScalarCost = ExtractCost + ScalarBOCost; + int VecCost = VecBOCost + ExtractCost + HasUseTax * ExtractCost; + if (ScalarCost <= VecCost) + return false; + } + + // bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C + ++NumVecBO; + IRBuilder<> Builder(&I); + Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y); + if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) { + // All IR flags are safe to back-propagate because any potential poison + // created in unused vector elements is discarded by the extract. + VecBOInst->copyIRFlags(&I); + } + Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1)); + I.replaceAllUsesWith(Extract); + return true; + } + + // TODO: Handle C0 != C1 by shuffling 1 of the operands. + return false; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. static bool runImpl(Function &F, const TargetTransformInfo &TTI, @@ -92,7 +173,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, // iteratively in this loop rather than waiting until the end. for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldExtractCmp(I, TTI); - // TODO: More transforms go here. + MadeChange |= foldExtractBinop(I, TTI); } } |