Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 4
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 18
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp | 30
-rw-r--r--  llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp | 50
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 43
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 23
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 22
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 118
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 17
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 70
16 files changed, 324 insertions(+), 119 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d1ca0a6..59e103cd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -880,11 +880,11 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
// zext(bool) + C -> bool ? C + 1 : C
if (match(Op0, m_ZExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
- return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
+ return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1);
// sext(bool) + C -> bool ? C - 1 : C
if (match(Op0, m_SExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
- return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
+ return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1);
// ~X + C --> (C-1) - X
if (match(Op0, m_Not(m_Value(X)))) {
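The two folds above rely on simple i1-extension identities; a quick standalone check in plain C++ (values illustrative, not part of the patch source):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 41;
  for (bool B : {false, true}) {
    int32_t ZExt = B ? 1 : 0;  // zext i1 -> i32
    int32_t SExt = B ? -1 : 0; // sext i1 -> i32 (true becomes -1)
    assert(ZExt + C == (B ? C + 1 : C)); // zext(bool) + C -> bool ? C+1 : C
    assert(SExt + C == (B ? C - 1 : C)); // sext(bool) + C -> bool ? C-1 : C
  }
  return 0;
}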
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c13c6cc..cf6d0ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -64,6 +64,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
@@ -3781,6 +3782,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(CI, Res);
}
}
+
+ // vector.reduce.add.vNiM(splat(%x)) -> mul(%x, N)
+ if (Value *Splat = getSplatValue(Arg)) {
+ ElementCount VecToReduceCount =
+ cast<VectorType>(Arg->getType())->getElementCount();
+ if (VecToReduceCount.isFixed()) {
+ unsigned VectorSize = VecToReduceCount.getFixedValue();
+ return BinaryOperator::CreateMul(
+ Splat, ConstantInt::get(Splat->getType(), VectorSize));
+ }
+ }
}
[[fallthrough]];
}
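The new fold turns a reduction over a splat into a single multiply; a minimal standalone sketch of the equivalence it relies on for a fixed element count N (plain C++ with small illustrative values, not LLVM API code):

#include <cassert>
#include <cstdint>

// Sum of N copies of X, i.e. vector.reduce.add over splat(X) with N lanes.
static int32_t reduceAddSplat(int32_t X, unsigned N) {
  int32_t Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += X;
  return Sum;
}

int main() {
  // The combine rewrites the reduction as mul(X, N).
  assert(reduceAddSplat(7, 4) == 7 * 4);
  assert(reduceAddSplat(-3, 8) == -3 * 8);
  return 0;
}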
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a979c1..4f94aa2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -23,6 +23,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
@@ -62,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
public InstVisitor<InstCombinerImpl, Instruction *> {
public:
InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder,
- bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
+ Function &F, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
ProfileSummaryInfo *PSI, const DataLayout &DL,
ReversePostOrderTraversal<BasicBlock *> &RPOT)
- : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
- BFI, BPI, PSI, DL, RPOT) {}
+ : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI,
+ PSI, DL, RPOT) {}
virtual ~InstCombinerImpl() = default;
@@ -469,6 +470,17 @@ private:
Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
unsigned Depth = 0);
+ SelectInst *createSelectInst(Value *C, Value *S1, Value *S2,
+ const Twine &NameStr = "",
+ InsertPosition InsertBefore = nullptr,
+ Instruction *MDFrom = nullptr) {
+ SelectInst *SI =
+ SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom);
+ if (!MDFrom)
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE);
+ return SI;
+ }
+
public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 550f095..d457e0c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1253,7 +1253,7 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
// shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1);
- return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+ return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty));
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f0ddd5c..8fbaf68 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1735,7 +1735,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
Constant *Zero = ConstantInt::getNullValue(BO.getType());
Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C);
Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C);
- return SelectInst::Create(X, TVal, FVal);
+ return createSelectInst(X, TVal, FVal);
}
static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
@@ -5934,8 +5934,8 @@ static bool combineInstructionsOverFunction(
LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
- ORE, BFI, BPI, PSI, DL, RPOT);
+ InstCombinerImpl IC(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI,
+ BPI, PSI, DL, RPOT);
IC.MaxArraySizeForCombine = MaxArraySize;
bool MadeChangeInThisIteration = IC.prepareWorklist(F);
MadeChangeInThisIteration |= IC.run();
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e5bf2d1..d842275 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -35,6 +35,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation/CFGMST.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -92,8 +93,10 @@ class GCOVFunction;
class GCOVProfiler {
public:
- GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+ GCOVProfiler()
+ : GCOVProfiler(GCOVOptions::getDefault(), *vfs::getRealFileSystem()) {}
+ GCOVProfiler(const GCOVOptions &Opts, vfs::FileSystem &VFS)
+ : Options(Opts), VFS(VFS) {}
bool
runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
@@ -110,6 +113,7 @@ public:
os->write_zeros(4 - s.size() % 4);
}
void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+ vfs::FileSystem &getVirtualFileSystem() const { return VFS; }
private:
// Create the .gcno files for the Module based on DebugInfo.
@@ -166,6 +170,7 @@ private:
std::vector<Regex> ExcludeRe;
DenseSet<const BasicBlock *> ExecBlocks;
StringMap<bool> InstrumentedFiles;
+ vfs::FileSystem &VFS;
};
struct BBInfo {
@@ -214,10 +219,10 @@ static StringRef getFunctionName(const DISubprogram *SP) {
/// Prefer relative paths in the coverage notes. Clang also may split
/// up absolute paths into a directory and filename component. When
/// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DIScope *SP) {
+static SmallString<128> getFilename(const DIScope *SP, vfs::FileSystem &VFS) {
SmallString<128> Path;
StringRef RelPath = SP->getFilename();
- if (sys::fs::exists(RelPath))
+ if (VFS.exists(RelPath))
Path = RelPath;
else
sys::path::append(Path, SP->getDirectory(), SP->getFilename());
@@ -357,7 +362,7 @@ namespace {
void writeOut(uint32_t CfgChecksum) {
write(GCOV_TAG_FUNCTION);
- SmallString<128> Filename = getFilename(SP);
+ SmallString<128> Filename = getFilename(SP, P->getVirtualFileSystem());
uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
BlockLen += 1 + wordsOfString(Filename) + 4;
@@ -455,7 +460,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
if (FilterRe.empty() && ExcludeRe.empty()) {
return true;
}
- SmallString<128> Filename = getFilename(F.getSubprogram());
+ SmallString<128> Filename = getFilename(F.getSubprogram(), VFS);
auto It = InstrumentedFiles.find(Filename);
if (It != InstrumentedFiles.end()) {
return It->second;
@@ -467,7 +472,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
// Path can be
// /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
// such a case we must get the real_path.
- if (sys::fs::real_path(Filename, RealPath)) {
+ if (VFS.getRealPath(Filename, RealPath)) {
// real_path can fail with path like "foo.c".
RealFilename = Filename;
} else {
@@ -524,9 +529,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
SmallString<128> Filename = CU->getFilename();
sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
StringRef FName = sys::path::filename(Filename);
- SmallString<128> CurPath;
- if (sys::fs::current_path(CurPath))
+ ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory();
+ if (!CWD)
return std::string(FName);
+ SmallString<128> CurPath{*CWD};
sys::path::append(CurPath, FName);
return std::string(CurPath);
}
@@ -554,7 +560,7 @@ bool GCOVProfiler::runOnModule(
PreservedAnalyses GCOVProfilerPass::run(Module &M,
ModuleAnalysisManager &AM) {
- GCOVProfiler Profiler(GCOVOpts);
+ GCOVProfiler Profiler(GCOVOpts, *VFS);
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
@@ -789,7 +795,7 @@ bool GCOVProfiler::emitProfileNotes(
// Add the function line number to the lines of the entry block
// to have a counter for the function definition.
uint32_t Line = SP->getLine();
- auto Filename = getFilename(SP);
+ auto Filename = getFilename(SP, VFS);
BranchProbabilityInfo *BPI = GetBPI(F);
BlockFrequencyInfo *BFI = GetBFI(F);
@@ -881,7 +887,7 @@ bool GCOVProfiler::emitProfileNotes(
if (SP != getDISubprogram(Scope))
continue;
- GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope()));
+ GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope(), VFS));
Lines.addLine(Loc.getLine());
}
Line = 0;
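The profiler now goes through llvm::vfs::FileSystem instead of llvm::sys::fs; a hedged sketch of the three entry points the patch switches to, using the real filesystem that the default constructor binds to (file names are illustrative):

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>
using namespace llvm;

int main() {
  vfs::FileSystem &VFS = *vfs::getRealFileSystem();

  // VFS.exists(Path) replaces sys::fs::exists(Path) in getFilename().
  bool HaveRel = VFS.exists("foo.gcno");

  // VFS.getRealPath(...) replaces sys::fs::real_path(...) in
  // isFunctionInstrumented().
  SmallString<128> Real;
  std::error_code RealEC = VFS.getRealPath("foo.gcno", Real);

  // VFS.getCurrentWorkingDirectory() replaces sys::fs::current_path(...) in
  // mangleName().
  ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory();

  outs() << "exists: " << HaveRel << ", real_path ok: " << !RealEC
         << ", cwd ok: " << static_cast<bool>(CWD) << "\n";
  return 0;
}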
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index c215228..89980d5 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -17,13 +18,48 @@ using namespace llvm;
using namespace llvm::PatternMatch;
static bool affectedValuesAreEphemeral(ArrayRef<Value *> Affected) {
- // If all the affected uses have only one use (part of the assume), then
- // the assume does not provide useful information. Note that additional
- // users may appear as a result of inlining and CSE, so we should only
- // make this assumption late in the optimization pipeline.
- // TODO: Handle dead cyclic usages.
- // TODO: Handle multiple dead assumes on the same value.
- return all_of(Affected, match_fn(m_OneUse(m_Value())));
+ // Check whether all the uses are ephemeral, i.e. recursively only used
+ // by assumes. In that case, the assume does not provide useful information.
+ // Note that additional users may appear as a result of inlining and CSE,
+ // so we should only make this assumption late in the optimization pipeline.
+ SmallSetVector<Instruction *, 32> Worklist;
+ auto AddUsers = [&](Value *V) {
+ for (User *U : V->users()) {
+ // Bail out if we need to inspect too many users.
+ if (Worklist.size() >= 32)
+ return false;
+ Worklist.insert(cast<Instruction>(U));
+ }
+ return true;
+ };
+
+ for (Value *V : Affected) {
+ // Do not handle assumes on globals for now. The use list for them may
+ // contain uses in other functions.
+ if (!isa<Instruction, Argument>(V))
+ return false;
+
+ if (!AddUsers(V))
+ return false;
+ }
+
+ for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) {
+ Instruction *I = Worklist[Idx];
+
+ // Use in assume is ephemeral.
+ if (isa<AssumeInst>(I))
+ continue;
+
+ // Use in side-effecting instruction is non-ephemeral.
+ if (I->mayHaveSideEffects() || I->isTerminator())
+ return false;
+
+ // Otherwise, recursively look at the users.
+ if (!AddUsers(I))
+ return false;
+ }
+
+ return true;
}
PreservedAnalyses
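The new affectedValuesAreEphemeral walk generalizes the old single-use check; a minimal standalone analog of the worklist logic in plain C++ (no LLVM types; the Node structure and the 32-entry cap are illustrative stand-ins):

#include <cstddef>
#include <set>
#include <vector>

struct Node {
  bool IsAssume = false;       // a use inside an assume is ephemeral
  bool HasSideEffects = false; // a side-effecting user keeps the value alive
  std::vector<Node *> Users;
};

static bool allTransitiveUsesAreEphemeral(Node &Root, size_t Limit = 32) {
  std::vector<Node *> Worklist(Root.Users.begin(), Root.Users.end());
  std::set<Node *> Seen(Worklist.begin(), Worklist.end());
  for (size_t Idx = 0; Idx < Worklist.size(); ++Idx) {
    Node *N = Worklist[Idx];
    if (N->IsAssume)
      continue; // ephemeral sink
    if (N->HasSideEffects)
      return false; // non-ephemeral use
    for (Node *U : N->Users) {
      if (Seen.size() >= Limit)
        return false; // bail out once too many users have been seen
      if (Seen.insert(U).second)
        Worklist.push_back(U); // recurse through pure users
    }
  }
  return true;
}

int main() {
  Node Assume, Cmp;
  Assume.IsAssume = true;
  Cmp.Users = {&Assume}; // e.g. an icmp whose only transitive user is the assume
  return allTransitiveUsesAreEphemeral(Cmp) ? 0 : 1;
}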
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2d84b4a..216bdf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -84,7 +84,6 @@
#include <cstdint>
#include <iterator>
#include <map>
-#include <numeric>
#include <optional>
#include <set>
#include <tuple>
@@ -6356,25 +6355,25 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
if (DefaultResult) {
Value *ValueCompare =
Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
- SelectInst *SelectValueInst = cast<SelectInst>(Builder.CreateSelect(
- ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"));
- SelectValue = SelectValueInst;
- if (HasBranchWeights) {
+ SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+ DefaultResult, "switch.select");
+ if (auto *SI = dyn_cast<SelectInst>(SelectValue);
+ SI && HasBranchWeights) {
// We start with 3 probabilities, where the numerator is the
// corresponding BranchWeights[i], and the denominator is the sum over
// BranchWeights. We want the probability and negative probability of
// Condition == SecondCase.
assert(BranchWeights.size() == 3);
- setBranchWeights(SelectValueInst, BranchWeights[2],
+ setBranchWeights(SI, BranchWeights[2],
BranchWeights[0] + BranchWeights[1],
/*IsExpected=*/false);
}
}
Value *ValueCompare =
Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(Builder.CreateSelect(
- ValueCompare, ResultVector[0].first, SelectValue, "switch.select"));
- if (HasBranchWeights) {
+ Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
// We may have had a DefaultResult. Base the position of the first and
// second's branch weights accordingly. Also the probability that Condition
// != FirstCase needs to take that into account.
@@ -6382,7 +6381,7 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
size_t FirstCasePos = (Condition != nullptr);
size_t SecondCasePos = FirstCasePos + 1;
uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0;
- setBranchWeights(Ret, BranchWeights[FirstCasePos],
+ setBranchWeights(SI, BranchWeights[FirstCasePos],
DefaultCase + BranchWeights[SecondCasePos],
/*IsExpected=*/false);
}
@@ -6422,13 +6421,13 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *And = Builder.CreateAnd(Condition, AndMask);
Value *Cmp = Builder.CreateICmpEQ(
And, Constant::getIntegerValue(And->getType(), AndMask));
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
// We know there's a Default case. We base the resulting branch
// weights off its probability.
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
@@ -6448,11 +6447,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and");
Value *Cmp = Builder.CreateICmpEQ(
And, Constant::getNullValue(And->getType()), "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
@@ -6466,11 +6465,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1],
"switch.selectcmp.case2");
Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
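The hunks above switch from cast<SelectInst> to dyn_cast because IRBuilder::CreateSelect may constant-fold and hand back something that is not a SelectInst; the branch-weight arithmetic itself is unchanged. A standalone sketch of that arithmetic for the two-select case (plain C++, illustrative weights):

#include <cassert>
#include <cstdint>

int main() {
  // Switch weights in source order: {default, case1, case2}.
  uint32_t BranchWeights[3] = {10, 30, 60};

  // Inner select: Condition == SecondCase ? Result2 : DefaultResult.
  uint32_t InnerTrue = BranchWeights[2];
  uint32_t InnerFalse = BranchWeights[0] + BranchWeights[1];

  // Outer select: Condition == FirstCase ? Result1 : <inner select>.
  uint32_t OuterTrue = BranchWeights[1];
  uint32_t OuterFalse = BranchWeights[0] + BranchWeights[2];

  // Each select preserves the total probability mass of the original switch.
  assert(InnerTrue + InnerFalse == 100);
  assert(OuterTrue + OuterFalse == 100);
  return 0;
}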
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93a5f22..ab5c9c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2438,8 +2438,9 @@ struct CSEDenseMapInfo {
} // end anonymous namespace
-///Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
+/// removal, in favor of the VPlan-based one.
+static void legacyCSE(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (Instruction &In : llvm::make_early_inc_range(*BB)) {
@@ -2543,7 +2544,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
// Remove redundant induction instructions.
- cse(HeaderBB);
+ legacyCSE(HeaderBB);
}
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3901,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4158,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6833,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7066,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8596,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10053,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind);
+ CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 065622e..c547662 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper {
// constant + x cannot be -constant - x
// instead, it should be x - -constant
if (Pos == 1 ||
- (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
+ ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
+ FromOpcode == Instruction::Xor) &&
+ ToOpcode == Instruction::Sub))
return SmallVector<Value *>({LHS, RHS});
return SmallVector<Value *>({RHS, LHS});
}
@@ -1188,6 +1190,10 @@ public:
if (CIValue.isAllOnes())
InterchangeableMask = CanBeAll;
break;
+ case Instruction::Xor:
+ if (CIValue.isZero())
+ InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
+ break;
default:
if (CIValue.isZero())
InterchangeableMask = CanBeAll;
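The new Xor case (and the widened Add/Or/Xor-to-Sub handling above) leans on these opcodes being interchangeable when one operand is zero; a standalone check of the identities involved (plain C++, illustrative values):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -7, 123456}) {
    assert((X + 0) == X); // add identity
    assert((X | 0) == X); // or identity
    assert((X ^ 0) == X); // xor identity
    assert((X - 0) == X); // sub identity
  }
  return 0;
}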
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a1c6f79..728d291 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -845,19 +845,10 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
if (VF.isScalable())
return InstructionCost::getInvalid();
- // First compute the cost of the conditionally executed recipes, followed by
- // account for the branching cost, except if the mask is a header mask or
- // uniform condition.
- using namespace llvm::VPlanPatternMatch;
+ // Compute and return the cost of the conditionally executed recipes.
+ assert(VF.isVector() && "Can only compute vector cost at the moment.");
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
- InstructionCost ThenCost = Then->cost(VF, Ctx);
-
- // For the scalar case, we may not always execute the original predicated
- // block, Thus, scale the block's cost by the probability of executing it.
- if (VF.isScalar())
- return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
-
- return ThenCost;
+ return Then->cost(VF, Ctx);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1759,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+ bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;
@@ -1779,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ if (Op->isLiveIn() ||
+ (!AlwaysIncludeReplicatingR &&
+ isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774..2a8baec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
+ ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind)
+ TargetTransformInfo::TargetCostKind CostKind,
+ ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind) {}
+ CostKind(CostKind), SE(SE) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Type *ResultTy,
- ArrayRef<const VPValue *> Operands,
- ElementCount VF);
+ /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+ /// is true, always compute the cost of scalarizing replicating operands.
+ InstructionCost
+ getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+ ElementCount VF,
+ bool AlwaysIncludeReplicatingR = false);
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cf5e6bf..b5e30cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+ auto *PtrR = Ptr->getDefiningRecipe();
+ if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+ cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<VPWidenGEPRecipe>(PtrR)))
+ return false;
+
+ // We are looking for a GEP where all indices are either loop invariant or
+ // inductions.
+ for (VPValue *Opd : drop_begin(PtrR->operands())) {
+ if (!Opd->isDefinedOutsideLoopRegions() &&
+ !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+ SmallPtrSet<const VPUser *, 4> Seen;
+ SmallVector<const VPUser *> WorkList = {V};
+
+ while (!WorkList.empty()) {
+ auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ for (VPUser *U : Cur->users()) {
+ if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+ if (InterleaveR->getAddr() == Cur)
+ return true;
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+ if (RepR->getOpcode() == Instruction::Load &&
+ RepR->getOperand(0) == Cur)
+ return true;
+ if (RepR->getOpcode() == Instruction::Store &&
+ RepR->getOperand(1) == Cur)
+ return true;
+ }
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ if (MemR->getAddr() == Cur && MemR->isConsecutive())
+ return true;
+ }
+ }
+
+ append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ }
+ return false;
+}
+
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (isSingleScalar()) {
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
- return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
- }
+ if (VF.isScalable() && !isSingleScalar())
+ return InstructionCost::getInvalid();
+
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- break;
+ const VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion && ParentRegion->isReplicator())
+ break;
+
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ const VPValue *PtrOp = getOperand(!IsLoad);
+ // TODO: Handle cases where we need to pass a SCEV to
+ // getAddressComputationCost.
+ if (shouldUseAddressAccessSCEV(PtrOp))
+ break;
+
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+ Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+ InstructionCost ScalarCost =
+ ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCost;
+
+ SmallVector<const VPValue *> OpsToScalarize;
+ Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+ // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+ // don't assign scalarization overhead in general, if the target prefers
+ // vectorized addressing or the loaded value is used as part of an address
+ // of another load or store.
+ bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+ if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+ bool EfficientVectorLoadStore =
+ Ctx.TTI.supportsEfficientVectorElementLoadStore();
+ if (!(IsLoad && !PreferVectorizedAddressing) &&
+ !(!IsLoad && EfficientVectorLoadStore))
+ append_range(OpsToScalarize, operands());
+
+ if (!EfficientVectorLoadStore)
+ ResultTy = Ctx.Types.inferScalarType(this);
+ }
+
+ return (ScalarCost * VF.getFixedValue()) +
+ Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}
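For replicated loads and stores that are not single-scalar, the new path charges the scalar memory and address cost once per lane plus a scalarization-overhead term; a standalone sketch of that cost shape with made-up per-unit costs (plain C++; the constants are illustrative, real values come from TTI):

#include <cassert>

int main() {
  unsigned VF = 4;                    // fixed vectorization factor
  unsigned ScalarMemOpCost = 1;       // per-lane load/store cost
  unsigned AddressCost = 1;           // per-lane address computation cost
  unsigned ScalarizationOverhead = 3; // operand extract / result insert cost

  unsigned ScalarCost = ScalarMemOpCost + AddressCost;
  unsigned ReplicateCost = ScalarCost * VF + ScalarizationOverhead;

  assert(ReplicateCost == 11);
  return 0;
}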
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 58fab8f..5252e1f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2853,6 +2853,7 @@ void VPlanTransforms::replaceSymbolicStrides(
return R->getParent()->getParent() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
+ ValueToSCEVMapTy RewriteMap;
for (const SCEV *Stride : StridesMap.values()) {
using namespace SCEVPatternMatch;
auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
@@ -2880,6 +2881,22 @@ void VPlanTransforms::replaceSymbolicStrides(
VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
+ RewriteMap[StrideV] = PSE.getSCEV(StrideV);
+ }
+
+ for (VPRecipeBase &R : *Plan.getEntry()) {
+ auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpSCEV)
+ continue;
+ const SCEV *ScevExpr = ExpSCEV->getSCEV();
+ auto *NewSCEV =
+ SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
+ if (NewSCEV != ScevExpr) {
+ VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
+ ExpSCEV->replaceAllUsesWith(NewExp);
+ if (Plan.getTripCount() == ExpSCEV)
+ Plan.resetTripCount(NewExp);
+ }
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index eac0e70..0599930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -13,6 +13,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
@@ -63,7 +64,6 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
};
VPValue *A, *B;
- using namespace VPlanPatternMatch;
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
return B == Plan.getTripCount() &&
@@ -90,7 +90,6 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
}
bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
- using namespace VPlanPatternMatch;
// Live-ins are uniform.
if (V->isLiveIn())
return true;
@@ -159,7 +158,6 @@ std::optional<VPValue *>
vputils::getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
SmallVectorImpl<VPRecipeBase *> &GEPs) {
- using namespace llvm::VPlanPatternMatch;
// Given a VPlan like the following (just including the recipes contributing
// to loop control exiting here, not the actual work), we're looking to match
// the recipes contributing to the uncountable exit condition comparison
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f..32704bd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
return false;
+ // Check whether this is a binary shuffle.
+ bool IsBinaryShuffle = !isa<UndefValue>(V1);
+
auto *C0 = dyn_cast<CastInst>(V0);
auto *C1 = dyn_cast<CastInst>(V1);
- if (!C0 || !C1)
+ if (!C0 || (IsBinaryShuffle && !C1))
return false;
Instruction::CastOps Opcode = C0->getOpcode();
- if (C0->getSrcTy() != C1->getSrcTy())
+
+ // If this is allowed, foldShuffleOfCastops can get stuck in a loop
+ // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
+ if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
return false;
- // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
- if (Opcode != C1->getOpcode()) {
- if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
- Opcode = Instruction::SExt;
- else
+ if (IsBinaryShuffle) {
+ if (C0->getSrcTy() != C1->getSrcTy())
return false;
+ // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+ if (Opcode != C1->getOpcode()) {
+ if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+ Opcode = Instruction::SExt;
+ else
+ return false;
+ }
}
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
InstructionCost CostC0 =
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
TTI::CastContextHint::None, CostKind);
- InstructionCost CostC1 =
- TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
- TTI::CastContextHint::None, CostKind);
- InstructionCost OldCost = CostC0 + CostC1;
- OldCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
- CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
- CastSrcTy, NewMask, CostKind);
+ TargetTransformInfo::ShuffleKind ShuffleKind;
+ if (IsBinaryShuffle)
+ ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
+ else
+ ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
+
+ InstructionCost OldCost = CostC0;
+ OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
+ CostKind, 0, nullptr, {}, &I);
+
+ InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
+ CastSrcTy, NewMask, CostKind);
NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
TTI::CastContextHint::None, CostKind);
if (!C0->hasOneUse())
NewCost += CostC0;
- if (!C1->hasOneUse())
- NewCost += CostC1;
+ if (IsBinaryShuffle) {
+ InstructionCost CostC1 =
+ TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += CostC1;
+ if (!C1->hasOneUse())
+ NewCost += CostC1;
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2568,14 +2586,20 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
if (NewCost > OldCost)
return false;
- Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
- C1->getOperand(0), NewMask);
+ Value *Shuf;
+ if (IsBinaryShuffle)
+ Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
+ NewMask);
+ else
+ Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
+
Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
// Intersect flags from the old casts.
if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
NewInst->copyIRFlags(C0);
- NewInst->andIRFlags(C1);
+ if (IsBinaryShuffle)
+ NewInst->andIRFlags(C1);
}
Worklist.pushValue(Shuf);
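The unary-shuffle support added above moves the shuffle in front of the cast when profitable; a standalone check of the lane-wise equivalence it relies on for a widening cast (plain C++, mask and values illustrative):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 4> Src = {1, 2, 3, 4};
  std::array<int, 4> Mask = {3, 0, 2, 1}; // single-source permutation

  // Cast (zext) every lane first, then shuffle the wide lanes.
  std::array<uint32_t, 4> Wide = {Src[0], Src[1], Src[2], Src[3]};
  std::array<uint32_t, 4> CastThenShuf, ShufThenCast;
  for (int I = 0; I < 4; ++I) {
    CastThenShuf[I] = Wide[Mask[I]];
    ShufThenCast[I] = static_cast<uint32_t>(Src[Mask[I]]); // shuffle, then zext
  }
  assert(CastThenShuf == ShufThenCast);
  return 0;
}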
@@ -4433,7 +4457,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
// Create new mask using difference of the two incoming masks.
int MaskOffset = NewMask[0u];
- unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
+ unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
NewMask.clear();
for (unsigned I = 0u; I < InputNumElements; ++I) {
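The final hunk flips the sign in the index computation; a standalone check that (InputNumElements + MaskOffset) % InputNumElements maps offsets in (-N, N) onto [0, N) the way a mathematical modulus would (plain C++, illustrative values):

#include <cassert>

int main() {
  const int InputNumElements = 8;
  for (int MaskOffset : {-3, -1, 0, 1, 3}) {
    unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
    int Expected = ((MaskOffset % InputNumElements) + InputNumElements) %
                   InputNumElements; // mathematical mod for small |offsets|
    assert(static_cast<int>(Index) == Expected);
  }
  return 0;
}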