Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp     |   4
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp      |  12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h     |  18
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp     |   2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  |   6
-rw-r--r--  llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp     |  30
-rw-r--r--  llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp     |  50
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                 |  43
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp           |  23
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           |   8
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                   |  22
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanHelpers.h              |  16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp            | 118
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp         |  17
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp              |   4
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp           |  70
16 files changed, 324 insertions, 119 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d1ca0a6..59e103cd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -880,11 +880,11 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
   // zext(bool) + C -> bool ? C + 1 : C
   if (match(Op0, m_ZExt(m_Value(X))) &&
       X->getType()->getScalarSizeInBits() == 1)
-    return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
+    return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1);
   // sext(bool) + C -> bool ? C - 1 : C
   if (match(Op0, m_SExt(m_Value(X))) &&
       X->getType()->getScalarSizeInBits() == 1)
-    return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
+    return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1);
 
   // ~X + C --> (C-1) - X
   if (match(Op0, m_Not(m_Value(X)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c13c6cc..cf6d0ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/KnownFPClass.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TypeSize.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
@@ -3781,6 +3782,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         return replaceInstUsesWith(CI, Res);
       }
     }
+
+    // vector.reduce.add.vNiM(splat(%x)) -> mul(%x, N)
+    if (Value *Splat = getSplatValue(Arg)) {
+      ElementCount VecToReduceCount =
+          cast<VectorType>(Arg->getType())->getElementCount();
+      if (VecToReduceCount.isFixed()) {
+        unsigned VectorSize = VecToReduceCount.getFixedValue();
+        return BinaryOperator::CreateMul(
+            Splat, ConstantInt::get(Splat->getType(), VectorSize));
+      }
+    }
   }
   [[fallthrough]];
 }
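To make the new reduce-of-splat fold concrete, here is a minimal LLVM IR sketch (function name hypothetical, not part of the change): reducing a four-lane splat of %x with vector.reduce.add collapses to a single multiply by the lane count.

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @reduce_splat(i32 %x) {
  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %splat)
  ret i32 %r
}
; After InstCombine, per the fold above: %r = mul i32 %x, 4
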
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a979c1..4f94aa2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
@@ -62,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
     : public InstVisitor<InstCombinerImpl, Instruction *> {
 public:
   InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder,
-                   bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
+                   Function &F, AAResults *AA, AssumptionCache &AC,
                    TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
                    DominatorTree &DT, OptimizationRemarkEmitter &ORE,
                    BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
                    ProfileSummaryInfo *PSI, const DataLayout &DL,
                    ReversePostOrderTraversal<BasicBlock *> &RPOT)
-      : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
-                     BFI, BPI, PSI, DL, RPOT) {}
+      : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI,
+                     PSI, DL, RPOT) {}
 
   virtual ~InstCombinerImpl() = default;
 
@@ -469,6 +470,17 @@ private:
   Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
                                 unsigned Depth = 0);
 
+  SelectInst *createSelectInst(Value *C, Value *S1, Value *S2,
+                               const Twine &NameStr = "",
+                               InsertPosition InsertBefore = nullptr,
+                               Instruction *MDFrom = nullptr) {
+    SelectInst *SI =
+        SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom);
+    if (!MDFrom)
+      setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE);
+    return SI;
+  }
+
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 550f095..d457e0c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1253,7 +1253,7 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
     // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
     if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
       auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1);
-      return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+      return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty));
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f0ddd5c..8fbaf68 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1735,7 +1735,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
   Constant *Zero = ConstantInt::getNullValue(BO.getType());
   Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C);
   Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C);
-  return SelectInst::Create(X, TVal, FVal);
+  return createSelectInst(X, TVal, FVal);
 }
 
 static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
@@ -5934,8 +5934,8 @@ static bool combineInstructionsOverFunction(
     LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
                       << F.getName() << "\n");
 
-    InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
-                        ORE, BFI, BPI, PSI, DL, RPOT);
+    InstCombinerImpl IC(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI,
+                        BPI, PSI, DL, RPOT);
     IC.MaxArraySizeForCombine = MaxArraySize;
     bool MadeChangeInThisIteration = IC.prepareWorklist(F);
     MadeChangeInThisIteration |= IC.run();
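The createSelectInst helper introduced above is used by folds such as zext(bool) + C -> select; a hypothetical before/after sketch (function name invented for illustration):

define i32 @zext_bool_add(i1 %b) {
  %z = zext i1 %b to i32
  %r = add i32 %z, 41
  ret i32 %r
}
; After InstCombine: %r = select i1 %b, i32 42, i32 41 — and in functions with
; profile data the new select now carries explicitly-unknown branch weights.
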
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e5bf2d1..d842275 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation/CFGMST.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -92,8 +93,10 @@ class GCOVFunction;
 
 class GCOVProfiler {
 public:
-  GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
-  GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+  GCOVProfiler()
+      : GCOVProfiler(GCOVOptions::getDefault(), *vfs::getRealFileSystem()) {}
+  GCOVProfiler(const GCOVOptions &Opts, vfs::FileSystem &VFS)
+      : Options(Opts), VFS(VFS) {}
   bool runOnModule(Module &M,
                    function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
                    function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
@@ -110,6 +113,7 @@ public:
     os->write_zeros(4 - s.size() % 4);
   }
   void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+  vfs::FileSystem &getVirtualFileSystem() const { return VFS; }
 
 private:
   // Create the .gcno files for the Module based on DebugInfo.
@@ -166,6 +170,7 @@ private:
   std::vector<Regex> ExcludeRe;
   DenseSet<const BasicBlock *> ExecBlocks;
   StringMap<bool> InstrumentedFiles;
+  vfs::FileSystem &VFS;
 };
 
 struct BBInfo {
@@ -214,10 +219,10 @@ static StringRef getFunctionName(const DISubprogram *SP) {
 /// Prefer relative paths in the coverage notes. Clang also may split
 /// up absolute paths into a directory and filename component. When
 /// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DIScope *SP) {
+static SmallString<128> getFilename(const DIScope *SP, vfs::FileSystem &VFS) {
   SmallString<128> Path;
   StringRef RelPath = SP->getFilename();
-  if (sys::fs::exists(RelPath))
+  if (VFS.exists(RelPath))
     Path = RelPath;
   else
     sys::path::append(Path, SP->getDirectory(), SP->getFilename());
@@ -357,7 +362,7 @@ namespace {
 
     void writeOut(uint32_t CfgChecksum) {
       write(GCOV_TAG_FUNCTION);
-      SmallString<128> Filename = getFilename(SP);
+      SmallString<128> Filename = getFilename(SP, P->getVirtualFileSystem());
       uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
       BlockLen += 1 + wordsOfString(Filename) + 4;
 
@@ -455,7 +460,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
   if (FilterRe.empty() && ExcludeRe.empty()) {
     return true;
   }
-  SmallString<128> Filename = getFilename(F.getSubprogram());
+  SmallString<128> Filename = getFilename(F.getSubprogram(), VFS);
   auto It = InstrumentedFiles.find(Filename);
   if (It != InstrumentedFiles.end()) {
     return It->second;
@@ -467,7 +472,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
   // Path can be
   // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
   // such a case we must get the real_path.
-  if (sys::fs::real_path(Filename, RealPath)) {
+  if (VFS.getRealPath(Filename, RealPath)) {
     // real_path can fail with path like "foo.c".
     RealFilename = Filename;
   } else {
@@ -524,9 +529,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
   SmallString<128> Filename = CU->getFilename();
   sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
   StringRef FName = sys::path::filename(Filename);
-  SmallString<128> CurPath;
-  if (sys::fs::current_path(CurPath))
+  ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory();
+  if (!CWD)
     return std::string(FName);
+  SmallString<128> CurPath{*CWD};
   sys::path::append(CurPath, FName);
   return std::string(CurPath);
 }
@@ -554,7 +560,7 @@ bool GCOVProfiler::runOnModule(
 
 PreservedAnalyses GCOVProfilerPass::run(Module &M,
                                         ModuleAnalysisManager &AM) {
-  GCOVProfiler Profiler(GCOVOpts);
+  GCOVProfiler Profiler(GCOVOpts, *VFS);
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
@@ -789,7 +795,7 @@ bool GCOVProfiler::emitProfileNotes(
       // Add the function line number to the lines of the entry block
       // to have a counter for the function definition.
       uint32_t Line = SP->getLine();
-      auto Filename = getFilename(SP);
+      auto Filename = getFilename(SP, VFS);
 
       BranchProbabilityInfo *BPI = GetBPI(F);
       BlockFrequencyInfo *BFI = GetBFI(F);
@@ -881,7 +887,7 @@ bool GCOVProfiler::emitProfileNotes(
           if (SP != getDISubprogram(Scope))
             continue;
 
-          GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope()));
+          GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope(), VFS));
           Lines.addLine(Loc.getLine());
         }
         Line = 0;
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index c215228..89980d5 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -17,13 +18,48 @@ using namespace llvm;
 using namespace llvm::PatternMatch;
 
 static bool affectedValuesAreEphemeral(ArrayRef<Value *> Affected) {
-  // If all the affected uses have only one use (part of the assume), then
-  // the assume does not provide useful information. Note that additional
-  // users may appear as a result of inlining and CSE, so we should only
-  // make this assumption late in the optimization pipeline.
-  // TODO: Handle dead cyclic usages.
-  // TODO: Handle multiple dead assumes on the same value.
-  return all_of(Affected, match_fn(m_OneUse(m_Value())));
+  // Check whether all the uses are ephemeral, i.e. recursively only used
+  // by assumes. In that case, the assume does not provide useful information.
+  // Note that additional users may appear as a result of inlining and CSE,
+  // so we should only make this assumption late in the optimization pipeline.
+  SmallSetVector<Instruction *, 32> Worklist;
+  auto AddUsers = [&](Value *V) {
+    for (User *U : V->users()) {
+      // Bail out if we need to inspect too many users.
+      if (Worklist.size() >= 32)
+        return false;
+      Worklist.insert(cast<Instruction>(U));
+    }
+    return true;
+  };
+
+  for (Value *V : Affected) {
+    // Do not handle assumes on globals for now. The use list for them may
+    // contain uses in other functions.
+    if (!isa<Instruction, Argument>(V))
+      return false;
+
+    if (!AddUsers(V))
+      return false;
+  }
+
+  for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) {
+    Instruction *I = Worklist[Idx];
+
+    // Use in assume is ephemeral.
+    if (isa<AssumeInst>(I))
+      continue;
+
+    // Use in side-effecting instruction is non-ephemeral.
+    if (I->mayHaveSideEffects() || I->isTerminator())
+      return false;
+
+    // Otherwise, recursively look at the users.
+    if (!AddUsers(I))
+      return false;
+  }
+
+  return true;
 }
 
 PreservedAnalyses
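A hypothetical example of what the recursive walk above accepts and the old single-use check rejected: every user of %x leads only to assumes, so both assumes are ephemeral and droppable (%x has three uses, which the former m_OneUse check would have refused).

declare void @llvm.assume(i1)

define void @ephemeral_uses(i64 %x) {
  %cmp = icmp ult i64 %x, 256
  call void @llvm.assume(i1 %cmp)
  %low = and i64 %x, 7
  %cmp2 = icmp ne i64 %low, 0
  call void @llvm.assume(i1 %cmp2)
  ret void
}
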
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2d84b4a..216bdf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -84,7 +84,6 @@
 #include <cstdint>
 #include <iterator>
 #include <map>
-#include <numeric>
 #include <optional>
 #include <set>
 #include <tuple>
@@ -6356,25 +6355,25 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     if (DefaultResult) {
       Value *ValueCompare =
           Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
-      SelectInst *SelectValueInst = cast<SelectInst>(Builder.CreateSelect(
-          ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"));
-      SelectValue = SelectValueInst;
-      if (HasBranchWeights) {
+      SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+                                         DefaultResult, "switch.select");
+      if (auto *SI = dyn_cast<SelectInst>(SelectValue);
+          SI && HasBranchWeights) {
        // We start with 3 probabilities, where the numerator is the
        // corresponding BranchWeights[i], and the denominator is the sum over
        // BranchWeights. We want the probability and negative probability of
        // Condition == SecondCase.
        assert(BranchWeights.size() == 3);
-        setBranchWeights(SelectValueInst, BranchWeights[2],
+        setBranchWeights(SI, BranchWeights[2],
                          BranchWeights[0] + BranchWeights[1],
                          /*IsExpected=*/false);
       }
     }
     Value *ValueCompare =
         Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
-    SelectInst *Ret = cast<SelectInst>(Builder.CreateSelect(
-        ValueCompare, ResultVector[0].first, SelectValue, "switch.select"));
-    if (HasBranchWeights) {
+    Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+                                      SelectValue, "switch.select");
+    if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
       // We may have had a DefaultResult. Base the position of the first and
      // second's branch weights accordingly. Also the probability that
      // Condition != FirstCase needs to take that into account.
@@ -6382,7 +6381,7 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       size_t FirstCasePos = (Condition != nullptr);
       size_t SecondCasePos = FirstCasePos + 1;
       uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0;
-      setBranchWeights(Ret, BranchWeights[FirstCasePos],
+      setBranchWeights(SI, BranchWeights[FirstCasePos],
                        DefaultCase + BranchWeights[SecondCasePos],
                        /*IsExpected=*/false);
     }
@@ -6422,13 +6421,13 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       Value *And = Builder.CreateAnd(Condition, AndMask);
       Value *Cmp = Builder.CreateICmpEQ(
           And, Constant::getIntegerValue(And->getType(), AndMask));
-      SelectInst *Ret = cast<SelectInst>(
-          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-      if (HasBranchWeights) {
+      Value *Ret =
+          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+      if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
        // We know there's a Default case. We base the resulting branch
        // weights off its probability.
        assert(BranchWeights.size() >= 2);
-        setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+        setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                          BranchWeights[0], /*IsExpected=*/false);
       }
       return Ret;
@@ -6448,11 +6447,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and");
       Value *Cmp = Builder.CreateICmpEQ(
           And, Constant::getNullValue(And->getType()), "switch.selectcmp");
-      SelectInst *Ret = cast<SelectInst>(
-          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-      if (HasBranchWeights) {
+      Value *Ret =
+          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+      if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
         assert(BranchWeights.size() >= 2);
-        setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+        setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                          BranchWeights[0], /*IsExpected=*/false);
       }
       return Ret;
@@ -6466,11 +6465,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1],
                                        "switch.selectcmp.case2");
     Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
-    SelectInst *Ret = cast<SelectInst>(
-        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-    if (HasBranchWeights) {
+    Value *Ret =
+        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+    if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
       assert(BranchWeights.size() >= 2);
-      setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+      setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                        BranchWeights[0], /*IsExpected=*/false);
     }
     return Ret;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93a5f22..ab5c9c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2438,8 +2438,9 @@ struct CSEDenseMapInfo {
 
 } // end anonymous namespace
 
-/// Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+/// FIXME: This legacy common-subexpression-elimination routine is scheduled
+/// for removal in favor of the VPlan-based one.
+static void legacyCSE(BasicBlock *BB) {
   // Perform simple cse.
   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
@@ -2543,7 +2544,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
 
   // Remove redundant induction instructions.
-  cse(HeaderBB);
+  legacyCSE(HeaderBB);
 }
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3901,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4158,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6833,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7066,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8596,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -10053,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                        CM.CostKind);
+                        CM.CostKind, *CM.PSE.getSE());
   if (!ForceVectorization &&
       !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                    LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 065622e..c547662 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper {
       // constant + x cannot be -constant - x
       // instead, it should be x - -constant
       if (Pos == 1 ||
-          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
+          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
+            FromOpcode == Instruction::Xor) &&
+           ToOpcode == Instruction::Sub))
         return SmallVector<Value *>({LHS, RHS});
       return SmallVector<Value *>({RHS, LHS});
     }
@@ -1188,6 +1190,10 @@ public:
       if (CIValue.isAllOnes())
         InterchangeableMask = CanBeAll;
       break;
+    case Instruction::Xor:
+      if (CIValue.isZero())
+        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
+      break;
     default:
       if (CIValue.isZero())
         InterchangeableMask = CanBeAll;
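A sketch of what the new Xor case enables (function name hypothetical): with a zero constant operand, xor, add, or, and sub are all equivalent to a plain copy, so SLP can treat the two lanes below as having the same opcode and vectorize the pair of stores.

define void @same_opcode_lanes(i32 %a, i32 %b, ptr %p) {
  %x = xor i32 %a, 0                               ; equivalent to %a
  %y = add i32 %b, 0                               ; equivalent to %b
  store i32 %x, ptr %p
  %q = getelementptr inbounds i32, ptr %p, i64 1
  store i32 %y, ptr %q
  ret void
}
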
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a1c6f79..728d291 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -845,19 +845,10 @@ InstructionCost VPRegionBlock::cost(ElementCount VF,
                                     VPCostContext &Ctx) {
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
-  // First compute the cost of the conditionally executed recipes, followed by
-  // account for the branching cost, except if the mask is a header mask or
-  // uniform condition.
-  using namespace llvm::VPlanPatternMatch;
+  // Compute and return the cost of the conditionally executed recipes.
+  assert(VF.isVector() && "Can only compute vector cost at the moment.");
   VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
-  InstructionCost ThenCost = Then->cost(VF, Ctx);
-
-  // For the scalar case, we may not always execute the original predicated
-  // block, Thus, scale the block's cost by the probability of executing it.
-  if (VF.isScalar())
-    return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
-
-  return ThenCost;
+  return Then->cost(VF, Ctx);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1759,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1779,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774..2a8baec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
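The replicate-recipe costing in the next file applies to loops like this hypothetical one: the address of the second load depends on a loaded index, so that load is replicated per lane rather than widened, and its cost is now modeled as VF scalar accesses plus scalarization overhead.

define void @indirect_load(ptr %idx, ptr %data, ptr %out, i64 %n) {
entry:
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep.i = getelementptr inbounds i32, ptr %idx, i64 %iv
  %i = load i32, ptr %gep.i
  %i.ext = sext i32 %i to i64
  %gep.d = getelementptr inbounds float, ptr %data, i64 %i.ext
  %v = load float, ptr %gep.d                      ; replicated per lane
  %gep.o = getelementptr inbounds float, ptr %out, i64 %iv
  store float %v, ptr %gep.o
  %iv.next = add nuw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop
exit:
  ret void
}
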
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cf5e6bf..b5e30cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently
+    // we don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+  }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 58fab8f..5252e1f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2853,6 +2853,7 @@ void VPlanTransforms::replaceSymbolicStrides(
     return R->getParent()->getParent() ||
            R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
   };
+  ValueToSCEVMapTy RewriteMap;
   for (const SCEV *Stride : StridesMap.values()) {
     using namespace SCEVPatternMatch;
     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
@@ -2880,6 +2881,22 @@ void VPlanTransforms::replaceSymbolicStrides(
       VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
     }
+    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
+  }
+
+  for (VPRecipeBase &R : *Plan.getEntry()) {
+    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
+    if (!ExpSCEV)
+      continue;
+    const SCEV *ScevExpr = ExpSCEV->getSCEV();
+    auto *NewSCEV =
+        SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
+    if (NewSCEV != ScevExpr) {
+      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
+      ExpSCEV->replaceAllUsesWith(NewExp);
+      if (Plan.getTripCount() == ExpSCEV)
+        Plan.resetTripCount(NewExp);
+    }
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index eac0e70..0599930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
   return all_of(Def->users(),
@@ -63,7 +64,6 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   };
 
   VPValue *A, *B;
-  using namespace VPlanPatternMatch;
 
   if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
     return B == Plan.getTripCount() &&
@@ -90,7 +90,6 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
 }
 
 bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
-  using namespace VPlanPatternMatch;
   // Live-ins are uniform.
   if (V->isLiveIn())
     return true;
@@ -159,7 +158,6 @@ std::optional<VPValue *> vputils::getRecipesForUncountableExit(VPlan &Plan,
                                    SmallVectorImpl<VPRecipeBase *> &Recipes,
                                    SmallVectorImpl<VPRecipeBase *> &GEPs) {
-  using namespace llvm::VPlanPatternMatch;
   // Given a VPlan like the following (just including the recipes contributing
   // to loop control exiting here, not the actual work), we're looking to match
   // the recipes contributing to the uncountable exit condition comparison
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f..32704bd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
     return false;
 
+  // Check whether this is a binary shuffle.
+  bool IsBinaryShuffle = !isa<UndefValue>(V1);
+
   auto *C0 = dyn_cast<CastInst>(V0);
   auto *C1 = dyn_cast<CastInst>(V1);
-  if (!C0 || !C1)
+  if (!C0 || (IsBinaryShuffle && !C1))
     return false;
 
   Instruction::CastOps Opcode = C0->getOpcode();
-  if (C0->getSrcTy() != C1->getSrcTy())
+
+  // If this is allowed, foldShuffleOfCastops can get stuck in a loop
+  // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
+  if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
     return false;
 
-  // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
-  if (Opcode != C1->getOpcode()) {
-    if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
-      Opcode = Instruction::SExt;
-    else
+  if (IsBinaryShuffle) {
+    if (C0->getSrcTy() != C1->getSrcTy())
       return false;
+    // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+    if (Opcode != C1->getOpcode()) {
+      if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+        Opcode = Instruction::SExt;
+      else
+        return false;
+    }
   }
 
   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   InstructionCost CostC0 =
       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
                            TTI::CastContextHint::None, CostKind);
-  InstructionCost CostC1 =
-      TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
-                           TTI::CastContextHint::None, CostKind);
-  InstructionCost OldCost = CostC0 + CostC1;
-  OldCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
-                         CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
 
-  InstructionCost NewCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
-                         CastSrcTy, NewMask, CostKind);
+  TargetTransformInfo::ShuffleKind ShuffleKind;
+  if (IsBinaryShuffle)
+    ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
+  else
+    ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
+
+  InstructionCost OldCost = CostC0;
+  OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
+                                CostKind, 0, nullptr, {}, &I);
+
+  InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
+                                               CastSrcTy, NewMask, CostKind);
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
   if (!C0->hasOneUse())
     NewCost += CostC0;
-  if (!C1->hasOneUse())
-    NewCost += CostC1;
+  if (IsBinaryShuffle) {
+    InstructionCost CostC1 =
+        TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+                             TTI::CastContextHint::None, CostKind);
+    OldCost += CostC1;
+    if (!C1->hasOneUse())
+      NewCost += CostC1;
+  }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2568,14 +2586,20 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   if (NewCost > OldCost)
     return false;
 
-  Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
-                                            C1->getOperand(0), NewMask);
+  Value *Shuf;
+  if (IsBinaryShuffle)
+    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
+                                       NewMask);
+  else
+    Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
+
   Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
 
   // Intersect flags from the old casts.
   if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
     NewInst->copyIRFlags(C0);
-    NewInst->andIRFlags(C1);
+    if (IsBinaryShuffle)
+      NewInst->andIRFlags(C1);
   }
 
   Worklist.pushValue(Shuf);
@@ -4433,7 +4457,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
 
   // Create new mask using difference of the two incoming masks.
   int MaskOffset = NewMask[0u];
-  unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
+  unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
 
   NewMask.clear();
   for (unsigned I = 0u; I < InputNumElements; ++I) {
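A hypothetical unary case now covered by foldShuffleOfCastops: a shuffle with a single cast operand (second operand poison) can be performed on the narrow source and extended afterwards, when the cost model agrees.

define <4 x i32> @shuffle_of_zext(<4 x i16> %v) {
  %e = zext <4 x i16> %v to <4 x i32>
  %s = shufflevector <4 x i32> %e, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %s
}
; May become:
;   %s.narrow = shufflevector <4 x i16> %v, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
;   %s = zext <4 x i16> %s.narrow to <4 x i32>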