path: root/llvm/lib/Transforms
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 3
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 77
-rw-r--r-- llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopFuse.cpp | 16
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 40
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 16
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 26
-rw-r--r-- llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp | 51
-rw-r--r-- llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 32
-rw-r--r-- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp | 52
-rw-r--r-- llvm/lib/Transforms/Utils/LoopSimplify.cpp | 60
-rw-r--r-- llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Utils/LoopVersioning.cpp | 9
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 173
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 40
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 11
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 94
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 23
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 173
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanValue.h | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 27
32 files changed, 656 insertions, 353 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4..b575d76 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
// We can't know the precise weights here, as they would depend on the value
// distribution of Call->getArgOperand(1). So we just mark it as "unknown".
- setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
Type *IndexTy = DL.getIndexType(Call->getType());
SmallVector<DominatorTree::UpdateType, 8> Updates;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index d85e4f7..9bdd8cb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -479,7 +479,7 @@ private:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr);
- setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F);
return Sel;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 67f837c..b158e0f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
}
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
- if (!isa<Constant>(I.getOperand(1)))
- return nullptr;
+ bool IsOtherParamConst = isa<Constant>(I.getOperand(1));
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
- if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ if (Instruction *NewSel =
+ FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst))
return NewSel;
} else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 471c6ec..ceeece4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// adding/"accumulating" %s. "Accumulation" stores the result in one
// of the source registers, but this accumulate vs. add distinction
// is lost when dealing with LLVM intrinsics.)
+ //
+ // ZeroPurifies means that multiplying a known-zero with an uninitialized
+ // value results in an initialized value. This is applicable for integer
+ // multiplication, but not floating-point (counter-example: NaN).
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+ bool ZeroPurifies,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(AccumulatorType == ReturnType);
}
- FixedVectorType *ImplicitReturnType = ReturnType;
+ FixedVectorType *ImplicitReturnType =
+ cast<FixedVectorType>(getShadowTy(ReturnType));
// Step 1: instrument multiplication of corresponding vector elements
if (EltSizeInBits) {
ImplicitReturnType = cast<FixedVectorType>(
@@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getNumElements() * ReductionFactor);
}
- // Multiplying an *initialized* zero by an uninitialized element results in
- // an initialized zero element.
- //
- // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
- // results in an unpoisoned value. We can therefore adapt the visitAnd()
- // instrumentation:
- // OutShadow = (SaNonZero & SbNonZero)
- // | (VaNonZero & SbNonZero)
- // | (SaNonZero & VbNonZero)
- // where non-zero is checked on a per-element basis (not per bit).
- Value *SZero = Constant::getNullValue(Va->getType());
- Value *VZero = Constant::getNullValue(Sa->getType());
- Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
- Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
- Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
- Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
-
- Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
- Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
- Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
-
// Each element of the vector is represented by a single bit (poisoned or
// not) e.g., <8 x i1>.
- Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ Value *SaNonZero = IRB.CreateIsNotNull(Sa);
+ Value *SbNonZero = IRB.CreateIsNotNull(Sb);
+ Value *And;
+ if (ZeroPurifies) {
+ // Multiplying an *initialized* zero by an uninitialized element results
+ // in an initialized zero element.
+ //
+ // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+ // results in an unpoisoned value. We can therefore adapt the visitAnd()
+ // instrumentation:
+ // OutShadow = (SaNonZero & SbNonZero)
+ // | (VaNonZero & SbNonZero)
+ // | (SaNonZero & VbNonZero)
+ // where non-zero is checked on a per-element basis (not per bit).
+ Value *VaInt = Va;
+ Value *VbInt = Vb;
+ if (!Va->getType()->isIntegerTy()) {
+ VaInt = CreateAppToShadowCast(IRB, Va);
+ VbInt = CreateAppToShadowCast(IRB, Vb);
+ }
+
+ Value *VaNonZero = IRB.CreateIsNotNull(VaInt);
+ Value *VbNonZero = IRB.CreateIsNotNull(VbInt);
+
+ Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+ Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+ Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+ And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ } else {
+ And = IRB.CreateOr({SaNonZero, SbNonZero});
+ }
// Extend <8 x i1> to <8 x i16>.
// (The real pmadd intrinsic would have computed intermediate values of
@@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
// AVX Vector Neural Network Instructions: bytes
@@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_vpdpbuuds_128:
case Intrinsic::x86_avx2_vpdpbuuds_256:
case Intrinsic::x86_avx10_vpdpbuuds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// AVX Vector Neural Network Instructions: words
@@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
// TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
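The ZeroPurifies rule added above can be illustrated with a small standalone scalar model (an illustrative sketch, not the MemorySanitizer code itself); Va/Vb stand for element values, Sa/Sb for their shadows, where a non-zero shadow means some bits are uninitialized:
#include <cstdint>
// Per-element shadow rule for an integer multiply where zero "purifies":
//   OutShadow = (SaNonZero & SbNonZero) | (VaNonZero & SbNonZero)
//             | (SaNonZero & VbNonZero)
bool productElementPoisoned(uint64_t Va, uint64_t Sa, uint64_t Vb, uint64_t Sb) {
  bool SaNonZero = Sa != 0, SbNonZero = Sb != 0;
  bool VaNonZero = Va != 0, VbNonZero = Vb != 0;
  return (SaNonZero && SbNonZero) || (VaNonZero && SbNonZero) ||
         (SaNonZero && VbNonZero);
}
// A fully initialized zero purifies the other operand:
// productElementPoisoned(/*Va=*/0, /*Sa=*/0, Vb, Sb) is false for any Vb, Sb.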
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index af53fa0..02f06be 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
// Reserve bit 60-63 for other information purpose.
- FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+ FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK;
if (IsCS)
NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 19eccb9..9ffa602 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1796,14 +1796,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
@@ -2092,14 +2094,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca..9070d25 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero' idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+ if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+ // We're not changing the loop profile, so we can reuse the original loop's
+ // profile.
+ setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+ if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+ // We're not changing the loop profile, so we can reuse the original loop's
+ // profile.
+ setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
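Both hunks above follow the same weight-carry-over pattern; a minimal standalone sketch follows (the helper name and parameters are hypothetical, while extractBranchWeights/setBranchWeights are the ProfDataUtils calls used in the patch):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ProfDataUtils.h"
#include <utility>
using namespace llvm;
// Reuse the old terminator's weights on the replacement branch, swapping them
// when the new branch lists its successors in the opposite order.
static void carryOverWeights(Instruction &OldTerm, BranchInst &NewBr,
                             bool SwappedSuccessors) {
  SmallVector<uint32_t> Weights;
  if (!extractBranchWeights(OldTerm, Weights))
    return; // no profile data on the old terminator
  if (SwappedSuccessors)
    std::swap(Weights[0], Weights[1]);
  // The loop profile itself is unchanged, so the original weights still apply.
  setBranchWeights(NewBr, Weights, /*IsExpected=*/false);
}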
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index a883998..1b770be 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate {
/// Return true if the dependence from the store to the load has an
/// absolute distance of one.
/// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L,
+ const DominatorTree &DT) const {
Value *LoadPtr = Load->getPointerOperand();
Value *StorePtr = Store->getPointerOperand();
Type *LoadType = getLoadStoreType(Load);
@@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate {
DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
"Should be a known dependence");
- int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
- int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
+ int64_t StrideLoad =
+ getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0);
+ int64_t StrideStore =
+ getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0);
if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
return false;
@@ -287,8 +289,8 @@ public:
// so deciding which one forwards is easy. The later one forwards as
// long as they both have a dependence distance of one to the load.
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ Cand.isDependenceDistanceOfOne(PSE, L, *DT) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) {
// They are in the same block, the later one will forward to the load.
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
OtherCand = &Cand;
@@ -538,7 +540,7 @@ public:
// Check whether the SCEV difference is the same as the induction step,
// thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT))
continue;
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5..e902b71 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ private:
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+ // We don't really need to add branch weights to DummySwitch, because all
+  // but one of the branches are just temporary artifacts - see the comment on
+  // top of this function. But it's easy to estimate the weights, and it helps
+  // maintain a property of the overall compiler - that the branch weights
+  // don't "just get dropped" accidentally (i.e., profcheck).
+ if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+ }
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
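The weight vector built above encodes "only the default successor is live"; as a standalone sketch (DummySwitch is a stand-in for the real instruction): for a switch, the first branch weight belongs to the default destination and the rest to the cases in order, so {1, 0, 0, ...} marks every case edge as dead.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ProfDataUtils.h"
using namespace llvm;
static void markOnlyDefaultLive(SwitchInst &DummySwitch) {
  // One weight per successor: index 0 is the default, indices 1..N the cases.
  SmallVector<uint32_t> Weights(1 + DummySwitch.getNumCases(), 0);
  Weights[0] = 1; // default: 100% probability; the case edges are dead
  setBranchWeights(DummySwitch, Weights, /*IsExpected=*/false);
}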
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 2bda9d8..802ae4e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
}
// Do not attempt partial/runtime unrolling in FullLoopUnrolling
- if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {
+ if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
+ UP.Count < TripCount || UP.Count < MaxTripCount)) {
LLVM_DEBUG(
dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e..0f3e664 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -329,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch(
HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
: nullptr);
if (!HasBranchWeights)
- setExplicitlyUnknownBranchWeightsIfProfiled(
- *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
}
/// Copy a set of loop invariant values, and conditionally branch on them.
@@ -388,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
if (!ProfData)
- setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
}
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -2831,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
- GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
@@ -3197,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
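The guard lowering above reuses MDBuilder's canned "unlikely" weights; a minimal hedged sketch of that step in isolation (CheckBr is a stand-in for the branch produced by SplitBlockAndInsertIfThen, whose "then" successor is the deopt block):
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;
static void markDeoptUnlikely(BranchInst &CheckBr) {
  MDBuilder MDB(CheckBr.getContext());
  // Heavily skew the weights so the taken ("then"/deopt) edge is cold.
  CheckBr.setMetadata(LLVMContext::MD_prof, MDB.createUnlikelyBranchWeights());
}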
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 42b1fdf..8aa8aa2 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -39,36 +39,36 @@ using namespace llvm;
STATISTIC(NumBroken, "Number of blocks inserted");
namespace {
- struct BreakCriticalEdges : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BreakCriticalEdges() : FunctionPass(ID) {
- initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
- }
+struct BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(ID) {
+ initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ bool runOnFunction(Function &F) override {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- unsigned N =
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
- NumBroken += N;
- return N > 0;
- }
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ unsigned N = SplitAllCriticalEdges(
+ F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
+ NumBroken += N;
+ return N > 0;
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
- // No loop canonicalization guarantees are broken by this pass.
- AU.addPreservedID(LoopSimplifyID);
- }
- };
-}
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+};
+} // namespace
char BreakCriticalEdges::ID = 0;
INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
@@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
// Publicly exposed interface to pass...
char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+
FunctionPass *llvm::createBreakCriticalEdgesPass() {
return new BreakCriticalEdges();
}
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 573a781..02b73e8 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
case LibFunc_ilogbl:
case LibFunc_logf:
case LibFunc_logl:
+ case LibFunc_nextafter:
+ case LibFunc_nextafterf:
+ case LibFunc_nextafterl:
+ case LibFunc_nexttoward:
+ case LibFunc_nexttowardf:
+ case LibFunc_nexttowardl:
case LibFunc_pow:
case LibFunc_powf:
case LibFunc_powl:
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7343c79..9f6d89e 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -40,22 +40,22 @@ using namespace llvm;
namespace {
- struct QuotRemPair {
- Value *Quotient;
- Value *Remainder;
-
- QuotRemPair(Value *InQuotient, Value *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
- };
-
- /// A quotient and remainder, plus a BB from which they logically "originate".
- /// If you use Quotient or Remainder in a Phi node, you should use BB as its
- /// corresponding predecessor.
- struct QuotRemWithBB {
- BasicBlock *BB = nullptr;
- Value *Quotient = nullptr;
- Value *Remainder = nullptr;
- };
+struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
+
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+};
+
+/// A quotient and remainder, plus a BB from which they logically "originate".
+/// If you use Quotient or Remainder in a Phi node, you should use BB as its
+/// corresponding predecessor.
+struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
+};
using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 5ba6f95f..6086615 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::CoroDestroyOnlyWhenComplete:
case Attribute::CoroElideSafe:
case Attribute::NoDivergenceSource:
+ case Attribute::NoCreateUndefOrPoison:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 0642d51..dd8706c 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -16,22 +16,62 @@
using namespace llvm;
+static void mergeAttributes(LLVMContext &Ctx, const Module &M,
+ const DataLayout &DL, const Triple &TT,
+ Function *Func, FunctionType *FuncTy,
+ AttributeList FuncAttrs) {
+ AttributeList OldAttrs = Func->getAttributes();
+ AttributeList NewAttrs = OldAttrs;
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder);
+ }
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder);
+ }
+
+ for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I));
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I));
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder);
+ }
+
+ Func->setAttributes(NewAttrs);
+}
+
PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
ModuleAnalysisManager &MAM) {
RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple());
LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ const Triple &TT = M.getTargetTriple();
- for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
- if (Impl == RTLIB::Unsupported)
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+ if (!RTLCI.isAvailable(Impl))
continue;
- // TODO: Declare with correct type, calling convention, and attributes.
+ auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
- FunctionType *FuncTy =
- FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
+ // TODO: Declare with correct type, calling convention, and attributes.
+ if (!FuncTy)
+ FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
StringRef FuncName = RTLCI.getLibcallImplName(Impl);
- M.getOrInsertFunction(FuncName, FuncTy);
+
+ Function *Func =
+ cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee());
+ if (Func->getFunctionType() == FuncTy) {
+ mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs);
+ Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl));
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 61ffb49..8da6a980 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
if (P != Preheader) BackedgeBlocks.push_back(P);
}
- // Create and insert the new backedge block...
+ // Create and insert the new backedge block.
BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
Header->getName() + ".backedge", F);
BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
@@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
}
namespace {
- struct LoopSimplify : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LoopSimplify() : FunctionPass(ID) {
- initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
- }
+struct LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(ID) {
+ initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override;
+ bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
- // We need loop information to identify the loops...
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
+ // We need loop information to identify the loops.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreservedID(LCSSAID);
- AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
- /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
- void verifyAnalysis() const override;
- };
-}
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const override;
+};
+} // namespace
char LoopSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
@@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
false, false)
-// Publicly exposed interface to pass...
+// Publicly exposed interface to pass.
char &llvm::LoopSimplifyID = LoopSimplify::ID;
Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
-/// it in any convenient order) inserting preheaders...
+/// it in any convenient order) inserting preheaders.
///
bool LoopSimplify::runOnFunction(Function &F) {
bool Changed = false;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 1e8f6cc..6c9467b 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
/// probability of executing at least one more iteration?
static BranchProbability
probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+ // OriginalLoopProb == 1 would produce a division by zero in the calculation
+ // below. The problem is that case indicates an always infinite loop, but a
+ // remainder loop cannot be calculated at run time if the original loop is
+ // infinite as infinity % UnrollCount is undefined. We then choose
+ // probabilities indicating that all remainder loop iterations will always
+ // execute.
+ //
+ // Currently, the remainder loop here is an epilogue, which cannot be reached
+ // if the original loop is infinite, so the aforementioned choice is
+ // arbitrary.
+ //
+ // FIXME: Branch weights still need to be fixed in the case of prologues
+ // (issue #135812). In that case, the aforementioned choice seems reasonable
+ // for the goal of maintaining the original loop's block frequencies. That
+ // is, an infinite loop's initial iterations are not skipped, and the prologue
+ // loop body might have unique blocks that execute a finite number of times
+ // if, for example, the original loop body contains conditionals like i <
+ // UnrollCount.
+ if (OriginalLoopProb == BranchProbability::getOne())
+ return BranchProbability::getOne();
+
// Each of these variables holds the original loop's probability that the
// number of iterations it will execute is some m in the specified range.
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1..9c8b6ef 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,12 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index cbc604e..37c048f 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -778,8 +778,10 @@ private:
return false;
// Add all values from the range to the set
- for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ APInt Tmp = Span.getLower();
+ do
Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+ while (++Tmp != Span.getUpper());
UsedICmps++;
return true;
@@ -5212,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
// We don't have any info about this condition.
auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
: Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
- setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
- DEBUG_TYPE);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
OldTI->eraseFromParent();
@@ -6020,6 +6021,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
KnownBits Known = computeKnownBits(Cond, DL, AC, SI);
+ SmallPtrSet<const Constant *, 4> KnownValues;
+ bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
@@ -6039,15 +6042,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
UniqueSuccessors.push_back(Successor);
++It->second;
}
- const APInt &CaseVal = Case.getCaseValue()->getValue();
+ ConstantInt *CaseC = Case.getCaseValue();
+ const APInt &CaseVal = CaseC->getValue();
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
- (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) {
- DeadCases.push_back(Case.getCaseValue());
+ (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) ||
+ (IsKnownValuesValid && !KnownValues.contains(CaseC))) {
+ DeadCases.push_back(CaseC);
if (DTU)
--NumPerSuccessorCases[Successor];
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
<< " is dead.\n");
- }
+ } else if (IsKnownValuesValid)
+ KnownValues.erase(CaseC);
}
// If we can prove that the cases must cover all possible values, the
@@ -6058,33 +6064,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const unsigned NumUnknownBits =
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
- if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */) {
- uint64_t AllNumCases = 1ULL << NumUnknownBits;
- if (SI->getNumCases() == AllNumCases) {
+ if (HasDefault && DeadCases.empty()) {
+ if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) {
createUnreachableSwitchDefault(SI, DTU);
return true;
}
- // When only one case value is missing, replace default with that case.
- // Eliminating the default branch will provide more opportunities for
- // optimization, such as lookup tables.
- if (SI->getNumCases() == AllNumCases - 1) {
- assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
- IntegerType *CondTy = cast<IntegerType>(Cond->getType());
- if (CondTy->getIntegerBitWidth() > 64 ||
- !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
- return false;
- uint64_t MissingCaseVal = 0;
- for (const auto &Case : SI->cases())
- MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
- auto *MissingCase =
- cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
- SwitchInstProfUpdateWrapper SIW(*SI);
- SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
- createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
- SIW.setSuccessorWeight(0, 0);
- return true;
+ if (NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ IntegerType *CondTy = cast<IntegerType>(Cond->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase = cast<ConstantInt>(
+ ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(),
+ SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU,
+ /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
}
@@ -7570,6 +7584,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
return true;
}
+/// Tries to transform the switch when the condition is umin with a constant.
+/// In that case, the default branch can be replaced by the constant's branch.
+/// This method also removes dead cases when the simplification cannot replace
+/// the default branch.
+///
+/// For example:
+/// switch(umin(a, 3)) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// case 3:
+/// case 4:
+/// // ...
+/// default:
+/// unreachable
+/// }
+///
+/// Transforms into:
+///
+/// switch(a) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// default:
+/// // This is case 3
+/// }
+static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) {
+ Value *A;
+ ConstantInt *Constant;
+
+ if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant))))
+ return false;
+
+ SmallVector<DominatorTree::UpdateType> Updates;
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ BasicBlock *BB = SIW->getParent();
+
+ // Dead cases are removed even when the simplification fails.
+ // A case is dead when its value is higher than the Constant.
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) {
+ if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) {
+ ++I;
+ continue;
+ }
+ BasicBlock *DeadCaseBB = I->getCaseSuccessor();
+ DeadCaseBB->removePredecessor(BB);
+ Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB});
+ I = SIW->removeCase(I);
+ E = SIW->case_end();
+ }
+
+ auto Case = SI->findCaseValue(Constant);
+ // If the case value is not found, `findCaseValue` returns the default case.
+ // In this scenario, since there is no explicit `case 3:`, the simplification
+ // fails. The simplification also fails when the switch’s default destination
+ // is reachable.
+ if (!SI->defaultDestUnreachable() || Case == SI->case_default()) {
+ if (DTU)
+ DTU->applyUpdates(Updates);
+ return !Updates.empty();
+ }
+
+ BasicBlock *Unreachable = SI->getDefaultDest();
+ SIW.replaceDefaultDest(Case);
+ SIW.removeCase(Case);
+ SIW->setCondition(A);
+
+ Updates.push_back({DominatorTree::Delete, BB, Unreachable});
+
+ if (DTU)
+ DTU->applyUpdates(Updates);
+
+ return true;
+}
+
/// Tries to transform switch of powers of two to reduce switch range.
/// For example, switch like:
/// switch (C) { case 1: case 2: case 64: case 128: }
@@ -7642,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
// label. The other is those powers of 2 that don't appear in the case
// statement. We don't know the distribution of the values coming in, so
// the safest is to split 50-50 the original probability to `default`.
- uint64_t OrigDenominator = sum_of(map_range(
- Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+ uint64_t OrigDenominator =
+ sum_of(map_range(Weights, StaticCastTo<uint64_t>));
SmallVector<uint64_t> NewWeights(2);
NewWeights[1] = Weights[0] / 2;
NewWeights[0] = OrigDenominator - NewWeights[1];
setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
-
- // For the original switch, we reduce the weight of the default by the
- // amount by which the previous branch contributes to getting to default,
- // and then make sure the remaining weights have the same relative ratio
- // wrt eachother.
+ // The probability of executing the default block stays constant. It was
+ // p_d = Weights[0] / OrigDenominator
+ // we rewrite as W/D
+ // We want to find the probability of the default branch of the switch
+ // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D)
+ // i.e. the original probability is the probability we go to the default
+ // branch from the BI branch, or we take the default branch on the SI.
+ // Meaning X = W / (2D - W), or (W/2) / (D - W/2)
+ // This matches using W/2 for the default branch probability numerator and
+ // D-W/2 as the denominator.
+ Weights[0] = NewWeights[1];
uint64_t CasesDenominator = OrigDenominator - Weights[0];
- Weights[0] /= 2;
for (auto &W : drop_begin(Weights))
W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
@@ -8037,6 +8131,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (simplifyDuplicateSwitchArms(SI, DTU))
return requestResimplify();
+ if (simplifySwitchWhenUMin(SI, DTU))
+ return requestResimplify();
+
return false;
}
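The probability rewrite in simplifySwitchOfPowersOfTwo above checks out algebraically. With W = Weights[0] (the original default weight) and D = OrigDenominator, the new BI branch sends W/(2D) of the mass directly to default, and the residual switch must route a fraction X of the rest there so the overall default probability stays W/D:
\[
\frac{W}{D} = \frac{W}{2D} + X\left(1 - \frac{W}{2D}\right)
\;\Longrightarrow\;
\frac{W}{2D} = X\,\frac{2D - W}{2D}
\;\Longrightarrow\;
X = \frac{W}{2D - W} = \frac{W/2}{D - W/2},
\]
which is exactly what using W/2 as the default numerator and D - W/2 as the cases' denominator encodes.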
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fdfff16..03112c6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
bool CanAddPredicate = !llvm::shouldOptimizeForSize(
TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
- int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
- CanAddPredicate, false).value_or(0);
+ int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
+ CanAddPredicate, false)
+ .value_or(0);
if (Stride == 1 || Stride == -1)
return Stride;
return 0;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5c3f17..b7224a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- Load->getAlign(), VPIRMetadata(*Load, LVer),
- I->getDebugLoc());
+ VPIRMetadata(*Load, LVer), I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, Store->getAlign(),
- VPIRMetadata(*Store, LVer), I->getDebugLoc());
+ Reverse, VPIRMetadata(*Store, LVer),
+ I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
@@ -7934,6 +7933,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
(!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
}
+
+ // Check that all partial reductions in a chain are only used by other
+ // partial reductions with the same scale factor. Otherwise we end up creating
+ // users of scaled reductions where the types of the other operands don't
+ // match.
+ for (const auto &[Chain, Scale] : PartialReductionChains) {
+ auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
+ return all_of(UI->users(), [ScaleVal, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal;
+ });
+ }
+ return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal ||
+ !OrigLoop->contains(UI->getParent());
+ };
+ if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx))
+ ScaledReductionMap.erase(Chain.Reduction);
+ }
}
bool VPRecipeBuilder::getScaledReductions(
@@ -8117,11 +8136,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
- if (auto PartialRed =
- tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
- return PartialRed;
- }
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8155,9 +8171,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
- if (ScaleFactor !=
- vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
- return nullptr;
+ assert(ScaleFactor ==
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
+ "all accumulators in chain must have same scale factor");
unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf3f52c..df835a0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20996,6 +20996,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
return false;
}))
return std::nullopt;
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+ EI.UserTE->hasCopyableElements() &&
+ EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
+ all_of(VL, [&](Value *V) {
+ if (S.isCopyableElement(V))
+ return true;
+ return isUsedOutsideBlock(V);
+ }))
+ return std::nullopt;
bool HasCopyables = S.areInstructionsWithCopyableElements();
if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 428a8f4..dd26a05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -304,18 +304,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
}
bool IsSingleScalar = vputils::isSingleScalar(Def);
-
VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
- // Check if there is a scalar value for the selected lane.
- if (!hasScalarValue(Def, LastLane)) {
- // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
- // VPExpandSCEVRecipes can also be a single scalar.
- assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe,
- VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
- "unexpected recipe found to be invariant");
- IsSingleScalar = true;
- LastLane = 0;
- }
// We need to construct the vector value for a single-scalar value by
// broadcasting the scalar to all lanes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index cfe1f1e..3062e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1163,10 +1163,10 @@ public:
bool opcodeMayReadOrWriteFromMemory() const;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override;
+ bool usesFirstPartOnly(const VPValue *Op) const override;
/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
@@ -1393,13 +1393,13 @@ public:
return true;
}
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1628,7 +1628,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
};
/// A recipe for widening Call instructions using library calls.
@@ -1725,7 +1725,9 @@ public:
#endif
};
-/// A recipe for widening select instructions.
+/// A recipe for widening select instructions. Supports both wide vector and
+/// single-scalar conditions, matching the behavior of LLVM IR's select
+/// instruction.
struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
public VPIRMetadata {
VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands)
@@ -1765,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getCond() && isInvariantCond();
@@ -1831,7 +1833,7 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getOperand(0))
@@ -1868,7 +1870,7 @@ public:
void execute(VPTransformState &State) override;
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1882,7 +1884,7 @@ public:
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -1920,14 +1922,14 @@ public:
Type *getSourceElementType() const { return SourceElementTy; }
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -2108,7 +2110,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// The recipe creates its own wide start value, so it only requests the
@@ -2323,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getStartValue();
@@ -2397,7 +2399,7 @@ public:
bool isInLoop() const { return IsInLoop; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isOrdered() || isInLoop();
@@ -2466,13 +2468,13 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Recursing through Blend recipes only, must terminate at header phi's the
// latest.
return all_of(users(),
- [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
+ [this](VPUser *U) { return U->usesFirstLaneOnly(this); });
}
};
@@ -2560,7 +2562,7 @@ public:
VPCostContext &Ctx) const override;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+ bool usesFirstLaneOnly(const VPValue *Op) const override = 0;
/// Returns the number of stored operands of this interleave group. Returns 0
/// for load interleave groups.
@@ -2606,7 +2608,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
@@ -2654,7 +2656,7 @@ public:
#endif
/// The recipe only uses the first lane of the address, and EVL operand.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
@@ -2860,7 +2862,7 @@ public:
VPValue *getEVL() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getEVL();
@@ -2922,7 +2924,7 @@ public:
bool isPredicated() const { return IsPredicated; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isSingleScalar();
@@ -3204,14 +3206,14 @@ protected:
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+ Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+ Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
- assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
- !Reverse &&
- "Reversed acccess without VPVectorEndPointerRecipe address?");
+ assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+ "Reversed acccess without VPVectorEndPointerRecipe address?");
}
public:
@@ -3271,18 +3273,18 @@ public:
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Alignment, Metadata, DL),
+ Reverse, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, getAlign(),
- *this, getDebugLoc());
+ getMask(), Consecutive, Reverse, *this,
+ getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3297,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive loads operations only demand the first lane of
@@ -3313,8 +3315,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
- L.getAlign(), L, L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3338,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened loads only demand the first lane of EVL and consecutive loads
@@ -3352,16 +3354,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+ const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Alignment, Metadata, DL) {
+ Consecutive, Reverse, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getAlign(), *this, getDebugLoc());
+ Reverse, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3379,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive stores only demand the first lane of their address,
@@ -3396,7 +3398,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
@@ -3422,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL()) {
@@ -3506,14 +3508,14 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3588,7 +3590,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3698,7 +3700,7 @@ public:
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3763,7 +3765,7 @@ public:
VPValue *getStepValue() const { return getOperand(1); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ee405a..80cd112 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -659,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case Instruction::Select: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
+ Value *Cond =
+ State.get(getOperand(0),
+ OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
@@ -1274,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
}
}
-bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return vputils::onlyFirstLaneUsed(this);
@@ -1323,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
llvm_unreachable("switch should return");
}
-bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()))
return vputils::onlyFirstPartUsed(this);
@@ -1690,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
if (!VFTy->getParamType(I.index())->isVectorTy())
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
Args.push_back(Arg);
}
@@ -1759,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
State.TTI))
TysForDecl.push_back(Arg->getType());
@@ -1841,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
-bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
return all_of(enumerate(operands()), [this, &Op](const auto &X) {
auto [Idx, V] = X;
@@ -1968,16 +1970,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(1)->printAsOperand(O, SlotTracker);
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
- O << (isInvariantCond() ? " (condition is loop invariant)" : "");
+ O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)"
+ : "");
}
#endif
void VPWidenSelectRecipe::execute(VPTransformState &State) {
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- Value *Cond = State.get(getCond(), isInvariantCond());
+ Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond()));
Value *Op0 = State.get(getOperand(1));
Value *Op1 = State.get(getOperand(2));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14..48bd697 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
- VPIRMetadata(*Load), Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
+ Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Store->getAlign(), VPIRMetadata(*Store),
- Ingredient.getDebugLoc());
+ VPIRMetadata(*Store), Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -154,27 +153,31 @@ static bool sinkScalarOperands(VPlan &Plan) {
bool ScalarVFOnly = Plan.hasScalarVFOnly();
bool Changed = false;
- auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
- VPSingleDefRecipe *Candidate) {
- // We only know how to duplicate VPReplicateRecipes and
- // VPScalarIVStepsRecipes for now.
+ SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+ auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
+ VPBasicBlock *SinkTo, VPValue *Op) {
+ auto *Candidate =
+ dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
+ if (!Candidate)
+ return;
+
+ // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
+ // for now.
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
- return false;
+ return;
- if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() ||
- Candidate->mayReadOrWriteMemory())
- return false;
+ if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
+ return;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
if (!ScalarVFOnly && RepR->isSingleScalar())
- return false;
+ return;
- return true;
+ WorkList.insert({SinkTo, Candidate});
};
// First, collect the operands of all recipes in replicate blocks as seeds for
// sinking.
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
@@ -182,14 +185,9 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
continue;
- for (auto &Recipe : *VPBB) {
- for (VPValue *Op : Recipe.operands()) {
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(VPBB, Def))
- WorkList.insert({VPBB, Def});
- }
- }
+ for (auto &Recipe : *VPBB)
+ for (VPValue *Op : Recipe.operands())
+ InsertIfValidSinkCandidate(VPBB, Op);
}
// Try to sink each replicate or scalar IV steps recipe in the worklist.
@@ -198,15 +196,15 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPSingleDefRecipe *SinkCandidate;
std::tie(SinkTo, SinkCandidate) = WorkList[I];
- // All recipe users of the sink candidate must be in the same block SinkTo
- // or all users outside of SinkTo must have only their first lane used. In
+ // All recipe users of SinkCandidate must be in the same block SinkTo or all
+ // users outside of SinkTo must only use the first lane of SinkCandidate. In
// the latter case, we need to duplicate SinkCandidate.
auto UsersOutsideSinkTo =
make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
return cast<VPRecipeBase>(U)->getParent() != SinkTo;
});
if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
- return !U->onlyFirstLaneUsed(SinkCandidate);
+ return !U->usesFirstLaneOnly(SinkCandidate);
}))
continue;
bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
@@ -234,10 +232,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(SinkTo, Def))
- WorkList.insert({SinkTo, Def});
+ InsertIfValidSinkCandidate(SinkTo, Op);
Changed = true;
}
return Changed;
@@ -1290,6 +1285,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
+  // Look through broadcasts of single-scalar values used as select conditions;
+  // in that case the scalar condition can be used directly.
+ if (match(Def,
+ m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) &&
+ vputils::isSingleScalar(C)) {
+ Def->setOperand(0, C);
+ return;
+ }
+
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -4178,6 +4182,59 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
+// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
+// a narrow variant.
+static VPValue *
+narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
+ auto *R = V->getDefiningRecipe();
+ if (!R || NarrowedOps.contains(V))
+ return V;
+
+ if (isAlreadyNarrow(V))
+ return V;
+
+ if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
+ for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
+ WideMember0->setOperand(
+ Idx,
+ narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
+ return V;
+ }
+
+ if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
+ // Narrow interleave group to wide load, as transformed VPlan will only
+ // process one original iteration.
+ auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
+ auto *L = new VPWidenLoadRecipe(
+ *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+ /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ L->insertBefore(LoadGroup);
+ NarrowedOps.insert(L);
+ return L;
+ }
+
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
+ assert(RepR->isSingleScalar() &&
+ isa<LoadInst>(RepR->getUnderlyingInstr()) &&
+ "must be a single scalar load");
+ NarrowedOps.insert(RepR);
+ return RepR;
+ }
+
+ auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+ VPValue *PtrOp = WideLoad->getAddr();
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
+ PtrOp = VecPtr->getOperand(0);
+ // Narrow wide load to uniform scalar load, as transformed VPlan will only
+ // process one original iteration.
+ auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
+ /*IsUniform*/ true,
+ /*Mask*/ nullptr, *WideLoad);
+ N->insertBefore(WideLoad);
+ NarrowedOps.insert(N);
+ return N;
+}
+
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
@@ -4279,65 +4336,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
- auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
- auto *R = V->getDefiningRecipe();
- if (!R || NarrowedOps.contains(V))
- return V;
- if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
- // Narrow interleave group to wide load, as transformed VPlan will only
- // process one original iteration.
- auto *LI =
- cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
- auto *L = new VPWidenLoadRecipe(
- *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
- L->insertBefore(LoadGroup);
- NarrowedOps.insert(L);
- return L;
- }
-
- if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
- assert(RepR->isSingleScalar() &&
- isa<LoadInst>(RepR->getUnderlyingInstr()) &&
- "must be a single scalar load");
- NarrowedOps.insert(RepR);
- return RepR;
- }
- auto *WideLoad = cast<VPWidenLoadRecipe>(R);
-
- VPValue *PtrOp = WideLoad->getAddr();
- if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
- PtrOp = VecPtr->getOperand(0);
- // Narrow wide load to uniform scalar load, as transformed VPlan will only
- // process one original iteration.
- auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
- /*IsUniform*/ true,
- /*Mask*/ nullptr, *WideLoad);
- N->insertBefore(WideLoad);
- NarrowedOps.insert(N);
- return N;
- };
-
// Narrow operation tree rooted at store groups.
for (auto *StoreGroup : StoreGroups) {
- VPValue *Res = nullptr;
- VPValue *Member0 = StoreGroup->getStoredValues()[0];
- if (isAlreadyNarrow(Member0)) {
- Res = Member0;
- } else if (auto *WideMember0 =
- dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
- for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
- WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
- Res = WideMember0;
- } else {
- Res = NarrowOp(Member0);
- }
-
+ VPValue *Res =
+ narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
auto *SI =
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d6a0028..d4b8b72b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Users that only demand the first lane can use the definition for lane
/// 0.
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
- return U.onlyFirstLaneUsed(DefR);
+ return U.usesFirstLaneOnly(DefR);
});
// Update each build vector user that currently has DefR as its only
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d3..e22c5df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
}
bool vputils::onlyFirstPartUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
}
bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83e3fca..5da7463 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -274,12 +274,12 @@ public:
virtual bool usesScalars(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return onlyFirstLaneUsed(Op);
+ return usesFirstLaneOnly(Op);
}
/// Returns true if the VPUser only uses the first lane of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+ virtual bool usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
@@ -287,7 +287,7 @@ public:
/// Returns true if the VPUser only uses the first part of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstPartUsed(const VPValue *Op) const {
+ virtual bool usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 91734a1..34754a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
for (const VPUser *U : V->users()) {
auto *UI = cast<VPRecipeBase>(U);
+ if (isa<VPIRPhi>(UI) &&
+ UI->getNumOperands() != UI->getParent()->getNumPredecessors()) {
+ errs() << "Phi-like recipe with different number of operands and "
+ "predecessors.\n";
+ return false;
+ }
+
if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) {
for (const auto &[IncomingVPV, IncomingVPBB] :
Phi->incoming_values_and_blocks()) {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d6eb00d..27a8bbd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {
Value *ScalarV = Ext->getOperand(0);
if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
- &DT))
- ScalarV = Builder.CreateFreeze(ScalarV);
+ &DT)) {
+      // Check whether all lanes are extracted, all extracts trigger UB
+      // on poison, and the last extract (and hence all previous ones)
+      // is guaranteed to execute if Ext executes. If so, we do not
+ // need to insert a freeze.
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
+ bool AllExtractsTriggerUB = true;
+ ExtractElementInst *LastExtract = nullptr;
+ BasicBlock *ExtBB = Ext->getParent();
+ for (User *U : Ext->users()) {
+ auto *Extract = cast<ExtractElementInst>(U);
+ if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
+ AllExtractsTriggerUB = false;
+ break;
+ }
+ ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
+ if (!LastExtract || LastExtract->comesBefore(Extract))
+ LastExtract = Extract;
+ }
+ if (ExtractedLanes.size() != DstTy->getNumElements() ||
+ !AllExtractsTriggerUB ||
+ !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
+ LastExtract->getIterator()))
+ ScalarV = Builder.CreateFreeze(ScalarV);
+ }
ScalarV = Builder.CreateBitCast(
ScalarV,
IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));