aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp35
-rw-r--r--llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp143
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp59
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp101
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp48
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp37
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp32
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp21
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h30
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHelpers.h5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp26
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp62
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp19
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp50
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h3
17 files changed, 513 insertions, 168 deletions
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index 2f256df..b72d41a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}
+static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
+ return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
+ AllocInfo->Info.getAllocCount(),
+ AllocInfo->Info.getTotalLifetime());
+}
+
static AllocationType addCallStack(CallStackTrie &AllocTrie,
const AllocationInfo *AllocInfo,
uint64_t FullStackId) {
SmallVector<uint64_t> StackIds;
for (const auto &StackFrame : AllocInfo->CallStack)
StackIds.push_back(computeStackId(StackFrame));
- auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
- AllocInfo->Info.getAllocCount(),
- AllocInfo->Info.getTotalLifetime());
+ auto AllocType = getAllocType(AllocInfo);
std::vector<ContextTotalSize> ContextSizeInfo;
if (recordContextSizeInfoForAnalysis()) {
auto TotalSize = AllocInfo->Info.getTotalSize();
@@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI,
const std::set<const AllocationInfo *> &AllocInfoSet,
std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
&FullStackIdToAllocMatchInfo) {
+ // TODO: Remove this once the profile creation logic deduplicates contexts
+ // that are the same other than the IsInlineFrame bool. Until then, keep the
+ // largest.
+ DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
+ for (auto *AllocInfo : AllocInfoSet) {
+ auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+ auto [It, Inserted] =
+ UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
+ // If inserted entry, done.
+ if (Inserted)
+ continue;
+ // Keep the larger one, or the noncold one if they are the same size.
+ auto CurSize = It->second->Info.getTotalSize();
+ auto NewSize = AllocInfo->Info.getTotalSize();
+ if ((CurSize > NewSize) ||
+ (CurSize == NewSize &&
+ getAllocType(AllocInfo) != AllocationType::NotCold))
+ continue;
+ It->second = AllocInfo;
+ }
// We may match this instruction's location list to multiple MIB
// contexts. Add them to a Trie specialized for trimming the contexts to
// the minimal needed to disambiguate contexts with unique behavior.
CallStackTrie AllocTrie(&ORE, MaxColdSize);
uint64_t TotalSize = 0;
uint64_t TotalColdSize = 0;
- for (auto *AllocInfo : AllocInfoSet) {
+ for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
// Check the full inlined call stack against this one.
// If we found and thus matched all frames on the call, include
// this MIB.
if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
- uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
- FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
TotalSize += AllocInfo->Info.getTotalSize();
if (AllocType == AllocationType::Cold)
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 78d4a57e..87eba5f 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -58,6 +58,18 @@ static cl::opt<bool>
cl::desc("Writes always set the type"), cl::Hidden,
cl::init(false));
+static cl::opt<bool> ClOutlineInstrumentation(
+ "tysan-outline-instrumentation",
+ cl::desc("Uses function calls for all TySan instrumentation, reducing "
+ "ELF size"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClVerifyOutlinedInstrumentation(
+ "tysan-verify-outlined-instrumentation",
+ cl::desc("Check types twice with both inlined instrumentation and "
+ "function calls. This verifies that they behave the same."),
+ cl::Hidden, cl::init(false));
+
STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
namespace {
@@ -105,12 +117,16 @@ private:
Regex AnonNameRegex;
Type *IntptrTy;
uint64_t PtrShift;
- IntegerType *OrdTy;
+ IntegerType *OrdTy, *U64Ty;
/// Callbacks to run-time library are computed in initializeCallbacks.
FunctionCallee TysanCheck;
FunctionCallee TysanCtorFunction;
+ FunctionCallee TysanIntrumentMemInst;
+ FunctionCallee TysanInstrumentWithShadowUpdate;
+ FunctionCallee TysanSetShadowType;
+
/// Callback to set types for gloabls.
Function *TysanGlobalsSetTypeFunction;
};
@@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M)
void TypeSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
OrdTy = IRB.getInt32Ty();
+ U64Ty = IRB.getInt64Ty();
+ Type *BoolType = IRB.getInt1Ty();
AttributeList Attr;
Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind);
@@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) {
TysanCtorFunction =
M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy());
+
+ TysanIntrumentMemInst = M.getOrInsertFunction(
+ "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer of data to be written to
+ IRB.getPtrTy(), // Pointer of data to write
+ U64Ty, // Size of the data in bytes
+ BoolType // Do we need to call memmove
+ );
+
+ TysanInstrumentWithShadowUpdate = M.getOrInsertFunction(
+ "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer to data to be read
+ IRB.getPtrTy(), // Pointer to type descriptor
+ BoolType, // Do we need to type check this
+ U64Ty, // Size of data we access in bytes
+ OrdTy // Flags
+ );
+
+ TysanSetShadowType = M.getOrInsertFunction(
+ "__tysan_set_shadow_type", Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer of data to be written to
+ IRB.getPtrTy(), // Pointer to the new type descriptor
+ U64Ty // Size of data we access in bytes
+ );
}
void TypeSanitizer::instrumentGlobals(Module &M) {
@@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy());
+ if (ClOutlineInstrumentation) {
+ if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) {
+ // We need to check the type here. If the type is unknown, then the read
+ // sets the type. If the type is known, then it is checked. If the type
+ // doesn't match, then we call the runtime type check (which may yet
+ // determine that the mismatch is okay).
+
+ Constant *Flags =
+ ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1));
+
+ IRB.CreateCall(TysanInstrumentWithShadowUpdate,
+ {Ptr, TD,
+ SanitizeFunction ? IRB.getTrue() : IRB.getFalse(),
+ IRB.getInt64(AccessSize), Flags});
+ } else if (ForceSetType || IsWrite) {
+ // In the mode where writes always set the type, for a write (which does
+ // not also read), we just set the type.
+ IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)});
+ }
+
+ return true;
+ }
+
Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
ShadowBase, AppMemMask);
Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0);
@@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
}
}
- if (!ShadowBase)
- ShadowBase = getShadowBase(*F);
- if (!AppMemMask)
- AppMemMask = getAppMemMask(*F);
+ if (ClOutlineInstrumentation) {
+ if (!Src)
+ Src = ConstantPointerNull::get(IRB.getPtrTy());
- Value *ShadowDataInt = IRB.CreateAdd(
- IRB.CreateShl(
- IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
- PtrShift),
- ShadowBase);
- Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
-
- if (!Src) {
- IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift),
- Align(1ull << PtrShift));
+ IRB.CreateCall(
+ TysanIntrumentMemInst,
+ {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()});
return true;
- }
-
- Value *SrcShadowDataInt = IRB.CreateAdd(
- IRB.CreateShl(
- IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
- PtrShift),
- ShadowBase);
- Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
-
- if (NeedsMemMove) {
- IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
- Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
} else {
- IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
- Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ if (!ShadowBase)
+ ShadowBase = getShadowBase(*F);
+ if (!AppMemMask)
+ AppMemMask = getAppMemMask(*F);
+
+ Value *ShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
+
+ if (!Src) {
+ IRB.CreateMemSet(ShadowData, IRB.getInt8(0),
+ IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift));
+ return true;
+ }
+
+ Value *SrcShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+
+ if (NeedsMemMove) {
+ IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+ Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ } else {
+ IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData,
+ Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift));
+ }
}
return true;
@@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M,
for (Function &F : M) {
const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
TySan.sanitizeFunction(F, TLI);
+ if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) {
+ // Outlined instrumentation is a new option, and so this exists to
+ // verify there is no difference in behaviour between the options.
+ // If the outlined instrumentation triggers a verification failure
+ // when the original inlined instrumentation does not, or vice versa,
+ // then there is a discrepency which should be investigated.
+ ClOutlineInstrumentation = false;
+ TySan.sanitizeFunction(F, TLI);
+ ClOutlineInstrumentation = true;
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4fe736a..94dfd3a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
- unsigned EstimatedLoopInvocationWeight = 0;
std::optional<unsigned> OriginalTripCount =
- llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight);
+ llvm::getLoopEstimatedTripCount(L);
+ BranchProbability OriginalLoopProb = llvm::getLoopProbability(L);
// Effectively "DCE" unrolled iterations that are beyond the max tripcount
// and will never be executed.
@@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
: isEpilogProfitable(L);
if (ULO.Runtime &&
- !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
- EpilogProfitability, ULO.UnrollRemainder,
- ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
- PreserveLCSSA, ULO.SCEVExpansionBudget,
- ULO.RuntimeUnrollMultiExit, RemainderLoop)) {
+ !UnrollRuntimeLoopRemainder(
+ L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+ ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+ PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+ RemainderLoop, OriginalTripCount, OriginalLoopProb)) {
if (ULO.Force)
ULO.Runtime = false;
else {
@@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
LI->erase(L);
// We shouldn't try to use `L` anymore.
L = nullptr;
- } else if (OriginalTripCount) {
- // Update the trip count. Note that the remainder has already logic
- // computing it in `UnrollRuntimeLoopRemainder`.
- setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count,
- EstimatedLoopInvocationWeight);
+ } else {
+ // Update metadata for the loop's branch weights and estimated trip count:
+ // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch
+ // weights, latch branch weights, and estimated trip count of the
+ // remainder loop it creates. It also sets the branch weights for the
+ // unrolled loop guard it creates. The branch weights for the unrolled
+ // loop latch are adjusted below. FIXME: Handle prologue loops.
+ // - Otherwise, if unrolled loop iteration latches become unconditional,
+ // branch weights are adjusted above. FIXME: Actually handle such
+ // unconditional latches.
+ // - Otherwise, the original loop's branch weights are correct for the
+ // unrolled loop, so do not adjust them.
+ // - In all cases, the unrolled loop's estimated trip count is set below.
+ //
+ // As an example of the last case, consider what happens if the unroll count
+ // is 4 for a loop with an estimated trip count of 10 when we do not create
+ // a remainder loop and all iterations' latches remain conditional. Each
+ // unrolled iteration's latch still has the same probability of exiting the
+ // loop as it did when in the original loop, and thus it should still have
+ // the same branch weights. Each unrolled iteration's non-zero probability
+ // of exiting already appropriately reduces the probability of reaching the
+ // remaining iterations just as it did in the original loop. Trying to also
+ // adjust the branch weights of the final unrolled iteration's latch (i.e.,
+ // the backedge for the unrolled loop as a whole) to reflect its new trip
+ // count of 3 will erroneously further reduce its block frequencies.
+ // However, in case an analysis later needs to estimate the trip count of
+ // the unrolled loop as a whole without considering the branch weights for
+ // each unrolled iteration's latch within it, we store the new trip count as
+ // separate metadata.
+ if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) {
+ // Where p is always the probability of executing at least 1 more
+ // iteration, the probability for at least n more iterations is p^n.
+ setLoopProbability(L, OriginalLoopProb.pow(ULO.Count));
+ }
+ if (OriginalTripCount) {
+ unsigned NewTripCount = *OriginalTripCount / ULO.Count;
+ if (!ULO.Runtime && *OriginalTripCount % ULO.Count)
+ ++NewTripCount;
+ setLoopEstimatedTripCount(L, NewTripCount);
+ }
}
// LoopInfo should not be valid, confirm that.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 7a2b8da..1e8f6cc 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -40,6 +40,7 @@
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cmath>
using namespace llvm;
@@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
}
}
+/// Assume, due to our position in the remainder loop or its guard, anywhere
+/// from 0 to \p N more iterations can possibly execute. Among such cases in
+/// the original loop (with loop probability \p OriginalLoopProb), what is the
+/// probability of executing at least one more iteration?
+static BranchProbability
+probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+ // Each of these variables holds the original loop's probability that the
+ // number of iterations it will execute is some m in the specified range.
+ BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
+ BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m
+ BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N
+ BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
+ return ProbOneNotTooMany / ProbNotTooMany;
+}
+
/// Connect the unrolling epilog code to the original loop.
/// The unrolling epilog code contains code to execute the
/// 'extra' iterations if the run-time trip count modulo the
@@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
ValueToValueMapTy &VMap, DominatorTree *DT,
LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
- unsigned Count, AssumptionCache &AC) {
+ unsigned Count, AssumptionCache &AC,
+ BranchProbability OriginalLoopProb) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
PreserveLCSSA);
// Add the branch to the exit block (around the epilog loop)
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*Latch->getTerminator())) {
+ if (OriginalLoopProb.isUnknown() &&
+ hasBranchWeightMD(*Latch->getTerminator())) {
// Assume equal distribution in interval [0, Count).
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(1, Count - 1);
}
- B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+ BranchInst *RemainderLoopGuard =
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+ if (!OriginalLoopProb.isUnknown()) {
+ setBranchProbability(RemainderLoopGuard,
+ probOfNextInRemainder(OriginalLoopProb, Count - 1),
+ /*ForFirstTarget=*/true);
+ }
InsertPt->eraseFromParent();
if (DT) {
auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
@@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
/// The cloned blocks should be inserted between InsertTop and InsertBot.
/// InsertTop should be new preheader, InsertBot new loop exit.
/// Returns the new cloned loop that is created.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
- const bool UnrollRemainder,
- BasicBlock *InsertTop,
- BasicBlock *InsertBot, BasicBlock *Preheader,
+static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
+ const bool UseEpilogRemainder,
+ const bool UnrollRemainder, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- DominatorTree *DT, LoopInfo *LI, unsigned Count) {
+ DominatorTree *DT, LoopInfo *LI, unsigned Count,
+ std::optional<unsigned> OriginalTripCount,
+ BranchProbability OriginalLoopProb) {
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
@@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*LatchBR)) {
+ if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+ hasBranchWeightMD(*LatchBR)) {
uint32_t ExitWeight;
uint32_t BackEdgeWeight;
if (Count >= 3) {
@@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
MDBuilder MDB(Builder.getContext());
BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
}
- Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+ BranchInst *RemainderLoopLatch =
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+ if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+ // Compute the total frequency of the original loop body from the
+ // remainder iterations. Once we've reached them, the first of them
+ // always executes, so its frequency and probability are 1.
+ double FreqRemIters = 1;
+ if (Count > 2) {
+ BranchProbability ProbReaching = BranchProbability::getOne();
+ for (unsigned N = Count - 2; N >= 1; --N) {
+ ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
+ FreqRemIters += double(ProbReaching.getNumerator()) /
+ ProbReaching.getDenominator();
+ }
+ }
+ // Solve for the loop probability that would produce that frequency.
+ // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
+ double ProbDouble = 1 - 1 / FreqRemIters;
+ BranchProbability Prob = BranchProbability::getBranchProbability(
+ std::round(ProbDouble * BranchProbability::getDenominator()),
+ BranchProbability::getDenominator());
+ setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true);
+ }
NewIdx->addIncoming(Zero, InsertTop);
NewIdx->addIncoming(IdxNext, NewBB);
LatchBR->eraseFromParent();
@@ -461,6 +509,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
Loop *NewLoop = NewLoops[L];
assert(NewLoop && "L should have been cloned");
+ if (OriginalTripCount && UseEpilogRemainder)
+ setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count);
+
// Add unroll disable metadata to disable future unrolling for this loop.
if (!UnrollRemainder)
NewLoop->setLoopAlreadyUnrolled();
@@ -588,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
const TargetTransformInfo *TTI, bool PreserveLCSSA,
unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
- Loop **ResultLoop) {
+ Loop **ResultLoop, std::optional<unsigned> OriginalTripCount,
+ BranchProbability OriginalLoopProb) {
LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
LLVM_DEBUG(L->dump());
LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -808,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder(
BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
// Branch to either remainder (extra iterations) loop or unrolling loop.
MDNode *BranchWeights = nullptr;
- if (hasBranchWeightMD(*Latch->getTerminator())) {
+ if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+ hasBranchWeightMD(*Latch->getTerminator())) {
// Assume loop is nearly always entered.
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);
}
- B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+ BranchInst *UnrollingLoopGuard =
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+ if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+ // The original loop's first iteration always happens. Compute the
+ // probability of the original loop executing Count-1 iterations after that
+ // to complete the first iteration of the unrolled loop.
+ BranchProbability ProbOne = OriginalLoopProb;
+ BranchProbability ProbRest = ProbOne.pow(Count - 1);
+ setBranchProbability(UnrollingLoopGuard, ProbRest,
+ /*ForFirstTarget=*/false);
+ }
PreHeaderBR->eraseFromParent();
if (DT) {
if (UseEpilogRemainder)
@@ -840,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder(
// iterations. This function adds the appropriate CFG connections.
BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- Loop *remainderLoop = CloneLoopBlocks(
- L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
- NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count);
+ Loop *remainderLoop =
+ CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
+ InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
+ LI, Count, OriginalTripCount, OriginalLoopProb);
// Insert the cloned blocks into the function.
F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());
@@ -941,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
// Connect the epilog code to the original loop and update the
// PHI functions.
ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
- NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC);
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC,
+ OriginalLoopProb);
// Update counter in loop for unrolling.
// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index b6ba822..8be471b 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount(
if (LatchBranch->getSuccessor(0) != L->getHeader())
std::swap(BackedgeTakenWeight, LatchExitWeight);
- MDBuilder MDB(LatchBranch->getContext());
-
// Set/Update profile metadata.
- LatchBranch->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+ setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight},
+ /*IsExpected=*/false);
+
+ return true;
+}
+
+BranchProbability llvm::getLoopProbability(Loop *L) {
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return BranchProbability::getUnknown();
+ bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+ return getBranchProbability(LatchBranch, FirstTargetIsLoop);
+}
+bool llvm::setLoopProbability(Loop *L, BranchProbability P) {
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return false;
+ bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+ return setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+}
+
+BranchProbability llvm::getBranchProbability(BranchInst *B,
+ bool ForFirstTarget) {
+ if (B->getNumSuccessors() != 2)
+ return BranchProbability::getUnknown();
+ uint64_t Weight0, Weight1;
+ if (!extractBranchWeights(*B, Weight0, Weight1))
+ return BranchProbability::getUnknown();
+ if (!ForFirstTarget)
+ std::swap(Weight0, Weight1);
+ return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1);
+}
+
+bool llvm::setBranchProbability(BranchInst *B, BranchProbability P,
+ bool ForFirstTarget) {
+ if (B->getNumSuccessors() != 2)
+ return false;
+ BranchProbability Prob0 = P;
+ BranchProbability Prob1 = P.getCompl();
+ if (!ForFirstTarget)
+ std::swap(Prob0, Prob1);
+ setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()},
+ /*IsExpected=*/false);
return true;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b03fb62..cbc604e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -80,6 +80,7 @@
#include <algorithm>
#include <cassert>
#include <climits>
+#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
@@ -5955,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
}
// Update weight for the newly-created conditional branch.
- if (hasBranchWeightMD(*SI)) {
+ if (hasBranchWeightMD(*SI) && NewBI->isConditional()) {
SmallVector<uint64_t, 8> Weights;
getBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
@@ -5977,14 +5978,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
}
// Prune obsolete incoming values off the successors' PHI nodes.
- for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) {
+ for (auto &PHI : make_early_inc_range(Dest->phis())) {
unsigned PreviousEdges = Cases->size();
if (Dest == SI->getDefaultDest())
++PreviousEdges;
for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ PHI.removeIncomingValue(SI->getParent());
}
- for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ for (auto &PHI : make_early_inc_range(OtherDest->phis())) {
unsigned PreviousEdges = OtherCases->size();
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
@@ -5993,7 +5994,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
if (NewBI->isUnconditional())
++E;
for (unsigned I = 0; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ PHI.removeIncomingValue(SI->getParent());
}
// Clean up the default block - it may have phis or other instructions before
@@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
auto *DefaultCaseBB = SI->getDefaultDest();
BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
auto It = OrigBB->getTerminator()->getIterator();
+ SmallVector<uint32_t> Weights;
+ auto HasWeights =
+ !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights);
auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) {
+ // IsPow2 covers a subset of the cases in which we'd go to the default
+ // label. The other is those powers of 2 that don't appear in the case
+ // statement. We don't know the distribution of the values coming in, so
+ // the safest is to split 50-50 the original probability to `default`.
+ uint64_t OrigDenominator = sum_of(map_range(
+ Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+ SmallVector<uint64_t> NewWeights(2);
+ NewWeights[1] = Weights[0] / 2;
+ NewWeights[0] = OrigDenominator - NewWeights[1];
+ setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
+
+ // For the original switch, we reduce the weight of the default by the
+ // amount by which the previous branch contributes to getting to default,
+ // and then make sure the remaining weights have the same relative ratio
+ // wrt eachother.
+ uint64_t CasesDenominator = OrigDenominator - Weights[0];
+ Weights[0] /= 2;
+ for (auto &W : drop_begin(Weights))
+ W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
+
+ setBranchWeights(*SI, Weights, /*IsExpected=*/false);
+ }
// BI is handling the default case for SI, and so should share its DebugLoc.
BI->setDebugLoc(SI->getDebugLoc());
It->eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003..5298728 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -280,8 +280,7 @@ public:
VPValue *createElementCount(Type *Ty, ElementCount EC) {
VPlan &Plan = *getInsertBlock()->getPlan();
- VPValue *RuntimeEC =
- Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+ VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue());
if (EC.isScalable()) {
VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
RuntimeEC = EC.getKnownMinValue() == 1
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 505fb43..e5c3f17 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
continue;
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6876,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
+ OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7110,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
+ !Legal->getLAI()->getSymbolicStrides().empty() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
@@ -7750,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
- VPValue *One =
- Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *One = Plan.getConstantInt(I->getType(), 1u);
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, Ops);
@@ -7804,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
}
case Instruction::ExtractValue: {
SmallVector<VPValue *> NewOps(Operands);
- Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
- NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+ NewOps.push_back(Plan.getConstantInt(32, Idx));
return new VPWidenRecipe(*I, NewOps);
}
};
@@ -8177,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
@@ -8441,7 +8440,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -8641,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
} else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
CurrentLinkI->getOpcode() == Instruction::Sub) {
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
- auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+ auto *Zero = Plan->getConstantInt(PhiTy, 0);
VPWidenRecipe *Sub = new VPWidenRecipe(
Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
VPIRMetadata(), CurrentLinkI->getDebugLoc());
@@ -8855,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ToDelete.push_back(Select);
// Convert the reduction phi to operate on bools.
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
- OrigLoop->getHeader()->getContext())));
+ PhiR->setOperand(0, Plan->getFalse());
continue;
}
@@ -8878,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned ScaleFactor =
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
.value_or(1);
- Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
- auto *ScaleFactorVPV =
- Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
VPValue *StartV = PHBuilder.createNaryOp(
VPInstruction::ReductionStartVector,
{PhiR->getStartValue(), Iden, ScaleFactorVPV},
@@ -9911,7 +9907,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind, *CM.PSE.getSE());
+ CM.CostKind, *CM.PSE.getSE(), L);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b55a3b..34b405c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22134,6 +22134,27 @@ bool BoUpSLP::collectValuesToDemote(
{VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
+ if (E.isAltShuffle()) {
+ // Combining these opcodes may lead to incorrect analysis, skip for now.
+ auto IsDangerousOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return true;
+ default:
+ break;
+ }
+ return false;
+ };
+ if (IsDangerousOpcode(E.getAltOpcode()))
+ return FinalAnalysis();
+ }
+
switch (E.getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f10058..08c9c15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4109,6 +4109,12 @@ public:
const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
}
+
+ /// Return the type of the canonical IV for loop regions.
+ Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); }
+ const Type *getCanonicalIVType() const {
+ return getCanonicalIV()->getScalarType();
+ }
};
inline VPRegionBlock *VPRecipeBase::getRegion() {
@@ -4387,15 +4393,25 @@ public:
}
/// Return a VPValue wrapping i1 true.
- VPValue *getTrue() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- }
+ VPValue *getTrue() { return getConstantInt(1, 1); }
/// Return a VPValue wrapping i1 false.
- VPValue *getFalse() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getFalse(Ctx));
+ VPValue *getFalse() { return getConstantInt(1, 0); }
+
+ /// Return a VPValue wrapping a ConstantInt with the given type and value.
+ VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) {
+ return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value.
+ VPValue *getConstantInt(unsigned BitWidth, uint64_t Val,
+ bool IsSigned = false) {
+ return getConstantInt(APInt(BitWidth, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given APInt value.
+ VPValue *getConstantInt(const APInt &Val) {
+ return getOrAddLiveIn(ConstantInt::get(getContext(), Val));
}
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 65688a3..1a66d20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getFalse();
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(
- ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ Cmp = Plan.getTrue();
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck(
// additional overflow check is required before entering the vector loop.
// Get the maximum unsigned value for the type.
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
- TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *MaxUIntTripCount =
+ Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
VPValue *DistanceToMax = Builder.createNaryOp(
Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
DebugLoc::getUnknown());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 2aaabd9..965426f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -350,13 +350,14 @@ struct VPCostContext {
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
ScalarEvolution &SE;
+ const Loop *L;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind,
- ScalarEvolution &SE)
+ ScalarEvolution &SE, const Loop *L)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind), SE(SE) {}
+ CostKind(CostKind), SE(SE), L(L) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a63c80..f9c15a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2372,9 +2372,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
- getScalarType() == CanIV->getScalarType();
+ getScalarType() == getRegion()->getCanonicalIVType();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3167,26 +3166,30 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost. Computing SCEVs for VPValues is incomplete and returns
+/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
+/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+ const Loop *L) {
auto *PtrR = Ptr->getDefiningRecipe();
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
Instruction::GetElementPtr) ||
isa<VPWidenGEPRecipe>(PtrR) ||
match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
- return false;
+ return nullptr;
// We are looking for a GEP where all indices are either loop invariant or
// inductions.
for (VPValue *Opd : drop_begin(PtrR->operands())) {
if (!Opd->isDefinedOutsideLoopRegions() &&
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
- return false;
+ return nullptr;
}
- return true;
+ return vputils::getSCEVExprForVPValue(Ptr, SE, L);
}
/// Returns true if \p V is used as part of the address of another load or
@@ -3354,9 +3357,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
bool IsLoad = UI->getOpcode() == Instruction::Load;
const VPValue *PtrOp = getOperand(!IsLoad);
- // TODO: Handle cases where we need to pass a SCEV to
- // getAddressComputationCost.
- if (shouldUseAddressAccessSCEV(PtrOp))
+ const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
+ if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
break;
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
@@ -3374,7 +3376,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
InstructionCost ScalarCost =
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
- nullptr, Ctx.CostKind);
+ PtrSCEV, Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4d98014..6a8231b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -699,8 +699,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
continue;
const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
- VPValue *StartV =
- Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
+ VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
VPValue *StepV = PtrIV->getOperand(1);
VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
@@ -820,7 +819,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// Calculate the final index.
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
auto *CanonicalIV = LoopRegion->getCanonicalIV();
- Type *CanonicalIVType = CanonicalIV->getScalarType();
+ Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -836,7 +835,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// changed it means the exit is using the incremented value, so we need to
// add the step.
if (Incoming != WideIV) {
- VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
}
@@ -882,7 +881,7 @@ static VPValue *optimizeLatchExitInductionUser(
return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
if (ScalarTy->isPointerTy()) {
Type *StepTy = TypeInfo.inferScalarType(Step);
- auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
+ auto *Zero = Plan.getConstantInt(StepTy, 0);
return B.createPtrAdd(EndValue,
B.createNaryOp(Instruction::Sub, {Zero, Step}),
DebugLoc::getUnknown(), "ind.escape");
@@ -1419,6 +1418,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
true /*IsSingleScalar*/);
Clone->insertBefore(RepOrWidenR);
RepOrWidenR->replaceAllUsesWith(Clone);
+ if (isDeadRecipe(*RepOrWidenR))
+ RepOrWidenR->eraseFromParent();
}
}
}
@@ -1572,9 +1573,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
continue;
// Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
WideIV->setStartValue(NewStart);
- auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
WideIV->setStepValue(NewStep);
auto *NewBTC = new VPWidenCastRecipe(
@@ -1693,8 +1694,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
// When using wide lane masks, the return type of the get.active.lane.mask
// intrinsic is VF x UF (last operand).
- VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
EntryALM->setOperand(2, ALMMultiplier);
LoopALM->setOperand(2, ALMMultiplier);
@@ -2400,8 +2400,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");
// Create the active lane mask instruction in the VPlan preheader.
- VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
+ VPValue *ALMMultiplier =
+ Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");
@@ -2501,7 +2501,7 @@ void VPlanTransforms::addActiveLaneMask(
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
+ ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
LaneMask =
B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
@@ -2773,7 +2773,7 @@ void VPlanTransforms::addExplicitVectorLength(
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
- auto *CanIVTy = CanonicalIVPHI->getScalarType();
+ auto *CanIVTy = LoopRegion->getCanonicalIVType();
VPValue *StartV = CanonicalIVPHI->getStartValue();
// Create the ExplicitVectorLengthPhi recipe in the main loop.
@@ -2788,8 +2788,7 @@ void VPlanTransforms::addExplicitVectorLength(
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
- VPValue *AVLSafe =
- Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
+ VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
"safe_avl");
@@ -2902,9 +2901,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
VPBuilder Builder(LatchExitingBr);
- VPValue *Cmp =
- Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
- Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getConstantInt(AVLTy, 0));
Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
LatchExitingBr->eraseFromParent();
}
@@ -2928,8 +2926,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// Only handle constant strides for now.
continue;
- auto *CI =
- Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ auto *CI = Plan.getConstantInt(*StrideConst);
if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
@@ -2944,7 +2941,7 @@ void VPlanTransforms::replaceSymbolicStrides(
unsigned BW = U->getType()->getScalarSizeInBits();
APInt C =
isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
- VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ VPValue *CI = Plan.getConstantInt(C);
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
RewriteMap[StrideV] = PSE.getSCEV(StrideV);
@@ -3123,8 +3120,7 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV =
- Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
+ VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
VPBuilder B(InsertPos);
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
@@ -3865,8 +3861,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
VPBuilder Builder(VectorPH, VectorPH->begin());
auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
auto *TCMO = Builder.createNaryOp(
- Instruction::Sub,
- {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+ Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
BTC->replaceAllUsesWith(TCMO);
}
@@ -3991,9 +3986,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (TailByMasking) {
TC = Builder.createNaryOp(
Instruction::Add,
- {TC, Builder.createNaryOp(
- Instruction::Sub,
- {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ {TC, Builder.createNaryOp(Instruction::Sub,
+ {Step, Plan.getConstantInt(TCTy, 1)})},
DebugLoc::getCompilerGenerated(), "n.rnd.up");
}
@@ -4015,8 +4009,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (RequiresScalarEpilogue) {
assert(!TailByMasking &&
"requiring scalar epilogue is not supported with fail folding");
- VPValue *IsZero = Builder.createICmp(
- CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ VPValue *IsZero =
+ Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
R = Builder.createSelect(IsZero, Step, R);
}
@@ -4054,7 +4048,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4334,17 +4328,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
VPBuilder PHBuilder(Plan.getVectorPreheader());
VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+ ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
if (VF.isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
- CanIV->getScalarType(), ElementCount::getScalable(1));
+ VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
Inc->setOperand(1, VScaleUF);
Plan.getVF().replaceAllUsesWith(VScale);
} else {
Inc->setOperand(1, UF);
Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ Plan.getConstantInt(CanIV->getScalarType(), 1));
}
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index cfd1a74..d6a0028 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -68,10 +68,9 @@ class UnrollState {
void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
VPBasicBlock::iterator InsertPtForPhi);
- VPValue *getConstantVPV(unsigned Part) {
- Type *CanIVIntTy =
- Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType();
- return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+ VPValue *getConstantInt(unsigned Part) {
+ Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
+ return Plan.getConstantInt(CanIVIntTy, Part);
}
public:
@@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
- ScalarIVSteps->addOperand(getConstantVPV(Part));
+ ScalarIVSteps->addOperand(getConstantInt(Part));
}
addRecipeForPart(&Part0R, &PartIR, Part);
@@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
for (unsigned Part = 1; Part != UF; ++Part)
VPV2Parts[VPI][Part - 1] = StartV;
}
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
"unexpected header phi recipe not needing unrolled part");
@@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
Copy->setOperand(0, R.getOperand(0));
@@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
if (LaneDefs != Def2LaneDefs.end())
return LaneDefs->second[Lane.getKnownLane()];
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
}
@@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
continue;
}
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
NewOps.push_back(Ext);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4db92e7..8c23e78 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
B == Plan.getBackedgeTakenCount();
}
-const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
+const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
+ ScalarEvolution &SE, const Loop *L) {
if (V->isLiveIn()) {
if (Value *LiveIn = V->getLiveInIRValue())
return SE.getSCEV(LiveIn);
@@ -86,6 +87,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
.Case<VPExpandSCEVRecipe>(
[](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
+ .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) {
+ if (!L)
+ return SE.getCouldNotCompute();
+ const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
+ SCEV::FlagAnyWrap);
+ })
+ .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) {
+ const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+ const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L);
+ if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>))
+ return SE.getCouldNotCompute();
+
+ return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()),
+ SE.getMulExpr(IV, SE.getTruncateOrSignExtend(
+ Scale, IV->getType())));
+ })
+ .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) {
+ const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+ if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) ||
+ !Step->isOne())
+ return SE.getCouldNotCompute();
+ return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()),
+ Step);
+ })
+ .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
+ if (R->getOpcode() != Instruction::GetElementPtr)
+ return SE.getCouldNotCompute();
+
+ const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ if (isa<SCEVCouldNotCompute>(Base))
+ return SE.getCouldNotCompute();
+
+ SmallVector<const SCEV *> IndexExprs;
+ for (VPValue *Index : drop_begin(R->operands())) {
+ const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
+ if (isa<SCEVCouldNotCompute>(IndexExpr))
+ return SE.getCouldNotCompute();
+ IndexExprs.push_back(IndexExpr);
+ }
+
+ Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+ ->getSourceElementType();
+ return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
+ })
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 37cd413..c21a0e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
/// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
/// SCEV expression could be constructed.
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
+const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
+ const Loop *L = nullptr);
/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.