aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/IPO/ExpandVariadics.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp8
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp20
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp1
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp38
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp85
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp87
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp38
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp60
-rw-r--r--llvm/lib/Transforms/Utils/BasicBlockUtils.cpp73
-rw-r--r--llvm/lib/Transforms/Utils/ControlFlowUtils.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/FixIrreducible.cpp126
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp19
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp89
-rw-r--r--llvm/lib/Transforms/Utils/UnifyLoopExits.cpp77
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp39
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp26
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHelpers.h5
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp23
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp65
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp49
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h3
25 files changed, 685 insertions, 276 deletions
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 042578d..6a11aec 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
if (CB->isIndirectCall()) {
FunctionType *FTy = CB->getFunctionType();
if (FTy->isVarArg())
- Changed |= expandCall(M, Builder, CB, FTy, 0);
+ Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr);
}
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 5ba2167..cc53ec2 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
BasicBlock::iterator Pos) {
IRBuilder<> IRB(Pos->getParent(), Pos);
- Value *ShadowOffset = getShadowOffset(Addr, IRB);
- return getShadowAddress(Addr, Pos, ShadowOffset);
+ Value *ShadowAddr = getShadowOffset(Addr, IRB);
+ uint64_t ShadowBase = MapParams->ShadowBase;
+ if (ShadowBase != 0)
+ ShadowAddr =
+ IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase));
+ return getShadowAddress(Addr, Pos, ShadowAddr);
}
Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 7795cce..b5548d4 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -69,14 +69,6 @@ namespace llvm {
// Command line option to enable vtable value profiling. Defined in
// ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
extern cl::opt<bool> EnableVTableValueProfiling;
-// TODO: Remove -debug-info-correlate in next LLVM release, in favor of
-// -profile-correlate=debug-info.
-cl::opt<bool> DebugInfoCorrelate(
- "debug-info-correlate",
- cl::desc("Use debug info to correlate profiles. (Deprecated, use "
- "-profile-correlate=debug-info)"),
- cl::init(false));
-
LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(
"profile-correlate",
cl::desc("Use debug info or binary file to correlate profiles."),
@@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
// in lightweight mode. We need to move the value profile pointer to the
// Counter struct to get this working.
assert(
- !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE &&
+ ProfileCorrelate == InstrProfCorrelator::NONE &&
"Value profiling is not yet supported with lightweight instrumentation");
GlobalVariable *Name = Ind->getName();
auto It = ProfileDataMap.find(Name);
@@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) {
}
void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) {
- assert(!DebugInfoCorrelate &&
+ assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO &&
"Value profiling is not supported with lightweight instrumentation");
if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
return;
@@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
// Use internal rather than private linkage so the counter variable shows up
// in the symbol table when using debug info for correlation.
- if ((DebugInfoCorrelate ||
- ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) &&
+ if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO &&
TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage)
Linkage = GlobalValue::InternalLinkage;
@@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
PD.RegionCounters = CounterPtr;
- if (DebugInfoCorrelate ||
- ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
+ if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
LLVMContext &Ctx = M.getContext();
Function *Fn = Inc->getParent()->getParent();
if (auto *SP = Fn->getSubprogram()) {
@@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
- if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+ if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
return;
GlobalVariable *NamePtr = Inc->getName();
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a6ec6c1..2f256df 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -216,7 +216,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
}
LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
<< Reason << ".\n");
- return;
}
struct AllocMatchInfo {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 71736cf..af53fa0 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M,
ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;
if (PGOInstrumentLoopEntries)
ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES;
- if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+ if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;
if (PGOFunctionEntryCoverage)
ProfileVersion |=
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 66e45ec..e84ca81 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -122,16 +122,22 @@ static cl::opt<unsigned>
cl::desc("Maximum cost accepted for the transformation"),
cl::Hidden, cl::init(50));
-extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
-} // namespace llvm
-
static cl::opt<double> MaxClonedRate(
"dfa-max-cloned-rate",
cl::desc(
"Maximum cloned instructions rate accepted for the transformation"),
cl::Hidden, cl::init(7.5));
+static cl::opt<unsigned>
+ MaxOuterUseBlocks("dfa-max-out-use-blocks",
+ cl::desc("Maximum unduplicated blocks with outer uses "
+ "accepted for the transformation"),
+ cl::Hidden, cl::init(40));
+
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class SelectInstToUnfold {
SelectInst *SI;
@@ -965,8 +971,16 @@ private:
// SLPVectorizer.
// TODO: Thread the switch partially before reaching the threshold.
uint64_t NumOrigInst = 0;
- for (auto *BB : DuplicateMap.keys())
+ uint64_t NumOuterUseBlock = 0;
+ for (auto *BB : DuplicateMap.keys()) {
NumOrigInst += BB->sizeWithoutDebug();
+ // Only unduplicated blocks with single predecessor require new phi
+ // nodes.
+ for (auto *Succ : successors(BB))
+ if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor())
+ NumOuterUseBlock++;
+ }
+
if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much "
"instructions wll be cloned\n");
@@ -977,6 +991,20 @@ private:
return false;
}
+ // Too much unduplicated blocks with outer uses may cause too much
+ // insertions of phi nodes for duplicated definitions. TODO: Drop this
+ // threshold if we come up with another way to reduce the number of inserted
+ // phi nodes.
+ if (NumOuterUseBlock > MaxOuterUseBlocks) {
+ LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much "
+ "blocks with outer uses\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+ << "Too much blocks with outer uses.";
+ });
+ return false;
+ }
+
InstructionCost DuplicationCost = 0;
unsigned JumpTableSize = 0;
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 7ebcc21..4ba4ba3 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -162,8 +162,6 @@ class IndVarSimplify {
const SCEV *ExitCount,
PHINode *IndVar, SCEVExpander &Rewriter);
- bool sinkUnusedInvariants(Loop *L);
-
public:
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
return true;
}
-//===----------------------------------------------------------------------===//
-// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
-//===----------------------------------------------------------------------===//
-
-/// If there's a single exit block, sink any loop-invariant values that
-/// were defined in the preheader but not used inside the loop into the
-/// exit block to reduce register pressure in the loop.
-bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
- BasicBlock *ExitBlock = L->getExitBlock();
- if (!ExitBlock) return false;
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) return false;
-
- bool MadeAnyChanges = false;
- for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
-
- // Skip BB Terminator.
- if (Preheader->getTerminator() == &I)
- continue;
-
- // New instructions were inserted at the end of the preheader.
- if (isa<PHINode>(I))
- break;
-
- // Don't move instructions which might have side effects, since the side
- // effects need to complete before instructions inside the loop. Also don't
- // move instructions which might read memory, since the loop may modify
- // memory. Note that it's okay if the instruction might have undefined
- // behavior: LoopSimplify guarantees that the preheader dominates the exit
- // block.
- if (I.mayHaveSideEffects() || I.mayReadFromMemory())
- continue;
-
- // Skip debug or pseudo instructions.
- if (I.isDebugOrPseudoInst())
- continue;
-
- // Skip eh pad instructions.
- if (I.isEHPad())
- continue;
-
- // Don't sink alloca: we never want to sink static alloca's out of the
- // entry block, and correctly sinking dynamic alloca's requires
- // checks for stacksave/stackrestore intrinsics.
- // FIXME: Refactor this check somehow?
- if (isa<AllocaInst>(&I))
- continue;
-
- // Determine if there is a use in or before the loop (direct or
- // otherwise).
- bool UsedInLoop = false;
- for (Use &U : I.uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UseBB = User->getParent();
- if (PHINode *P = dyn_cast<PHINode>(User)) {
- unsigned i =
- PHINode::getIncomingValueNumForOperand(U.getOperandNo());
- UseBB = P->getIncomingBlock(i);
- }
- if (UseBB == Preheader || L->contains(UseBB)) {
- UsedInLoop = true;
- break;
- }
- }
-
- // If there is, the def must remain in the preheader.
- if (UsedInLoop)
- continue;
-
- // Otherwise, sink it to the exit block.
- I.moveBefore(ExitBlock->getFirstInsertionPt());
- SE->forgetValue(&I);
- MadeAnyChanges = true;
- }
-
- return MadeAnyChanges;
-}
-
static void replaceExitCond(BranchInst *BI, Value *NewCond,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
auto *OldCond = BI->getCondition();
@@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) {
// The Rewriter may not be used from this point on.
- // Loop-invariant instructions in the preheader that aren't used in the
- // loop may be sunk below the loop to reduce register pressure.
- Changed |= sinkUnusedInvariants(L);
-
// rewriteFirstIterationLoopExitValues does not rely on the computation of
// trip count and therefore can further simplify exit values in addition to
// rewriteLoopExitValues.
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index b2c526b..d13b990 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU);
-static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
+static void moveInstructionBefore(
+ Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+ MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
+
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+ Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+ SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
function_ref<void(Instruction *)> Fn);
@@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
: sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
MSSAU, &SafetyInfo, Flags, ORE);
+
+ // sink pre-header defs that are unused in-loop into the unique exit to reduce
+ // pressure.
+ Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
+ SE, DT, Flags, ORE);
+
Flags.setIsSink(false);
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater &MSSAU,
- ScalarEvolution *SE) {
+ MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+ MemorySSA::InsertionPlace Point) {
SafetyInfo.removeInstruction(&I);
SafetyInfo.insertInstructionTo(&I, Dest->getParent());
I.moveBefore(*Dest->getParent(), Dest);
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
MSSAU.getMemorySSA()->getMemoryAccess(&I)))
- MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
- MemorySSA::BeforeTerminator);
+ MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
if (SE)
SE->forgetBlockAndLoopDispositions(&I);
}
+// If there's a single exit block, sink any loop-invariant values that were
+// defined in the preheader but not used inside the loop into the exit block
+// to reduce register pressure in the loop.
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+ Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+ SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock)
+ return false;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ bool MadeAnyChanges = false;
+
+ for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
+
+ // Skip terminator.
+ if (Preheader->getTerminator() == &I)
+ continue;
+
+ // New instructions were inserted at the end of the preheader.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Note that
+ // it's okay if the instruction might have undefined behavior: LoopSimplify
+ // guarantees that the preheader dominates the exit block.
+ if (I.mayHaveSideEffects())
+ continue;
+
+ if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoopOrPreheader = false;
+ for (Use &U : I.uses()) {
+ auto *UserI = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = UserI->getParent();
+ if (auto *PN = dyn_cast<PHINode>(UserI)) {
+ UseBB = PN->getIncomingBlock(U);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoopOrPreheader = true;
+ break;
+ }
+ }
+ if (UsedInLoopOrPreheader)
+ continue;
+
+ moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
+ MSSAU, SE, MemorySSA::Beginning);
+ MadeAnyChanges = true;
+ }
+
+ return MadeAnyChanges;
+}
+
static Instruction *sinkThroughTriviallyReplaceablePHI(
PHINode *TPN, Instruction *I, LoopInfo *LI,
SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6..001215a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1318,6 +1318,11 @@ public:
/// the loop, in which case some special-case heuristics may be used.
bool AllFixupsOutsideLoop = true;
+ /// This records whether all of the fixups using this LSRUse are unconditional
+ /// within the loop, meaning they will be executed on every path to the loop
+ /// latch. This includes fixups before early exits.
+ bool AllFixupsUnconditional = true;
+
/// RigidFormula is set to true to guarantee that this use will be associated
/// with a single formula--the one that initially matched. Some SCEV
/// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
const SCEV *Start;
- const SCEVConstant *Step;
- if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+ const APInt *Step;
+ if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
- if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
- Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
- ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L)))
+ bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
+ F.BaseOffset.isFixed() &&
+ *Step == F.BaseOffset.getFixedValue();
+ bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
+ !isa<SCEVConstant>(Start) &&
+ SE->isLoopInvariant(Start, L);
+ // We can only pre or post index when the load/store is unconditional.
+ if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
LoopCost = 0;
+ }
}
+
// If the loop counts down to zero and we'll be using a hardware loop then
// the addrec will be combined into the hardware loop instruction.
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {
if (AllFixupsOutsideLoop)
OS << ", all-fixups-outside-loop";
+ if (AllFixupsUnconditional)
+ OS << ", all-fixups-unconditional";
+
if (WidestFixupType)
OS << ", widest fixup type: " << *WidestFixupType;
}
@@ -2213,6 +2227,7 @@ class LSRInstance {
void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);
bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+ bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
void CollectLoopInvariantFixupsAndFormulae();
@@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
// Create SCEV as Formula for calculating baseline cost
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
return true;
}
+/// Test whether this fixup will be executed each time the corresponding IV
+/// increment instruction is executed.
+bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
+ // If the fixup block dominates the IV increment block then there is no path
+ // through the loop to the increment that doesn't pass through the fixup.
+ return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
+}
+
/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable
/// for elimination.
@@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.OperandValToReplace = U;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+ LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
// Transfer the fixups of LU to LUThatHas.
for (LSRFixup &Fixup : LU.Fixups) {
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e043d07..08be5df 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
bool SrcNotDom = false;
auto CaptureTrackingWithModRef =
- [&](Instruction *AI,
- function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+ [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback,
+ bool &AddressCaptured) -> bool {
SmallVector<Instruction *, 8> Worklist;
Worklist.push_back(AI);
unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
@@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
if (!Visited.insert(&U).second)
continue;
UseCaptureInfo CI = DetermineUseCaptureKind(U, AI);
- if (capturesAnything(CI.UseCC))
+ if (capturesAnyProvenance(CI.UseCC))
return false;
+ AddressCaptured |= capturesAddress(CI.UseCC);
if (UI->mayReadOrWriteMemory()) {
if (UI->isLifetimeStartOrEnd()) {
@@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return true;
};
- if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+ bool DestAddressCaptured = false;
+ if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback,
+ DestAddressCaptured))
return false;
// Bailout if Dest may have any ModRef before Store.
if (!ReachabilityWorklist.empty() &&
@@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return true;
};
- if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+ bool SrcAddressCaptured = false;
+ if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback,
+ SrcAddressCaptured))
+ return false;
+
+ // If both the source and destination address are captured, the fact that they
+ // are no longer two separate allocations may be observed.
+ if (DestAddressCaptured && SrcAddressCaptured)
return false;
// We can do the transformation. First, move the SrcAlloca to the start of the
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 5af6c96..bb6c879 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -81,6 +81,7 @@ STATISTIC(
STATISTIC(NumInvariantConditionsInjected,
"Number of invariant conditions injected and unswitched");
+namespace llvm {
static cl::opt<bool> EnableNonTrivialUnswitch(
"enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
cl::desc("Forcibly enables non-trivial loop unswitching rather than "
@@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions(
static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(
"simple-loop-unswitch-inject-invariant-condition-hotness-threshold",
- cl::Hidden, cl::desc("Only try to inject loop invariant conditions and "
- "unswitch on them to eliminate branches that are "
- "not-taken 1/<this option> times or less."),
+ cl::Hidden,
+ cl::desc("Only try to inject loop invariant conditions and "
+ "unswitch on them to eliminate branches that are "
+ "not-taken 1/<this option> times or less."),
cl::init(16));
+static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile",
+ cl::Hidden, cl::init(true));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // namespace llvm
+
AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key;
namespace {
struct CompareDesc {
@@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L,
llvm_unreachable("Basic blocks should never be empty!");
}
-/// Copy a set of loop invariant values \p ToDuplicate and insert them at the
+/// Copy a set of loop invariant values \p Invariants and insert them at the
/// end of \p BB and conditionally branch on the copied condition. We only
/// branch on a single value.
+/// We attempt to estimate the profile of the resulting conditional branch from
+/// \p ComputeProfFrom, which is the original conditional branch we're
+/// unswitching.
+/// When \p Direction is true, the \p Invariants form a disjunction, and the
+/// branch conditioned on it exits the loop on the "true" case. When \p
+/// Direction is false, the \p Invariants form a conjunction and the branch
+/// exits on the "false" case.
static void buildPartialUnswitchConditionalBranch(
BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
- const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) {
+ const Instruction *I, AssumptionCache *AC, const DominatorTree &DT,
+ const BranchInst &ComputeProfFrom) {
+
+ SmallVector<uint32_t> BranchWeights;
+ bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(ComputeProfFrom, BranchWeights);
+ // If Direction is true, that means we had a disjunction and that the "true"
+ // case exits. The probability of the disjunction of the subset of terms is at
+ // most as high as the original one. So, if the probability is higher than the
+ // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5,
+ // but if it's lower, we will use the original probability.
+ // Conversely, if Direction is false, that means we had a conjunction, and the
+ // probability of exiting is captured in the second branch weight. That
+ // probability is a disjunction (of the negation of the original terms). The
+ // same reasoning applies as above.
+ // Issue #165649: should we expect BFI to conserve, and use that to calculate
+ // the branch weights?
+ if (HasBranchWeights &&
+ static_cast<double>(BranchWeights[Direction ? 0 : 1]) /
+ static_cast<double>(sum_of(BranchWeights)) >
+ 0.5)
+ HasBranchWeights = false;
+
IRBuilder<> IRB(&BB);
IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
@@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch(
Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants)
: IRB.CreateAnd(FrozenInvariants);
- IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
- Direction ? &NormalSucc : &UnswitchedSucc);
+ auto *BR = IRB.CreateCondBr(
+ Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc,
+ HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
+ : nullptr);
+ if (!HasBranchWeights)
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
}
/// Copy a set of loop invariant values, and conditionally branch on them.
@@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
" condition!");
buildPartialUnswitchConditionalBranch(
*OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH,
- FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT);
+ FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI);
}
// Update the dominator tree with the added edge.
@@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants(
else {
buildPartialUnswitchConditionalBranch(
*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH,
- FreezeLoopUnswitchCond, BI, &AC, DT);
+ FreezeLoopUnswitchCond, BI, &AC, DT, *BI);
}
DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 9829d4d..11db0ec 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName);
}
+/// Helper function to update the cycle or loop information after inserting a
+/// new block between a callbr instruction and one of its target blocks. Adds
+/// the new block to the innermost cycle or loop that the callbr instruction and
+/// the original target block share.
+/// \p LCI cycle or loop information to update
+/// \p CallBrBlock block containing the callbr instruction
+/// \p CallBrTarget new target block of the callbr instruction
+/// \p Succ original target block of the callbr instruction
+template <typename TI, typename T>
+static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock,
+ BasicBlock *CallBrTarget, BasicBlock *Succ) {
+ static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>,
+ "type must be CycleInfo or LoopInfo");
+ if (!LCI)
+ return false;
+
+ T *LC;
+ if constexpr (std::is_same_v<TI, CycleInfo>)
+ LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ);
+ else
+ LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ);
+ if (!LC)
+ return false;
+
+ if constexpr (std::is_same_v<TI, CycleInfo>)
+ LCI->addBlockToCycle(CallBrTarget, LC);
+ else
+ LC->addBasicBlockToLoop(CallBrTarget, *LCI);
+
+ return true;
+}
+
+BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ,
+ unsigned SuccIdx, DomTreeUpdater *DTU,
+ CycleInfo *CI, LoopInfo *LI,
+ bool *UpdatedLI) {
+ CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator());
+ assert(CallBr && "expected callbr terminator");
+ assert(SuccIdx < CallBr->getNumSuccessors() &&
+ Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index");
+
+ // Create a new block between callbr and the specified successor.
+ // splitBlockBefore cannot be re-used here since it cannot split if the split
+ // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot
+ // handle that). But we don't need to rewire every part of a potential PHI
+ // node. We only care about the edge between CallBrBlock and the original
+ // successor.
+ BasicBlock *CallBrTarget =
+ BasicBlock::Create(CallBrBlock->getContext(),
+ CallBrBlock->getName() + ".target." + Succ->getName(),
+ CallBrBlock->getParent());
+ // Rewire control flow from the new target block to the original successor.
+ Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget);
+ // Rewire control flow from callbr to the new target block.
+ CallBr->setSuccessor(SuccIdx, CallBrTarget);
+ // Jump from the new target block to the original successor.
+ BranchInst::Create(Succ, CallBrTarget);
+
+ bool Updated =
+ updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ);
+ if (UpdatedLI)
+ *UpdatedLI = Updated;
+ updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ);
+ if (DTU) {
+ DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}});
+ if (DTU->getDomTree().dominates(CallBrBlock, Succ))
+ DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ},
+ {DominatorTree::Insert, CallBrTarget, Succ}});
+ }
+
+ return CallBrTarget;
+}
+
void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {
if (auto *II = dyn_cast<InvokeInst>(TI))
II->setUnwindDest(Succ);
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 0046a00..287a177 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/Utils/ControlFlowUtils.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueHandle.h"
@@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
for (auto [BB, Succ0, Succ1] : Branches) {
#ifndef NDEBUG
- assert(Incoming.insert(BB).second && "Duplicate entry for incoming block.");
+ assert(
+ (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) &&
+ "Duplicate entry for incoming block.");
#endif
if (Succ0)
Outgoing.insert(Succ0);
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
index 45e1d12..804af22 100644
--- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -79,6 +79,53 @@
// Limitation: The pass cannot handle switch statements and indirect
// branches. Both must be lowered to plain branches first.
//
+// CallBr support: CallBr is handled as a more general branch instruction which
+// can have multiple successors. The pass redirects the edges to intermediate
+// target blocks that unconditionally branch to the original callbr target
+// blocks. This allows the control flow hub to know to which of the original
+// target blocks to jump to.
+// Example input CFG:
+// Entry (callbr)
+// / \
+// v v
+// H ----> B
+// ^ /|
+// `----' |
+// v
+// Exit
+//
+// becomes:
+// Entry (callbr)
+// / \
+// v v
+// target.H target.B
+// | |
+// v v
+// H ----> B
+// ^ /|
+// `----' |
+// v
+// Exit
+//
+// Note
+// OUTPUT CFG: Converted to a natural loop with a new header N.
+//
+// Entry (callbr)
+// / \
+// v v
+// target.H target.B
+// \ /
+// \ /
+// v v
+// N <---.
+// / \ \
+// / \ |
+// v v /
+// H --> B --'
+// |
+// v
+// Exit
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/FixIrreducible.h"
@@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
return false;
LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
ControlFlowHub CHub;
SetVector<BasicBlock *> Predecessors;
@@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
}
for (BasicBlock *P : Predecessors) {
- auto *Branch = cast<BranchInst>(P->getTerminator());
- // Exactly one of the two successors is the header.
- BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
- BasicBlock *Succ1 = Succ0 ? nullptr : Header;
- if (!Succ0)
- assert(Branch->getSuccessor(1) == Header);
- assert(Succ0 || Succ1);
- CHub.addBranch(P, Succ0, Succ1);
-
- LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> "
- << (Succ0 ? Succ0->getName() : "") << " "
- << (Succ1 ? Succ1->getName() : "") << "\n");
+ if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) {
+ // Exactly one of the two successors is the header.
+ BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
+ BasicBlock *Succ1 = Succ0 ? nullptr : Header;
+ assert(Succ0 || Branch->getSuccessor(1) == Header);
+ assert(Succ0 || Succ1);
+ CHub.addBranch(P, Succ0, Succ1);
+
+ LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P)
+ << " -> " << printBasicBlock(Succ0)
+ << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+ << '\n');
+ } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+ for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+ BasicBlock *Succ = CallBr->getSuccessor(I);
+ if (Succ != Header)
+ continue;
+ BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+ CHub.addBranch(NewSucc, Succ);
+ LLVM_DEBUG(dbgs() << "Added internal branch: "
+ << printBasicBlock(NewSucc) << " -> "
+ << printBasicBlock(Succ) << '\n');
+ }
+ } else {
+ llvm_unreachable("unsupported block terminator");
+ }
}
// Redirect external incoming edges. This includes the edges on the header.
@@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
}
for (BasicBlock *P : Predecessors) {
- auto *Branch = cast<BranchInst>(P->getTerminator());
- BasicBlock *Succ0 = Branch->getSuccessor(0);
- Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
- BasicBlock *Succ1 =
- Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
- Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
- CHub.addBranch(P, Succ0, Succ1);
-
- LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> "
- << (Succ0 ? Succ0->getName() : "") << " "
- << (Succ1 ? Succ1->getName() : "") << "\n");
+ if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) {
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
+ BasicBlock *Succ1 =
+ Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+ Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
+ CHub.addBranch(P, Succ0, Succ1);
+
+ LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P)
+ << " -> " << printBasicBlock(Succ0)
+ << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+ << '\n');
+ } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+ for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+ BasicBlock *Succ = CallBr->getSuccessor(I);
+ if (!C.contains(Succ))
+ continue;
+ BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+ CHub.addBranch(NewSucc, Succ);
+ LLVM_DEBUG(dbgs() << "Added external branch: "
+ << printBasicBlock(NewSucc) << " -> "
+ << printBasicBlock(Succ) << '\n');
+ }
+ } else {
+ llvm_unreachable("unsupported block terminator");
+ }
}
// Redirect all the backedges through a "hub" consisting of a series
@@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
SetVector<BasicBlock *> Entries;
Entries.insert(C.entry_rbegin(), C.entry_rend());
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
CHub.finalize(&DTU, GuardBlocks, "irr");
#if defined(EXPENSIVE_CHECKS)
assert(DT.verify(DominatorTree::VerificationLevel::Full));
@@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT,
LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
<< F.getName() << "\n");
- assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
bool Changed = false;
for (Cycle *TopCycle : CI.toplevel_cycles()) {
for (Cycle *C : depth_first(TopCycle)) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6312831..7a2b8da 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -460,25 +460,10 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
Loop *NewLoop = NewLoops[L];
assert(NewLoop && "L should have been cloned");
- MDNode *LoopID = NewLoop->getLoopID();
-
- // Only add loop metadata if the loop is not going to be completely
- // unrolled.
- if (UnrollRemainder)
- return NewLoop;
-
- std::optional<MDNode *> NewLoopID = makeFollowupLoopID(
- LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
- if (NewLoopID) {
- NewLoop->setLoopID(*NewLoopID);
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been defined
- // explicitly.
- return NewLoop;
- }
// Add unroll disable metadata to disable future unrolling for this loop.
- NewLoop->setLoopAlreadyUnrolled();
+ if (!UnrollRemainder)
+ NewLoop->setLoopAlreadyUnrolled();
return NewLoop;
}
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a9ab3b3..27fed73 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -809,7 +809,6 @@ public:
void emitInstructionAnnot(const Instruction *I,
formatted_raw_ostream &OS) override {
if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
- OS << "; Has predicate info\n";
if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
<< " Comparison:" << *PB->Condition << " Edge: [";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c537be5c..b03fb62 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
// If either of the blocks has it's address taken, then we can't do this fold,
// because the code we'd hoist would no longer run when we jump into the block
// by it's address.
- for (auto *Succ : successors(BB))
- if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+ for (auto *Succ : successors(BB)) {
+ if (Succ->hasAddressTaken())
return false;
-
+ if (Succ->getSinglePredecessor())
+ continue;
+ // If Succ has >1 predecessors, continue to check if the Succ contains only
+ // one `unreachable` inst. Since executing `unreachable` inst is an UB, we
+ // can relax the condition based on the assumptiom that the program would
+ // never enter Succ and trigger such an UB.
+ if (isa<UnreachableInst>(*Succ->begin()))
+ continue;
+ return false;
+ }
// The second of pair is a SkipFlags bitmask.
using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
SmallVector<SuccIterPair, 8> SuccIterPairs;
@@ -5228,32 +5237,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
}
- // Create the new switch instruction now.
- SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
- if (HasProfile) {
- // We know the weight of the default case. We don't know the weight of the
- // other cases, but rather than completely lose profiling info, we split
- // the remaining probability equally over them.
- SmallVector<uint32_t> NewWeights(Values.size() + 1);
- NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
- // TrueWhenEqual.
- for (auto &V : drop_begin(NewWeights))
- V = BranchWeights[0] / Values.size();
- setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
- }
-
- // Add all of the 'cases' to the switch instruction.
- for (ConstantInt *Val : Values)
- New->addCase(Val, EdgeBB);
+ // Check if we can represent the values as a contiguous range. If so, we use a
+ // range check + conditional branch instead of a switch.
+ if (Values.front()->getValue() - Values.back()->getValue() ==
+ Values.size() - 1) {
+ ConstantRange RangeToCheck = ConstantRange::getNonEmpty(
+ Values.back()->getValue(), Values.front()->getValue() + 1);
+ APInt Offset, RHS;
+ ICmpInst::Predicate Pred;
+ RangeToCheck.getEquivalentICmp(Pred, RHS, Offset);
+ Value *X = CompVal;
+ if (!Offset.isZero())
+ X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset));
+ Value *Cond =
+ Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS));
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
+ if (HasProfile)
+ setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false);
+ // We don't need to update PHI nodes since we don't add any new edges.
+ } else {
+ // Create the new switch instruction now.
+ SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+ if (HasProfile) {
+ // We know the weight of the default case. We don't know the weight of the
+ // other cases, but rather than completely lose profiling info, we split
+ // the remaining probability equally over them.
+ SmallVector<uint32_t> NewWeights(Values.size() + 1);
+ NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped
+ // if TrueWhenEqual.
+ for (auto &V : drop_begin(NewWeights))
+ V = BranchWeights[0] / Values.size();
+ setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+ }
- // We added edges from PI to the EdgeBB. As such, if there were any
- // PHI nodes in EdgeBB, they need entries to be added corresponding to
- // the number of edges added.
- for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- Value *InVal = PN->getIncomingValueForBlock(BB);
- for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
- PN->addIncoming(InVal, BB);
+ // Add all of the 'cases' to the switch instruction.
+ for (ConstantInt *Val : Values)
+ New->addCase(Val, EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(BB);
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+ PN->addIncoming(InVal, BB);
+ }
}
// Erase the old branch instruction.
@@ -7603,7 +7632,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
auto *DefaultCaseBB = SI->getDefaultDest();
BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
auto It = OrigBB->getTerminator()->getIterator();
- BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ // BI is handling the default case for SI, and so should share its DebugLoc.
+ BI->setDebugLoc(SI->getDebugLoc());
It->eraseFromParent();
addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 9f338db..94c5c170 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -12,7 +12,11 @@
//
// Limitation: This assumes that all terminators in the CFG are direct branches
// (the "br" instruction). The presence of any other control flow
-// such as indirectbr, switch or callbr will cause an assert.
+// such as indirectbr or switch will cause an assert.
+// The callbr terminator is supported by creating intermediate
+// target blocks that unconditionally branch to the original target
+// blocks. These intermediate target blocks can then be redirected
+// through the ControlFlowHub as usual.
//
//===----------------------------------------------------------------------===//
@@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
SmallVector<BasicBlock *, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;
// Redirect exiting edges through a control flow hub.
ControlFlowHub CHub;
- for (auto *BB : ExitingBlocks) {
- auto *Branch = cast<BranchInst>(BB->getTerminator());
- BasicBlock *Succ0 = Branch->getSuccessor(0);
- Succ0 = L->contains(Succ0) ? nullptr : Succ0;
-
- BasicBlock *Succ1 =
- Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
- Succ1 = L->contains(Succ1) ? nullptr : Succ1;
- CHub.addBranch(BB, Succ0, Succ1);
-
- LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {"
- << (Succ0 ? Succ0->getName() : "<none>") << ", "
- << (Succ1 ? Succ1->getName() : "<none>") << "}\n");
+
+ for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
+ BasicBlock *BB = ExitingBlocks[I];
+ if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) {
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+
+ BasicBlock *Succ1 =
+ Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+ Succ1 = L->contains(Succ1) ? nullptr : Succ1;
+ CHub.addBranch(BB, Succ0, Succ1);
+
+ LLVM_DEBUG(dbgs() << "Added extiting branch: " << printBasicBlock(BB)
+ << " -> " << printBasicBlock(Succ0)
+ << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+ << '\n');
+ } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) {
+ for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) {
+ BasicBlock *Succ = CallBr->getSuccessor(J);
+ if (L->contains(Succ))
+ continue;
+ bool UpdatedLI = false;
+ BasicBlock *NewSucc =
+ SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+ // Even if CallBr and Succ do not have a common parent loop, we need to
+ // add the new target block to the parent loop of the current loop.
+ if (!UpdatedLI)
+ CallBrTargetBlocksToFix.push_back(NewSucc);
+ // ExitingBlocks is later used to restore SSA, so we need to make sure
+ // that the blocks used for phi nodes in the guard blocks match the
+ // predecessors of the guard blocks, which, in the case of callbr, are
+ // the new intermediate target blocks instead of the callbr blocks
+ // themselves.
+ ExitingBlocks[I] = NewSucc;
+ CHub.addBranch(NewSucc, Succ);
+ LLVM_DEBUG(dbgs() << "Added exiting branch: "
+ << printBasicBlock(NewSucc) << " -> "
+ << printBasicBlock(Succ) << '\n');
+ }
+ } else {
+ llvm_unreachable("unsupported block terminator");
+ }
}
SmallVector<BasicBlock *, 8> GuardBlocks;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
BasicBlock *LoopExitBlock;
bool ChangedCFG;
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
@@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
// The guard blocks were created outside the loop, so they need to become
// members of the parent loop.
- if (auto ParentLoop = L->getParentLoop()) {
+ // Same goes for the callbr target blocks. Although we try to add them to the
+ // smallest common parent loop of the callbr block and the corresponding
+ // original target block, there might not have been such a loop, in which case
+ // the newly created callbr target blocks are not part of any loop. For nested
+ // loops, this might result in them leading to a loop with multiple entry
+ // points.
+ if (auto *ParentLoop = L->getParentLoop()) {
for (auto *G : GuardBlocks) {
ParentLoop->addBasicBlockToLoop(G, LI);
}
+ for (auto *C : CallBrTargetBlocksToFix) {
+ ParentLoop->addBasicBlockToLoop(C, LI);
+ }
ParentLoop->verifyLoop();
}
@@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
return runImpl(LI, DT);
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f7968ab..25bf49d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
continue;
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
getMemoryInstructionCost(I, ElementCount::getFixed(1))));
UpdateMemOpUserCost(cast<LoadInst>(I));
} else if (const auto *Group = getInterleavedAccessGroup(I)) {
- // Scalarize an interleave group of address loads.
- for (unsigned I = 0; I < Group->getFactor(); ++I) {
- if (Instruction *Member = Group->getMember(I)) {
- setWideningDecision(
- Member, VF, CM_Scalarize,
- (VF.getKnownMinValue() *
- getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+ // Scalarize all members of this interleaved group when any member
+ // is used as an address. The address-used load skips scalarization
+ // overhead, other members include it.
+ for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+ if (Instruction *Member = Group->getMember(Idx)) {
+ InstructionCost Cost =
+ AddrDefs.contains(Member)
+ ? (VF.getKnownMinValue() *
+ getMemoryInstructionCost(Member,
+ ElementCount::getFixed(1)))
+ : getMemInstScalarizationCost(Member, VF);
+ setWideningDecision(Member, VF, CM_Scalarize, Cost);
UpdateMemOpUserCost(cast<LoadInst>(Member));
}
}
@@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
+ OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
+ !Legal->getLAI()->getSymbolicStrides().empty() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
@@ -8335,11 +8342,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
&R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
-
- // FIXME: VPlan0, which models a copy of the original scalar loop, should
- // not use VPWidenPHIRecipe to model the phis.
- assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
- UnderlyingValue && "unsupported recipe");
+ assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
// TODO: Gradually replace uses of underlying instruction by analyses on
// VPlan.
@@ -8440,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
- *CM.PSE.getSE());
+ *CM.PSE.getSE(), OrigLoop);
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -9910,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind, *CM.PSE.getSE());
+ CM.CostKind, *CM.PSE.getSE(), L);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4fcaf6d..1b55a3b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5608,6 +5608,7 @@ private:
for (ScheduleBundle *Bundle : Bundles) {
if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
break;
+ SmallPtrSet<Value *, 4> ParentsUniqueUsers;
// Need to search for the lane since the tree entry can be
// reordered.
auto *It = find(Bundle->getTreeEntry()->Scalars, In);
@@ -5636,6 +5637,22 @@ private:
Bundle->getTreeEntry()->isCopyableElement(In)) &&
"Missed TreeEntry operands?");
+ bool IsNonSchedulableWithParentPhiNode =
+ Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
+ Bundle->getTreeEntry()->UserTreeIndex &&
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
+ Instruction::PHI;
+ // Count the number of unique phi nodes, which are the parent for
+ // parent entry, and exit, if all the unique phis are processed.
+ if (IsNonSchedulableWithParentPhiNode) {
+ const TreeEntry *ParentTE =
+ Bundle->getTreeEntry()->UserTreeIndex.UserTE;
+ Value *User = ParentTE->Scalars[Lane];
+ if (!ParentsUniqueUsers.insert(User).second)
+ break;
+ }
+
for (unsigned OpIdx :
seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
if (auto *I = dyn_cast<Instruction>(
@@ -5644,8 +5661,8 @@ private:
<< *I << "\n");
DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
}
- // If parent node is schedulable, it will be handle correctly.
- if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+ // If parent node is schedulable, it will be handled correctly.
+ if (!IsNonSchedulableWithParentPhiNode)
break;
It = std::find(std::next(It),
Bundle->getTreeEntry()->Scalars.end(), In);
@@ -16903,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// otherwise TEPtr depends on TE.
if ((TEInsertBlock != InsertPt->getParent() ||
TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
- !CheckOrdering(InsertPt))
+ (!CheckOrdering(InsertPt) ||
+ (UseEI.UserTE->hasCopyableElements() &&
+ isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
+ is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
continue;
// The node is reused - exit.
if (CheckAndUseSameNode(TEPtr))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 2aaabd9..965426f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -350,13 +350,14 @@ struct VPCostContext {
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
ScalarEvolution &SE;
+ const Loop *L;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind,
- ScalarEvolution &SE)
+ ScalarEvolution &SE, const Loop *L)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind), SE(SE) {}
+ CostKind(CostKind), SE(SE), L(L) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a63c80..bde62dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost. Computing SCEVs for VPValues is incomplete and returns
+/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
+/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+ const Loop *L) {
auto *PtrR = Ptr->getDefiningRecipe();
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
Instruction::GetElementPtr) ||
isa<VPWidenGEPRecipe>(PtrR) ||
match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
- return false;
+ return nullptr;
// We are looking for a GEP where all indices are either loop invariant or
// inductions.
for (VPValue *Opd : drop_begin(PtrR->operands())) {
if (!Opd->isDefinedOutsideLoopRegions() &&
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
- return false;
+ return nullptr;
}
- return true;
+ return vputils::getSCEVExprForVPValue(Ptr, SE, L);
}
/// Returns true if \p V is used as part of the address of another load or
@@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
bool IsLoad = UI->getOpcode() == Instruction::Load;
const VPValue *PtrOp = getOperand(!IsLoad);
- // TODO: Handle cases where we need to pass a SCEV to
- // getAddressComputationCost.
- if (shouldUseAddressAccessSCEV(PtrOp))
+ const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
+ if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
break;
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
@@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
InstructionCost ScalarCost =
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
- nullptr, Ctx.CostKind);
+ PtrSCEV, Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acad795..4d98014 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Sub = VecOp->getDefiningRecipe();
VecOp = Tmp;
}
+
+ // If ValB is a constant and can be safely extended, truncate it to the same
+ // type as ExtA's operand, then extend it to the same type as ExtA. This
+ // creates two uniform extends that can more easily be matched by the rest of
+ // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+ // replaced with the new extend of the constant.
+ auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+ VPWidenCastRecipe *&ExtB,
+ VPValue *&ValB, VPWidenRecipe *Mul) {
+ if (!ExtA || ExtB || !ValB->isLiveIn())
+ return;
+ Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+ Instruction::CastOps ExtOpc = ExtA->getOpcode();
+ const APInt *Const;
+ if (!match(ValB, m_APInt(Const)) ||
+ !llvm::canConstantBeExtended(
+ Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+ return;
+ // The truncate ensures that the type of each extended operand is the
+ // same, and it's been proven that the constant can be extended from
+ // NarrowTy safely. Necessary since ExtA's extended operand would be
+ // e.g. an i8, while the const will likely be an i32. This will be
+ // elided by later optimisations.
+ VPBuilder Builder(Mul);
+ auto *Trunc =
+ Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+ Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+ ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+ Mul->setOperand(1, ExtB);
+ };
+
// Try to match reduce.add(mul(...)).
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
auto *RecipeA =
@@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+ // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+ ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
// Match reduce.add/sub(mul(ext, ext)).
if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenRecipe>(Sub), Red);
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
}
- // Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
@@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// variants.
if (Sub)
return nullptr;
- // Match reduce.add(ext(mul(ext(A), ext(B)))).
- // All extend recipes must have same opcode or A == B
- // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
- if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
- m_ZExtOrSExt(m_VPValue()))))) {
+
+ // Match reduce.add(ext(mul(A, B))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
- auto *Ext0 =
- cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
- auto *Ext1 =
- cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
- if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+ // reduce.add(ext(mul(ext, const)))
+ // -> reduce.add(ext(mul(ext, ext(const))))
+ ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+ // reduce.add(ext(mul(ext(A), ext(B))))
+ // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+ // The inner extends must either have the same opcode as the outer extend or
+ // be the same, in which case the multiply can never result in a negative
+ // value and the outer extend can be folded away by doing wider
+ // extends for the operands of the mul.
+ if (Ext0 && Ext1 &&
+ (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
@@ -4021,7 +4062,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
const DataLayout &DL = SE.getDataLayout();
- SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
+ SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
BasicBlock *EntryBB = Entry->getIRBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4db92e7..54348c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
B == Plan.getBackedgeTakenCount();
}
-const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
+const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
+ ScalarEvolution &SE, const Loop *L) {
if (V->isLiveIn()) {
if (Value *LiveIn = V->getLiveInIRValue())
return SE.getSCEV(LiveIn);
@@ -86,6 +87,52 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
.Case<VPExpandSCEVRecipe>(
[](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
+ .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) {
+ if (!L)
+ return SE.getCouldNotCompute();
+ const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
+ SCEV::FlagAnyWrap);
+ })
+ .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) {
+ const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+ const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L);
+ if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>))
+ return SE.getCouldNotCompute();
+
+ return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()),
+ SE.getMulExpr(IV, SE.getTruncateOrSignExtend(
+ Scale, IV->getType())));
+ })
+ .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) {
+ const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L);
+ if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step))
+ return SE.getCouldNotCompute();
+ return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()),
+ Step);
+ })
+ .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
+ if (R->getOpcode() != Instruction::GetElementPtr)
+ return SE.getCouldNotCompute();
+
+ const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+ if (isa<SCEVCouldNotCompute>(Base))
+ return SE.getCouldNotCompute();
+
+ SmallVector<const SCEV *> IndexExprs;
+ for (VPValue *Index : drop_begin(R->operands())) {
+ const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
+ if (isa<SCEVCouldNotCompute>(IndexExpr))
+ return SE.getCouldNotCompute();
+ IndexExprs.push_back(IndexExpr);
+ }
+
+ Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+ ->getSourceElementType();
+ return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
+ })
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 37cd413..c21a0e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
/// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
/// SCEV expression could be constructed.
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
+const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
+ const Loop *L = nullptr);
/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.