Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp             |  61
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp                      |  16
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp           |  16
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp                |   3
-rw-r--r--  llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp            |  19
-rw-r--r--  llvm/lib/Transforms/Scalar/StructurizeCFG.cpp                |  19
-rw-r--r--  llvm/lib/Transforms/Utils/CodeExtractor.cpp                  |   1
-rw-r--r--  llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp         |  48
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp                          |   6
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp              |  21
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                    | 149
-rw-r--r--  llvm/lib/Transforms/Utils/UnifyLoopExits.cpp                 |   6
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  |   5
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp            |  43
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp              |   7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp              |  27
16 files changed, 361 insertions(+), 86 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 5ed47ae..a6ac761 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final
// ------------------------ Align Argument Attribute ------------------------
namespace {
+
static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
Value &AssociatedValue, const Use *U,
const Instruction *I, bool &TrackUse) {
@@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
TrackUse = true;
return 0;
}
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::ptrmask: {
+ // Is it appropriate to pull in the attribute during initialization?
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+ QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE);
+ const auto *AlignAA = A.getAAFor<AAAlign>(
+ QueryingAA, IRPosition::value(*II), DepClassTy::NONE);
+ if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) {
+ unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(),
+ Value::MaxAlignmentExponent);
+ Align ConstAlign(UINT64_C(1) << ShiftValue);
+ if (AlignAA && ConstAlign >= AlignAA->getKnownAlign())
+ return Align(1).value();
+ }
+ if (AlignAA)
+ return AlignAA->getKnownAlign().value();
+ break;
+ }
+ default:
+ break;
+ }
MaybeAlign MA;
if (const auto *CB = dyn_cast<CallBase>(I)) {
@@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final
AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
: Base(IRP, A) {}
+ ChangeStatus updateImpl(Attributor &A) override {
+ Instruction *I = getIRPosition().getCtxI();
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::ptrmask: {
+ Align Alignment;
+ bool Valid = false;
+
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+ *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED);
+ if (ConstVals && ConstVals->isValidState()) {
+ unsigned ShiftValue =
+ std::min(ConstVals->getAssumedMinTrailingZeros(),
+ Value::MaxAlignmentExponent);
+ Alignment = Align(UINT64_C(1) << ShiftValue);
+ Valid = true;
+ }
+
+ const auto *AlignAA =
+ A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+ DepClassTy::REQUIRED);
+ if (AlignAA && AlignAA->isValidState()) {
+ Alignment = std::max(AlignAA->getAssumedAlign(), Alignment);
+ Valid = true;
+ }
+
+ if (Valid)
+ return clampStateAndIndicateChange<StateType>(
+ this->getState(),
+ std::min(this->getAssumedAlign(), Alignment).value());
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ return Base::updateImpl(A);
+ }
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
};
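Why the ptrmask handling above is sound: ptrmask(ptr, mask) clears every pointer bit below the mask's lowest set bit, so the result is aligned to at least 1 << countr_zero(mask), and it also keeps any alignment ptr already had, since ptr & mask preserves ptr's low zero bits. A minimal standalone sketch of combining the two bounds (plain C++20; ptrmaskAlign is a hypothetical name, not Attributor API):

    #include <algorithm>
    #include <bit>
    #include <cstdint>

    // Conservative alignment (in bytes) of ptrmask(ptr, Mask), given the known
    // alignment of ptr. Mirrors the std::max of the two bounds in the patch.
    uint64_t ptrmaskAlign(uint64_t PtrAlign, uint64_t Mask) {
      // Low zero bits of the mask are cleared in the result, so the result is
      // aligned to 2^TZ. Cap the exponent, as the patch does with
      // Value::MaxAlignmentExponent.
      unsigned TZ = std::min(std::countr_zero(Mask), 32);
      uint64_t MaskAlign = uint64_t(1) << TZ;
      // Both guarantees hold simultaneously; take the stronger one.
      return std::max(PtrAlign, MaskAlign);
    }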
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 19eccb9..9ffa602 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1796,14 +1796,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
@@ -2092,14 +2094,16 @@ private:
// mergeLatch may remove the only block in FC1.
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
- // Forget block dispositions as well, so that there are no dangling
- // pointers to erased/free'ed blocks.
- SE.forgetBlockAndLoopDispositions();
// Move instructions from FC0.Latch to FC1.Latch.
// Note: mergeLatch requires an updated DT.
mergeLatch(FC0, FC1);
+ // Forget block dispositions as well, so that there are no dangling
+ // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+ // since merging the latches may affect the dispositions.
+ SE.forgetBlockAndLoopDispositions();
+
// Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
for (BasicBlock *BB : Blocks) {
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index a883998..1b770be 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate {
/// Return true if the dependence from the store to the load has an
/// absolute distance of one.
/// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L,
+ const DominatorTree &DT) const {
Value *LoadPtr = Load->getPointerOperand();
Value *StorePtr = Store->getPointerOperand();
Type *LoadType = getLoadStoreType(Load);
@@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate {
DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
"Should be a known dependence");
- int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
- int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
+ int64_t StrideLoad =
+ getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0);
+ int64_t StrideStore =
+ getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0);
if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
return false;
@@ -287,8 +289,8 @@ public:
// so deciding which one forwards is easy. The later one forwards as
// long as they both have a dependence distance of one to the load.
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ Cand.isDependenceDistanceOfOne(PSE, L, *DT) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) {
// They are in the same block, the later one will forward to the load.
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
OtherCand = &Cand;
@@ -538,7 +540,7 @@ public:
// Check whether the SCEV difference is the same as the induction step,
// thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT))
continue;
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 2bda9d8..802ae4e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
}
// Do not attempt partial/runtime unrolling in FullLoopUnrolling
- if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {
+ if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
+ UP.Count < TripCount || UP.Count < MaxTripCount)) {
LLVM_DEBUG(
dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
return LoopUnrollResult::Unmodified;
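The new clause matters precisely when neither count is known: both are then 0, the two comparisons are false, and the old guard fell through to partial/runtime unrolling even though the caller requested full unrolling only. A sketch of the corrected predicate, under this pass's convention that a count of 0 means unknown:

    #include <cstdint>

    // Should tryToUnrollLoop bail out in full-unroll-only mode? A count of 0
    // means "unknown" here, matching the pass's convention.
    bool skipPartialUnroll(bool OnlyFullUnroll, uint64_t Count,
                           uint64_t TripCount, uint64_t MaxTripCount) {
      return OnlyFullUnroll &&
             ((!TripCount && !MaxTripCount) || // nothing known: can't full-unroll
              Count < TripCount || Count < MaxTripCount); // count not a full unroll
    }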
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index bb6c879..239526e 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch(
static void buildPartialInvariantUnswitchConditionalBranch(
BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,
BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L,
- MemorySSAUpdater *MSSAU) {
+ MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) {
ValueToValueMapTy VMap;
for (auto *Val : reverse(ToDuplicate)) {
Instruction *Inst = cast<Instruction>(Val);
@@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch(
IRBuilder<> IRB(&BB);
IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
Value *Cond = VMap[ToDuplicate[0]];
- IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
- Direction ? &NormalSucc : &UnswitchedSucc);
+ // The expectation is that ToDuplicate[0] is the condition used by the
+ // OriginalBranch, in which case we can clone the profile metadata from there.
+ auto *ProfData =
+ !ProfcheckDisableMetadataFixes &&
+ ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition())
+ ? OriginalBranch.getMetadata(LLVMContext::MD_prof)
+ : nullptr;
+ auto *BR =
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
+ if (!ProfData)
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
+ DEBUG_TYPE);
}
/// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants(
// the branch in the split block.
if (PartiallyInvariant)
buildPartialInvariantUnswitchConditionalBranch(
- *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU);
+ *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI);
else {
buildPartialUnswitchConditionalBranch(
*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH,
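The IRBuilder overload used above accepts branch-weight metadata directly, so the rebuilt branch keeps the original's profile whenever the condition carries over unchanged. A reduced sketch of the pattern (cloneCondBrWithProf is a hypothetical helper, not part of the patch):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Re-create a conditional branch at IRB's insertion point, carrying the
    // original's !prof weights only when the same condition is reused;
    // otherwise the weights would describe the wrong predicate.
    BranchInst *cloneCondBrWithProf(IRBuilder<> &IRB, Value *Cond,
                                    BasicBlock *TrueSucc, BasicBlock *FalseSucc,
                                    const BranchInst &Original) {
      MDNode *Prof = Original.getCondition() == Cond
                         ? Original.getMetadata(LLVMContext::MD_prof)
                         : nullptr;
      return IRB.CreateCondBr(Cond, TrueSucc, FalseSucc, Prof);
    }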
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5f6f66a..0a8f5ea 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
} else {
// Test for successors as back edge
BasicBlock *BB = N->getNodeAs<BasicBlock>();
- BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
- for (BasicBlock *Succ : Term->successors())
- if (Visited.count(Succ))
- Loops[Succ] = BB;
+ if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
}
}
@@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
for (BasicBlock *P : predecessors(BB)) {
// Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P))
+ if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator()))
continue;
Region *R = RI->getRegionFor(P);
@@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
const TargetTransformInfo *TTI) {
- if (R->isTopLevelRegion())
+ // CallBr and its corresponding direct target blocks are for now ignored by
+ // this pass. This is not a limitation for the currently intended use cases
+ // of callbr in the AMDGPU backend.
+ // Parent and child regions are not affected by this (current) restriction.
+ // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
+ if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))
return false;
this->DT = DT;
this->TTI = TTI;
Func = R->getEntry()->getParent();
- assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
ParentRegion = R;
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 5ba6f95f..6086615 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::CoroDestroyOnlyWhenComplete:
case Attribute::CoroElideSafe:
case Attribute::NoDivergenceSource:
+ case Attribute::NoCreateUndefOrPoison:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 0642d51..6d4436b 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -16,22 +16,62 @@
using namespace llvm;
+static void mergeAttributes(LLVMContext &Ctx, const Module &M,
+ const DataLayout &DL, const Triple &TT,
+ Function *Func, FunctionType *FuncTy,
+ AttributeList FuncAttrs) {
+ AttributeList OldAttrs = Func->getAttributes();
+ AttributeList NewAttrs = OldAttrs;
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder);
+ }
+
+ {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs());
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs());
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder);
+ }
+
+ for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) {
+ AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I));
+ AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I));
+ OldBuilder.merge(NewBuilder);
+ NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder);
+ }
+
+ Func->setAttributes(NewAttrs);
+}
+
PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
ModuleAnalysisManager &MAM) {
RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple());
LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ const Triple &TT = M.getTargetTriple();
for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
if (Impl == RTLIB::Unsupported)
continue;
- // TODO: Declare with correct type, calling convention, and attributes.
+ auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
- FunctionType *FuncTy =
- FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
+ // TODO: Declare with correct type, calling convention, and attributes.
+ if (!FuncTy)
+ FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
StringRef FuncName = RTLCI.getLibcallImplName(Impl);
- M.getOrInsertFunction(FuncName, FuncTy);
+
+ Function *Func =
+ cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee());
+ if (Func->getFunctionType() == FuncTy) {
+ mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs);
+ Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl));
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 46f2903..a03cf6e 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
// Create integer constant expression.
auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * {
const APInt &API = cast<ConstantInt>(&CV)->getValue();
- std::optional<int64_t> InitIntOpt = API.trySExtValue();
+ std::optional<int64_t> InitIntOpt;
+ if (API.getBitWidth() == 1)
+ InitIntOpt = API.tryZExtValue();
+ else
+ InitIntOpt = API.trySExtValue();
return InitIntOpt ? DIB.createConstantValueExpression(
static_cast<uint64_t>(*InitIntOpt))
: nullptr;
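The special case exists because a 1-bit APInt treats its only bit as the sign bit: the boolean true sign-extends to -1 but zero-extends to 1, and debug values for booleans should read 0/1. A tiny worked example:

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using namespace llvm;

    void i1ExtensionExample() {
      APInt True(/*numBits=*/1, /*val=*/1);
      // The single bit doubles as the sign bit under sign extension.
      assert(*True.trySExtValue() == -1); // surprising for a boolean
      assert(*True.tryZExtValue() == 1);  // what debug info should show
    }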
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 1e8f6cc..6c9467b 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
/// probability of executing at least one more iteration?
static BranchProbability
probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+ // OriginalLoopProb == 1 would produce a division by zero in the calculation
+ // below. That case indicates an always-infinite loop, and a remainder trip
+ // count cannot be computed at run time when the original loop is infinite,
+ // as infinity % UnrollCount is undefined. We instead choose probabilities
+ // indicating that all remainder loop iterations will always execute.
+ //
+ // Currently, the remainder loop here is an epilogue, which cannot be reached
+ // if the original loop is infinite, so the aforementioned choice is
+ // arbitrary.
+ //
+ // FIXME: Branch weights still need to be fixed in the case of prologues
+ // (issue #135812). In that case, the aforementioned choice seems reasonable
+ // for the goal of maintaining the original loop's block frequencies. That
+ // is, an infinite loop's initial iterations are not skipped, and the prologue
+ // loop body might have unique blocks that execute a finite number of times
+ // if, for example, the original loop body contains conditionals like i <
+ // UnrollCount.
+ if (OriginalLoopProb == BranchProbability::getOne())
+ return BranchProbability::getOne();
+
// Each of these variables holds the original loop's probability that the
// number of iterations it will execute is some m in the specified range.
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
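The guarded degenerate case is easiest to see in the geometric model behind these probabilities: if the back edge is taken with probability p on each iteration, then P(trip count >= m) = p^(m-1) and the expected trip count is 1/(1-p), which has no finite value at p == 1. A toy sketch of that blow-up (plain C++, not the BranchProbability arithmetic above):

    #include <cassert>
    #include <limits>

    // Expected trip count of a loop whose back edge is taken with probability p.
    // Geometric model: P(trip count >= m) = p^(m-1), so E[trips] = 1 / (1 - p).
    double expectedTripCount(double p) {
      assert(p >= 0.0 && p <= 1.0);
      if (p == 1.0) // always-infinite loop; mirrors the early return above
        return std::numeric_limits<double>::infinity();
      return 1.0 / (1.0 - p);
    }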
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index cbc604e..3a3e3ad 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -778,8 +778,10 @@ private:
return false;
// Add all values from the range to the set
- for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ APInt Tmp = Span.getLower();
+ do
Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+ while (++Tmp != Span.getUpper());
UsedICmps++;
return true;
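The loop-shape change matters when the span wraps or covers the whole domain: a full ConstantRange has getLower() == getUpper(), so the old pre-test loop added no values at all, while the post-test loop visits every value once before arriving back at the upper bound. A reduced sketch, assuming the range is known non-empty as the surrounding checks ensure:

    #include "llvm/ADT/APInt.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Enumerate [Lower, Upper) with wrap-around. For a full 8-bit range
    // (Lower == Upper) this yields all 256 values; a pre-test loop yields none.
    void enumerateRange(const APInt &Lower, const APInt &Upper,
                        SmallVectorImpl<APInt> &Out) {
      APInt Tmp = Lower;
      do
        Out.push_back(Tmp);
      while (++Tmp != Upper);
    }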
@@ -6020,6 +6022,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
KnownBits Known = computeKnownBits(Cond, DL, AC, SI);
+ SmallPtrSet<const Constant *, 4> KnownValues;
+ bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
@@ -6039,15 +6043,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
UniqueSuccessors.push_back(Successor);
++It->second;
}
- const APInt &CaseVal = Case.getCaseValue()->getValue();
+ ConstantInt *CaseC = Case.getCaseValue();
+ const APInt &CaseVal = CaseC->getValue();
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
- (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) {
- DeadCases.push_back(Case.getCaseValue());
+ (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) ||
+ (IsKnownValuesValid && !KnownValues.contains(CaseC))) {
+ DeadCases.push_back(CaseC);
if (DTU)
--NumPerSuccessorCases[Successor];
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
<< " is dead.\n");
- }
+ } else if (IsKnownValuesValid)
+ KnownValues.erase(CaseC);
}
// If we can prove that the cases must cover all possible values, the
@@ -6058,33 +6065,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
const unsigned NumUnknownBits =
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
- if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */) {
- uint64_t AllNumCases = 1ULL << NumUnknownBits;
- if (SI->getNumCases() == AllNumCases) {
+ if (HasDefault && DeadCases.empty()) {
+ if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) {
createUnreachableSwitchDefault(SI, DTU);
return true;
}
- // When only one case value is missing, replace default with that case.
- // Eliminating the default branch will provide more opportunities for
- // optimization, such as lookup tables.
- if (SI->getNumCases() == AllNumCases - 1) {
- assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
- IntegerType *CondTy = cast<IntegerType>(Cond->getType());
- if (CondTy->getIntegerBitWidth() > 64 ||
- !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
- return false;
- uint64_t MissingCaseVal = 0;
- for (const auto &Case : SI->cases())
- MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
- auto *MissingCase =
- cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
- SwitchInstProfUpdateWrapper SIW(*SI);
- SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
- createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
- SIW.setSuccessorWeight(0, 0);
- return true;
+ if (NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ IntegerType *CondTy = cast<IntegerType>(Cond->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase = cast<ConstantInt>(
+ ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(),
+ SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU,
+ /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
}
@@ -7570,6 +7585,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
return true;
}
+/// Tries to transform the switch when the condition is umin with a constant.
+/// In that case, the default branch can be replaced by the constant's branch.
+/// This method also removes dead cases when the simplification cannot replace
+/// the default branch.
+///
+/// For example:
+/// switch(umin(a, 3)) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// case 3:
+/// case 4:
+/// // ...
+/// default:
+/// unreachable
+/// }
+///
+/// Transforms into:
+///
+/// switch(a) {
+/// case 0:
+/// case 1:
+/// case 2:
+/// default:
+/// // This is case 3
+/// }
+static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) {
+ Value *A;
+ ConstantInt *Constant;
+
+ if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant))))
+ return false;
+
+ SmallVector<DominatorTree::UpdateType> Updates;
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ BasicBlock *BB = SIW->getParent();
+
+ // Dead cases are removed even when the simplification fails.
+ // A case is dead when its value is higher than the Constant.
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) {
+ if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) {
+ ++I;
+ continue;
+ }
+ BasicBlock *DeadCaseBB = I->getCaseSuccessor();
+ DeadCaseBB->removePredecessor(BB);
+ Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB});
+ I = SIW->removeCase(I);
+ E = SIW->case_end();
+ }
+
+ auto Case = SI->findCaseValue(Constant);
+ // If the case value is not found, `findCaseValue` returns the default case.
+ // In this scenario, since there is no explicit `case 3:`, the simplification
+ // fails. The simplification also fails when the switch's default destination
+ // is reachable.
+ if (!SI->defaultDestUnreachable() || Case == SI->case_default()) {
+ if (DTU)
+ DTU->applyUpdates(Updates);
+ return !Updates.empty();
+ }
+
+ BasicBlock *Unreachable = SI->getDefaultDest();
+ SIW.replaceDefaultDest(Case);
+ SIW.removeCase(Case);
+ SIW->setCondition(A);
+
+ Updates.push_back({DominatorTree::Delete, BB, Unreachable});
+
+ if (DTU)
+ DTU->applyUpdates(Updates);
+
+ return true;
+}
+
/// Tries to transform switch of powers of two to reduce switch range.
/// For example, switch like:
/// switch (C) { case 1: case 2: case 64: case 128: }
@@ -8037,6 +8127,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (simplifyDuplicateSwitchArms(SI, DTU))
return requestResimplify();
+ if (simplifySwitchWhenUMin(SI, DTU))
+ return requestResimplify();
+
return false;
}
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 94c5c170..e86ab13 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;
// Redirect exiting edges through a control flow hub.
ControlFlowHub CHub;
+ bool Changed = false;
for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
BasicBlock *BB = ExitingBlocks[I];
@@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
bool UpdatedLI = false;
BasicBlock *NewSucc =
SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+ // SplitCallBrEdge modifies the CFG because it creates an intermediate
+ // block. So we need to set the changed flag no matter what the
+ // ControlFlowHub is going to do later.
+ Changed = true;
// Even if CallBr and Succ do not have a common parent loop, we need to
// add the new target block to the parent loop of the current loop.
if (!UpdatedLI)
@@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
bool ChangedCFG;
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
&DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
+ ChangedCFG |= Changed;
if (!ChangedCFG)
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fdfff16..03112c6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
bool CanAddPredicate = !llvm::shouldOptimizeForSize(
TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
- int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
- CanAddPredicate, false).value_or(0);
+ int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
+ CanAddPredicate, false)
+ .value_or(0);
if (Stride == 1 || Stride == -1)
return Stride;
return 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14..2588c87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) {
bool ScalarVFOnly = Plan.hasScalarVFOnly();
bool Changed = false;
- auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
- VPSingleDefRecipe *Candidate) {
- // We only know how to duplicate VPReplicateRecipes and
- // VPScalarIVStepsRecipes for now.
+ SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+ auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
+ VPBasicBlock *SinkTo, VPValue *Op) {
+ auto *Candidate =
+ dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
+ if (!Candidate)
+ return;
+
+ // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
+ // for now.
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
- return false;
+ return;
if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() ||
Candidate->mayReadOrWriteMemory())
- return false;
+ return;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
if (!ScalarVFOnly && RepR->isSingleScalar())
- return false;
+ return;
- return true;
+ WorkList.insert({SinkTo, Candidate});
};
// First, collect the operands of all recipes in replicate blocks as seeds for
// sinking.
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
@@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
continue;
- for (auto &Recipe : *VPBB) {
- for (VPValue *Op : Recipe.operands()) {
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(VPBB, Def))
- WorkList.insert({VPBB, Def});
- }
- }
+ for (auto &Recipe : *VPBB)
+ for (VPValue *Op : Recipe.operands())
+ InsertIfValidSinkCandidate(VPBB, Op);
}
// Try to sink each replicate or scalar IV steps recipe in the worklist.
@@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) {
VPSingleDefRecipe *SinkCandidate;
std::tie(SinkTo, SinkCandidate) = WorkList[I];
- // All recipe users of the sink candidate must be in the same block SinkTo
- // or all users outside of SinkTo must have only their first lane used. In
+ // All recipe users of SinkCandidate must be in the same block SinkTo or all
+ // users outside of SinkTo must only use the first lane of SinkCandidate. In
// the latter case, we need to duplicate SinkCandidate.
auto UsersOutsideSinkTo =
make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
@@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
- if (IsValidSinkCandidate(SinkTo, Def))
- WorkList.insert({SinkTo, Def});
+ InsertIfValidSinkCandidate(SinkTo, Op);
Changed = true;
}
return Changed;
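The refactor leans on two SetVector properties that make this worklist idiom safe: insert() deduplicates, so a candidate reachable through several operands is processed once, and indexing stays valid while new entries are appended mid-iteration. A minimal sketch of the pattern:

    #include "llvm/ADT/SetVector.h"
    #include <cstdio>

    void worklistPattern() {
      llvm::SetVector<int> WorkList;
      WorkList.insert(1);
      WorkList.insert(1); // deduplicated: still a single entry
      for (unsigned I = 0; I != WorkList.size(); ++I) {
        int Item = WorkList[I];
        if (Item < 3)
          WorkList.insert(Item + 1); // growing while indexing is fine
        std::printf("visited %d\n", Item);
      }
    }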
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 91734a1..34754a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
for (const VPUser *U : V->users()) {
auto *UI = cast<VPRecipeBase>(U);
+ if (isa<VPIRPhi>(UI) &&
+ UI->getNumOperands() != UI->getParent()->getNumPredecessors()) {
+ errs() << "Phi-like recipe with different number of operands and "
+ "predecessors.\n";
+ return false;
+ }
+
if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) {
for (const auto &[IncomingVPV, IncomingVPBB] :
Phi->incoming_values_and_blocks()) {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d6eb00d..27a8bbd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) {
Value *ScalarV = Ext->getOperand(0);
if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
- &DT))
- ScalarV = Builder.CreateFreeze(ScalarV);
+ &DT)) {
+ // Check whether all lanes are extracted, all extracts trigger UB
+ // on poison, and the last extract (and hence all previous ones) is
+ // guaranteed to execute if Ext executes. If so, we do not
+ // need to insert a freeze.
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
+ bool AllExtractsTriggerUB = true;
+ ExtractElementInst *LastExtract = nullptr;
+ BasicBlock *ExtBB = Ext->getParent();
+ for (User *U : Ext->users()) {
+ auto *Extract = cast<ExtractElementInst>(U);
+ if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
+ AllExtractsTriggerUB = false;
+ break;
+ }
+ ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
+ if (!LastExtract || LastExtract->comesBefore(Extract))
+ LastExtract = Extract;
+ }
+ if (ExtractedLanes.size() != DstTy->getNumElements() ||
+ !AllExtractsTriggerUB ||
+ !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
+ LastExtract->getIterator()))
+ ScalarV = Builder.CreateFreeze(ScalarV);
+ }
ScalarV = Builder.CreateBitCast(
ScalarV,
IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
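Why the freeze can be skipped: if poison in ScalarV would already make the program undefined (every lane is extracted, each extract feeds UB on poison, and execution surely reaches the last extract), then freezing adds nothing. A compact restatement of that test (freezeIsRedundant is a hypothetical helper mirroring the checks above):

    #include "llvm/ADT/DenseSet.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // True when a freeze of Ext's source is unnecessary: poison would provably
    // reach UB through the extracts anyway. NumLanes is the vector width.
    bool freezeIsRedundant(Instruction *Ext, unsigned NumLanes) {
      SmallDenseSet<ConstantInt *, 8> Lanes;
      ExtractElementInst *Last = nullptr;
      for (User *U : Ext->users()) {
        auto *EE = dyn_cast<ExtractElementInst>(U);
        // Every user must be a same-block extract of a constant lane that
        // triggers UB on poison.
        if (!EE || EE->getParent() != Ext->getParent() ||
            !programUndefinedIfPoison(EE))
          return false;
        auto *Idx = dyn_cast<ConstantInt>(EE->getIndexOperand());
        if (!Idx)
          return false;
        Lanes.insert(Idx);
        if (!Last || Last->comesBefore(EE))
          Last = EE;
      }
      // All lanes covered, and control flow surely reaches the last extract.
      return Last && Lanes.size() == NumLanes &&
             isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
                                                        Last->getIterator());
    }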