aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp12
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp6
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp23
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp54
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp99
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp4
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp7
-rw-r--r--llvm/lib/Transforms/Instrumentation/AllocToken.cpp73
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp24
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/GVNSink.cpp1
-rw-r--r--llvm/lib/Transforms/Scalar/InferAlignment.cpp17
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInterchange.cpp49
-rw-r--r--llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp43
-rw-r--r--llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp186
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp14
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp233
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp35
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h67
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp43
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp206
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h29
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp20
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp37
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h8
34 files changed, 893 insertions, 433 deletions
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 28ee444..a29faab 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1368,13 +1368,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest(
FunctionImporter::ImportMapTy &ImportList) {
for (const auto &GlobalList : Index) {
// Ignore entries for undefined references.
- if (GlobalList.second.SummaryList.empty())
+ if (GlobalList.second.getSummaryList().empty())
continue;
auto GUID = GlobalList.first;
- assert(GlobalList.second.SummaryList.size() == 1 &&
+ assert(GlobalList.second.getSummaryList().size() == 1 &&
"Expected individual combined index to have one summary per GUID");
- auto &Summary = GlobalList.second.SummaryList[0];
+ auto &Summary = GlobalList.second.getSummaryList()[0];
// Skip the summaries for the importing module. These are included to
// e.g. record required linkage changes.
if (Summary->modulePath() == ModulePath)
@@ -1423,7 +1423,7 @@ void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index,
void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) {
for (const auto &Entry : Index) {
- for (const auto &S : Entry.second.SummaryList) {
+ for (const auto &S : Entry.second.getSummaryList()) {
if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
updateValueInfoForIndirectCalls(Index, FS);
}
@@ -1456,7 +1456,7 @@ void llvm::computeDeadSymbolsAndUpdateIndirectCalls(
// Add values flagged in the index as live roots to the worklist.
for (const auto &Entry : Index) {
auto VI = Index.getValueInfo(Entry);
- for (const auto &S : Entry.second.SummaryList) {
+ for (const auto &S : Entry.second.getSummaryList()) {
if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
updateValueInfoForIndirectCalls(Index, FS);
if (S->isLive()) {
@@ -2094,7 +2094,7 @@ static bool doImportingForModuleForTest(
// is only enabled when testing importing via the 'opt' tool, which does
// not do the ThinLink that would normally determine what values to promote.
for (auto &I : *Index) {
- for (auto &S : I.second.SummaryList) {
+ for (auto &S : I.second.getSummaryList()) {
if (GlobalValue::isLocalLinkage(S->linkage()))
S->setLinkage(GlobalValue::ExternalLinkage);
}
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index be6cba3..46fb567 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -2130,7 +2130,7 @@ bool LowerTypeTestsModule::lower() {
// A set of all functions that are address taken by a live global object.
DenseSet<GlobalValue::GUID> AddressTaken;
for (auto &I : *ExportSummary)
- for (auto &GVS : I.second.SummaryList)
+ for (auto &GVS : I.second.getSummaryList())
if (GVS->isLive())
for (const auto &Ref : GVS->refs()) {
AddressTaken.insert(Ref.getGUID());
@@ -2409,7 +2409,7 @@ bool LowerTypeTestsModule::lower() {
}
for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
if (!ExportSummary->isGlobalValueLive(S.get()))
continue;
if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 2d5cb82..76e588b 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -928,7 +928,7 @@ void llvm::updateVCallVisibilityInIndex(
// linker, as we have no information on their eventual use.
if (DynamicExportSymbols.count(P.first))
continue;
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
if (!GVar ||
GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
@@ -2413,7 +2413,7 @@ bool DevirtModule::run() {
}
for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
@@ -2564,7 +2564,7 @@ void DevirtIndex::run() {
// Collect information from summary about which calls to try to devirtualize.
for (auto &P : ExportSummary) {
- for (auto &S : P.second.SummaryList) {
+ for (auto &S : P.second.getSummaryList()) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e1e24a9..dab200d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -289,12 +289,11 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
Value *LoadPtr = II.getArgOperand(0);
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ const Align Alignment = II.getParamAlign(0).valueOrOne();
// If the mask is all ones or undefs, this is a plain vector load of the 1st
// argument.
- if (maskIsAllOneOrUndef(II.getArgOperand(2))) {
+ if (maskIsAllOneOrUndef(II.getArgOperand(1))) {
LoadInst *L = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
"unmaskedload");
L->copyMetadata(II);
@@ -308,7 +307,7 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
LoadInst *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
"unmaskedload");
LI->copyMetadata(II);
- return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
+ return Builder.CreateSelect(II.getArgOperand(1), LI, II.getArgOperand(2));
}
return nullptr;
@@ -319,8 +318,8 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
// * Narrow width by halfs excluding zero/undef lanes
Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
Value *StorePtr = II.getArgOperand(1);
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ Align Alignment = II.getParamAlign(1).valueOrOne();
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
if (!ConstMask)
return nullptr;
@@ -356,7 +355,7 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
// * Narrow width by halfs excluding zero/undef lanes
// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(1));
if (!ConstMask)
return nullptr;
@@ -366,8 +365,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
if (ConstMask->isAllOnesValue())
if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) {
auto *VecTy = cast<VectorType>(II.getType());
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ const Align Alignment = II.getParamAlign(0).valueOrOne();
LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr,
Alignment, "load.scalar");
Value *Shuf =
@@ -384,7 +382,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
// * Narrow store width by halfs excluding zero/undef lanes
// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
if (!ConstMask)
return nullptr;
@@ -397,8 +395,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
// scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr
if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) {
if (maskContainsAllOneOrUndef(ConstMask)) {
- Align Alignment =
- cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ Align Alignment = II.getParamAlign(1).valueOrOne();
StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false,
Alignment);
S->copyMetadata(II);
@@ -408,7 +405,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
// scatter(vector, splat(ptr), splat(true)) -> store extract(vector,
// lastlane), ptr
if (ConstMask->isAllOnesValue()) {
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ Align Alignment = II.getParamAlign(1).valueOrOne();
VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType());
ElementCount VF = WideLoadTy->getElementCount();
Value *RunTimeVF = Builder.CreateElementCount(Builder.getInt32Ty(), VF);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index cdc559b..9b9fe26 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1643,33 +1643,46 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
/// Return a Constant* for the specified floating-point constant if it fits
/// in the specified FP type without changing its value.
-static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+static bool fitsInFPType(APFloat F, const fltSemantics &Sem) {
bool losesInfo;
- APFloat F = CFP->getValueAPF();
(void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
return !losesInfo;
}
-static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
- if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
- return nullptr; // No constant folding of this.
+static Type *shrinkFPConstant(LLVMContext &Ctx, const APFloat &F,
+ bool PreferBFloat) {
// See if the value can be truncated to bfloat and then reextended.
- if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat()))
- return Type::getBFloatTy(CFP->getContext());
+ if (PreferBFloat && fitsInFPType(F, APFloat::BFloat()))
+ return Type::getBFloatTy(Ctx);
// See if the value can be truncated to half and then reextended.
- if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf()))
- return Type::getHalfTy(CFP->getContext());
+ if (!PreferBFloat && fitsInFPType(F, APFloat::IEEEhalf()))
+ return Type::getHalfTy(Ctx);
// See if the value can be truncated to float and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEsingle()))
- return Type::getFloatTy(CFP->getContext());
- if (CFP->getType()->isDoubleTy())
- return nullptr; // Won't shrink.
- if (fitsInFPType(CFP, APFloat::IEEEdouble()))
- return Type::getDoubleTy(CFP->getContext());
+ if (fitsInFPType(F, APFloat::IEEEsingle()))
+ return Type::getFloatTy(Ctx);
+ if (&F.getSemantics() == &APFloat::IEEEdouble())
+ return nullptr; // Won't shrink.
+ // See if the value can be truncated to double and then reextended.
+ if (fitsInFPType(F, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(Ctx);
// Don't try to shrink to various long double types.
return nullptr;
}
+static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) {
+ Type *Ty = CFP->getType();
+ if (Ty->getScalarType()->isPPC_FP128Ty())
+ return nullptr; // No constant folding of this.
+
+ Type *ShrinkTy =
+ shrinkFPConstant(CFP->getContext(), CFP->getValueAPF(), PreferBFloat);
+ if (ShrinkTy)
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ ShrinkTy = VectorType::get(ShrinkTy, VecTy);
+
+ return ShrinkTy;
+}
+
// Determine if this is a vector of ConstantFPs and if so, return the minimal
// type we can safely truncate all elements to.
static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) {
@@ -1720,10 +1733,10 @@ static Type *getMinimumFPType(Value *V, bool PreferBFloat) {
// Try to shrink scalable and fixed splat vectors.
if (auto *FPC = dyn_cast<Constant>(V))
- if (isa<VectorType>(V->getType()))
+ if (auto *VTy = dyn_cast<VectorType>(V->getType()))
if (auto *Splat = dyn_cast_or_null<ConstantFP>(FPC->getSplatValue()))
if (Type *T = shrinkFPConstant(Splat, PreferBFloat))
- return T;
+ return VectorType::get(T, VTy);
// Try to shrink a vector of FP constants. This returns nullptr on scalable
// vectors
@@ -1796,10 +1809,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
Type *Ty = FPT.getType();
auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
if (BO && BO->hasOneUse()) {
- Type *LHSMinType =
- getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy());
- Type *RHSMinType =
- getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy());
+ bool PreferBFloat = Ty->getScalarType()->isBFloatTy();
+ Type *LHSMinType = getMinimumFPType(BO->getOperand(0), PreferBFloat);
+ Type *RHSMinType = getMinimumFPType(BO->getOperand(1), PreferBFloat);
unsigned OpWidth = BO->getType()->getFPMantissaWidth();
unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 09cb225..5aa8de3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3455,27 +3455,45 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
// select a, false, b -> select !a, b, false
if (match(TrueVal, m_Specific(Zero))) {
Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return SelectInst::Create(NotCond, FalseVal, Zero);
+ Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI;
+ SelectInst *NewSI =
+ SelectInst::Create(NotCond, FalseVal, Zero, "", nullptr, MDFrom);
+ NewSI->swapProfMetadata();
+ return NewSI;
}
// select a, b, true -> select !a, true, b
if (match(FalseVal, m_Specific(One))) {
Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return SelectInst::Create(NotCond, One, TrueVal);
+ Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI;
+ SelectInst *NewSI =
+ SelectInst::Create(NotCond, One, TrueVal, "", nullptr, MDFrom);
+ NewSI->swapProfMetadata();
+ return NewSI;
}
// DeMorgan in select form: !a && !b --> !(a || b)
// select !a, !b, false --> not (select a, true, b)
if (match(&SI, m_LogicalAnd(m_Not(m_Value(A)), m_Not(m_Value(B)))) &&
(CondVal->hasOneUse() || TrueVal->hasOneUse()) &&
- !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr()))
- return BinaryOperator::CreateNot(Builder.CreateSelect(A, One, B));
+ !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) {
+ Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI;
+ SelectInst *NewSI =
+ cast<SelectInst>(Builder.CreateSelect(A, One, B, "", MDFrom));
+ NewSI->swapProfMetadata();
+ return BinaryOperator::CreateNot(NewSI);
+ }
// DeMorgan in select form: !a || !b --> !(a && b)
// select !a, true, !b --> not (select a, b, false)
if (match(&SI, m_LogicalOr(m_Not(m_Value(A)), m_Not(m_Value(B)))) &&
(CondVal->hasOneUse() || FalseVal->hasOneUse()) &&
- !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr()))
- return BinaryOperator::CreateNot(Builder.CreateSelect(A, B, Zero));
+ !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) {
+ Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI;
+ SelectInst *NewSI =
+ cast<SelectInst>(Builder.CreateSelect(A, B, Zero, "", MDFrom));
+ NewSI->swapProfMetadata();
+ return BinaryOperator::CreateNot(NewSI);
+ }
// select (select a, true, b), true, b -> select a, true, b
if (match(CondVal, m_Select(m_Value(A), m_One(), m_Value(B))) &&
@@ -3757,6 +3775,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder,
// (x < y) ? -1 : zext(x > y)
// (x > y) ? 1 : sext(x != y)
// (x > y) ? 1 : sext(x < y)
+// (x == y) ? 0 : (x > y ? 1 : -1)
+// (x == y) ? 0 : (x < y ? -1 : 1)
+// Special case: x == C ? 0 : (x > C - 1 ? 1 : -1)
+// Special case: x == C ? 0 : (x < C + 1 ? -1 : 1)
// Into ucmp/scmp(x, y), where signedness is determined by the signedness
// of the comparison in the original sequence.
Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
@@ -3849,6 +3871,44 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
}
}
+ // Special cases with constants: x == C ? 0 : (x > C-1 ? 1 : -1)
+ if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero())) {
+ const APInt *C;
+ if (match(RHS, m_APInt(C))) {
+ CmpPredicate InnerPred;
+ Value *InnerRHS;
+ const APInt *InnerTV, *InnerFV;
+ if (match(FV,
+ m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Value(InnerRHS)),
+ m_APInt(InnerTV), m_APInt(InnerFV)))) {
+
+ // x == C ? 0 : (x > C-1 ? 1 : -1)
+ if (ICmpInst::isGT(InnerPred) && InnerTV->isOne() &&
+ InnerFV->isAllOnes()) {
+ IsSigned = ICmpInst::isSigned(InnerPred);
+ bool CanSubOne = IsSigned ? !C->isMinSignedValue() : !C->isMinValue();
+ if (CanSubOne) {
+ APInt Cminus1 = *C - 1;
+ if (match(InnerRHS, m_SpecificInt(Cminus1)))
+ Replace = true;
+ }
+ }
+
+ // x == C ? 0 : (x < C+1 ? -1 : 1)
+ if (ICmpInst::isLT(InnerPred) && InnerTV->isAllOnes() &&
+ InnerFV->isOne()) {
+ IsSigned = ICmpInst::isSigned(InnerPred);
+ bool CanAddOne = IsSigned ? !C->isMaxSignedValue() : !C->isMaxValue();
+ if (CanAddOne) {
+ APInt Cplus1 = *C + 1;
+ if (match(InnerRHS, m_SpecificInt(Cplus1)))
+ Replace = true;
+ }
+ }
+ }
+ }
+ }
+
Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp;
if (Replace)
return replaceInstUsesWith(
@@ -4459,24 +4519,24 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Value *V = foldSelectIntoAddConstant(SI, Builder))
return replaceInstUsesWith(SI, V);
- // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
+ // select(mask, mload(ptr,mask,0), 0) -> mload(ptr,mask,0)
// Load inst is intentionally not checked for hasOneUse()
if (match(FalseVal, m_Zero()) &&
- (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+ (match(TrueVal, m_MaskedLoad(m_Value(), m_Specific(CondVal),
m_CombineOr(m_Undef(), m_Zero()))) ||
- match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal),
+ match(TrueVal, m_MaskedGather(m_Value(), m_Specific(CondVal),
m_CombineOr(m_Undef(), m_Zero()))))) {
auto *MaskedInst = cast<IntrinsicInst>(TrueVal);
- if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
- MaskedInst->setArgOperand(3, FalseVal /* Zero */);
+ if (isa<UndefValue>(MaskedInst->getArgOperand(2)))
+ MaskedInst->setArgOperand(2, FalseVal /* Zero */);
return replaceInstUsesWith(SI, MaskedInst);
}
Value *Mask;
if (match(TrueVal, m_Zero()) &&
- (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+ (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(Mask),
m_CombineOr(m_Undef(), m_Zero()))) ||
- match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask),
+ match(FalseVal, m_MaskedGather(m_Value(), m_Value(Mask),
m_CombineOr(m_Undef(), m_Zero())))) &&
(CondVal->getType() == Mask->getType())) {
// We can remove the select by ensuring the load zeros all lanes the
@@ -4489,8 +4549,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (CanMergeSelectIntoLoad) {
auto *MaskedInst = cast<IntrinsicInst>(FalseVal);
- if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
- MaskedInst->setArgOperand(3, TrueVal /* Zero */);
+ if (isa<UndefValue>(MaskedInst->getArgOperand(2)))
+ MaskedInst->setArgOperand(2, TrueVal /* Zero */);
return replaceInstUsesWith(SI, MaskedInst);
}
}
@@ -4629,14 +4689,13 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
}
Value *MaskedLoadPtr;
- const APInt *MaskedLoadAlignment;
if (match(TrueVal, m_OneUse(m_MaskedLoad(m_Value(MaskedLoadPtr),
- m_APInt(MaskedLoadAlignment),
m_Specific(CondVal), m_Value()))))
return replaceInstUsesWith(
- SI, Builder.CreateMaskedLoad(TrueVal->getType(), MaskedLoadPtr,
- Align(MaskedLoadAlignment->getZExtValue()),
- CondVal, FalseVal));
+ SI, Builder.CreateMaskedLoad(
+ TrueVal->getType(), MaskedLoadPtr,
+ cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),
+ CondVal, FalseVal));
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a330bb7..651e305 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1892,7 +1892,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
// segfaults which didn't exist in the original program.
APInt DemandedPtrs(APInt::getAllOnes(VWidth)),
DemandedPassThrough(DemandedElts);
- if (auto *CMask = dyn_cast<Constant>(II->getOperand(2))) {
+ if (auto *CMask = dyn_cast<Constant>(II->getOperand(1))) {
for (unsigned i = 0; i < VWidth; i++) {
if (Constant *CElt = CMask->getAggregateElement(i)) {
if (CElt->isNullValue())
@@ -1905,7 +1905,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
if (II->getIntrinsicID() == Intrinsic::masked_gather)
simplifyAndSetOp(II, 0, DemandedPtrs, PoisonElts2);
- simplifyAndSetOp(II, 3, DemandedPassThrough, PoisonElts3);
+ simplifyAndSetOp(II, 2, DemandedPassThrough, PoisonElts3);
// Output elements are undefined if the element from both sources are.
// TODO: can strengthen via mask as well.
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 2646334..cb6ca72 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1494,11 +1494,8 @@ void AddressSanitizer::getInterestingMemoryOperands(
if (ignoreAccess(I, BasePtr))
return;
Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
+ MaybeAlign Alignment = CI->getParamAlign(0);
+ Value *Mask = CI->getOperand(1 + OpOffset);
Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
break;
}
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
index 40720ae..29968b8 100644
--- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
@@ -131,7 +132,7 @@ cl::opt<uint64_t> ClFallbackToken(
//===--- Statistics -------------------------------------------------------===//
-STATISTIC(NumFunctionsInstrumented, "Functions instrumented");
+STATISTIC(NumFunctionsModified, "Functions modified");
STATISTIC(NumAllocationsInstrumented, "Allocations instrumented");
//===----------------------------------------------------------------------===//
@@ -140,9 +141,19 @@ STATISTIC(NumAllocationsInstrumented, "Allocations instrumented");
///
/// Expected format is: !{<type-name>, <contains-pointer>}
MDNode *getAllocTokenMetadata(const CallBase &CB) {
- MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token);
- if (!Ret)
- return nullptr;
+ MDNode *Ret = nullptr;
+ if (auto *II = dyn_cast<IntrinsicInst>(&CB);
+ II && II->getIntrinsicID() == Intrinsic::alloc_token_id) {
+ auto *MDV = cast<MetadataAsValue>(II->getArgOperand(0));
+ Ret = cast<MDNode>(MDV->getMetadata());
+ // If the intrinsic has an empty MDNode, type inference failed.
+ if (Ret->getNumOperands() == 0)
+ return nullptr;
+ } else {
+ Ret = CB.getMetadata(LLVMContext::MD_alloc_token);
+ if (!Ret)
+ return nullptr;
+ }
assert(Ret->getNumOperands() == 2 && "bad !alloc_token");
assert(isa<MDString>(Ret->getOperand(0)));
assert(isa<ConstantAsMetadata>(Ret->getOperand(1)));
@@ -315,6 +326,9 @@ private:
FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID,
LibFunc OriginalFunc);
+ /// Lower alloc_token_* intrinsics.
+ void replaceIntrinsicInst(IntrinsicInst *II, OptimizationRemarkEmitter &ORE);
+
/// Return the token ID from metadata in the call.
uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode);
@@ -336,21 +350,32 @@ bool AllocToken::instrumentFunction(Function &F) {
// Do not apply any instrumentation for naked functions.
if (F.hasFnAttribute(Attribute::Naked))
return false;
- if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
- return false;
// Don't touch available_externally functions, their actual body is elsewhere.
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
return false;
- // Only instrument functions that have the sanitize_alloc_token attribute.
- if (!F.hasFnAttribute(Attribute::SanitizeAllocToken))
- return false;
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls;
+ SmallVector<IntrinsicInst *, 4> IntrinsicInsts;
+
+ // Only instrument functions that have the sanitize_alloc_token attribute.
+ const bool InstrumentFunction =
+ F.hasFnAttribute(Attribute::SanitizeAllocToken) &&
+ !F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation);
// Collect all allocation calls to avoid iterator invalidation.
for (Instruction &I : instructions(F)) {
+ // Collect all alloc_token_* intrinsics.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I);
+ II && II->getIntrinsicID() == Intrinsic::alloc_token_id) {
+ IntrinsicInsts.emplace_back(II);
+ continue;
+ }
+
+ if (!InstrumentFunction)
+ continue;
+
auto *CB = dyn_cast<CallBase>(&I);
if (!CB)
continue;
@@ -359,11 +384,21 @@ bool AllocToken::instrumentFunction(Function &F) {
}
bool Modified = false;
- for (auto &[CB, Func] : AllocCalls)
- Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
- if (Modified)
- NumFunctionsInstrumented++;
+ if (!AllocCalls.empty()) {
+ for (auto &[CB, Func] : AllocCalls)
+ Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
+ if (Modified)
+ NumFunctionsModified++;
+ }
+
+ if (!IntrinsicInsts.empty()) {
+ for (auto *II : IntrinsicInsts)
+ replaceIntrinsicInst(II, ORE);
+ Modified = true;
+ NumFunctionsModified++;
+ }
+
return Modified;
}
@@ -381,7 +416,7 @@ AllocToken::shouldInstrumentCall(const CallBase &CB,
if (TLI.getLibFunc(*Callee, Func)) {
if (isInstrumentableLibFunc(Func, CB, TLI))
return Func;
- } else if (Options.Extended && getAllocTokenMetadata(CB)) {
+ } else if (Options.Extended && CB.getMetadata(LLVMContext::MD_alloc_token)) {
return NotLibFunc;
}
@@ -528,6 +563,16 @@ FunctionCallee AllocToken::getTokenAllocFunction(const CallBase &CB,
return TokenAlloc;
}
+void AllocToken::replaceIntrinsicInst(IntrinsicInst *II,
+ OptimizationRemarkEmitter &ORE) {
+ assert(II->getIntrinsicID() == Intrinsic::alloc_token_id);
+
+ uint64_t TokenID = getToken(*II, ORE);
+ Value *V = ConstantInt::get(IntPtrTy, TokenID);
+ II->replaceAllUsesWith(V);
+ II->eraseFromParent();
+}
+
} // namespace
AllocTokenPass::AllocTokenPass(AllocTokenOptions Opts)
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
index 3ae771a..3c0f185 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
@@ -338,7 +338,7 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
}
auto *BasePtr = CI->getOperand(0 + OpOffset);
- Access.MaybeMask = CI->getOperand(2 + OpOffset);
+ Access.MaybeMask = CI->getOperand(1 + OpOffset);
Access.Addr = BasePtr;
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index eff6f0c..b6cbecb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4191,10 +4191,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleMaskedGather(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Ptrs = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(0).valueOrOne();
+ Value *Mask = I.getArgOperand(1);
+ Value *PassThru = I.getArgOperand(2);
Type *PtrsShadowTy = getShadowTy(Ptrs);
if (ClCheckAccessAddress) {
@@ -4230,9 +4229,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Values = I.getArgOperand(0);
Value *Ptrs = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(1).valueOrOne();
+ Value *Mask = I.getArgOperand(2);
Type *PtrsShadowTy = getShadowTy(Ptrs);
if (ClCheckAccessAddress) {
@@ -4262,9 +4260,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *V = I.getArgOperand(0);
Value *Ptr = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(1).valueOrOne();
+ Value *Mask = I.getArgOperand(2);
Value *Shadow = getShadow(V);
if (ClCheckAccessAddress) {
@@ -4295,10 +4292,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleMaskedLoad(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Ptr = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(0).valueOrOne();
+ Value *Mask = I.getArgOperand(1);
+ Value *PassThru = I.getArgOperand(2);
if (ClCheckAccessAddress) {
insertCheckShadowOf(Ptr, &I);
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 6141b6d..4ac1321 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -272,7 +272,7 @@ static OverwriteResult isMaskedStoreOverwrite(const Instruction *KillingI,
if (KillingII->getIntrinsicID() == Intrinsic::masked_store) {
// Masks.
// TODO: check that KillingII's mask is a superset of the DeadII's mask.
- if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3))
+ if (KillingII->getArgOperand(2) != DeadII->getArgOperand(2))
return OW_Unknown;
} else if (KillingII->getIntrinsicID() == Intrinsic::vp_store) {
// Masks.
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 2afa7b7..e30f306 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1017,14 +1017,14 @@ private:
};
auto MaskOp = [](const IntrinsicInst *II) {
if (II->getIntrinsicID() == Intrinsic::masked_load)
- return II->getOperand(2);
+ return II->getOperand(1);
if (II->getIntrinsicID() == Intrinsic::masked_store)
- return II->getOperand(3);
+ return II->getOperand(2);
llvm_unreachable("Unexpected IntrinsicInst");
};
auto ThruOp = [](const IntrinsicInst *II) {
if (II->getIntrinsicID() == Intrinsic::masked_load)
- return II->getOperand(3);
+ return II->getOperand(2);
llvm_unreachable("Unexpected IntrinsicInst");
};
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 42db424..72e1131 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2212,11 +2212,11 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
if (!DepInst || !Dep.isLocal() || !Dep.isDef())
return false;
- Value *Mask = I->getOperand(2);
- Value *Passthrough = I->getOperand(3);
+ Value *Mask = I->getOperand(1);
+ Value *Passthrough = I->getOperand(2);
Value *StoreVal;
- if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
- m_Specific(Mask))) ||
+ if (!match(DepInst,
+ m_MaskedStore(m_Value(StoreVal), m_Value(), m_Specific(Mask))) ||
StoreVal->getType() != I->getType())
return false;
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index b9534def..a06f832 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -430,6 +430,7 @@ public:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 995b803..39751c0 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -45,25 +45,20 @@ static bool tryToImproveAlign(
switch (II->getIntrinsicID()) {
case Intrinsic::masked_load:
case Intrinsic::masked_store: {
- int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2;
- Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load
- ? II->getArgOperand(0)
- : II->getArgOperand(1);
+ unsigned PtrOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
+ Value *PtrOp = II->getArgOperand(PtrOpIdx);
Type *Type = II->getIntrinsicID() == Intrinsic::masked_load
? II->getType()
: II->getArgOperand(0)->getType();
- Align OldAlign =
- cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue();
+ Align OldAlign = II->getParamAlign(PtrOpIdx).valueOrOne();
Align PrefAlign = DL.getPrefTypeAlign(Type);
Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
- if (NewAlign <= OldAlign ||
- NewAlign.value() > std::numeric_limits<uint32_t>().max())
+ if (NewAlign <= OldAlign)
return false;
- Value *V =
- ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value());
- II->setOperand(AlignOpIdx, V);
+ II->addParamAttr(PtrOpIdx,
+ Attribute::getWithAlignment(II->getContext(), NewAlign));
return true;
}
default:
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 28ae4f0..9aaf6a5 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -43,6 +43,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <utility>
@@ -1872,6 +1873,51 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
}
+/// This deals with a corner case when a LCSSA phi node appears in a non-exit
+/// block: the outer loop latch block does not need to be exit block of the
+/// inner loop. Consider a loop that was in LCSSA form, but then some
+/// transformation like loop-unswitch comes along and creates an empty block,
+/// where BB5 in this example is the outer loop latch block:
+///
+/// BB4:
+/// br label %BB5
+/// BB5:
+/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
+/// br outer.header
+///
+/// Interchange then brings it in LCSSA form again resulting in this chain of
+/// single-input phi nodes:
+///
+/// BB4:
+/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
+/// br label %BB5
+/// BB5:
+/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
+///
+/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
+/// use before the def if we don't check this. The solution is to simplify
+/// lcssa phi nodes (remove) if they appear in non-exit blocks.
+///
+static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
+ if (OuterLoopLatch == InnerLoopExit)
+ return;
+
+ // Collect and remove phis in non-exit blocks if they have 1 input.
+ SmallVector<PHINode *, 8> Phis(
+ llvm::make_pointer_range(OuterLoopLatch->phis()));
+ for (PHINode *Phi : Phis) {
+ assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
+ LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
+ << "\n");
+ Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
+ Phi->eraseFromParent();
+ }
+}
+
bool LoopInterchangeTransform::adjustLoopBranches() {
LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
std::vector<DominatorTree::UpdateType> DTUpdates;
@@ -1882,6 +1928,9 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
InnerLoopPreHeader && "Guaranteed by loop-simplify form");
+
+ simplifyLCSSAPhis(OuterLoop, InnerLoop);
+
// Ensure that both preheaders do not contain PHI nodes and have single
// predecessors. This allows us to move them easily. We use
// InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 42d6680..146e7d1 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -111,7 +111,7 @@ static unsigned adjustForEndian(const DataLayout &DL, unsigned VectorWidth,
}
// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr,
// <16 x i1> %mask, <16 x i32> %passthru)
// to a chain of basic blocks, with loading element one-by-one if
// the appropriate mask bit is set
@@ -146,11 +146,10 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence,
CallInst *CI, DomTreeUpdater *DTU,
bool &ModifiedDT) {
Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(1);
+ Value *Src0 = CI->getArgOperand(2);
- const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ const Align AlignVal = CI->getParamAlign(0).valueOrOne();
VectorType *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -290,7 +289,7 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence,
}
// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr,
// <16 x i1> %mask)
// to a chain of basic blocks, that stores element one-by-one if
// the appropriate mask bit is set
@@ -320,10 +319,9 @@ static void scalarizeMaskedStore(const DataLayout &DL, bool HasBranchDivergence,
bool &ModifiedDT) {
Value *Src = CI->getArgOperand(0);
Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(2);
- const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ const Align AlignVal = CI->getParamAlign(1).valueOrOne();
auto *VecType = cast<VectorType>(Src->getType());
Type *EltTy = VecType->getElementType();
@@ -472,9 +470,8 @@ static void scalarizeMaskedGather(const DataLayout &DL,
bool HasBranchDivergence, CallInst *CI,
DomTreeUpdater *DTU, bool &ModifiedDT) {
Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(1);
+ Value *Src0 = CI->getArgOperand(2);
auto *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -483,7 +480,7 @@ static void scalarizeMaskedGather(const DataLayout &DL,
Instruction *InsertPt = CI;
BasicBlock *IfBlock = CI->getParent();
Builder.SetInsertPoint(InsertPt);
- MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ Align AlignVal = CI->getParamAlign(0).valueOrOne();
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
@@ -608,8 +605,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL,
DomTreeUpdater *DTU, bool &ModifiedDT) {
Value *Src = CI->getArgOperand(0);
Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(2);
auto *SrcFVTy = cast<FixedVectorType>(Src->getType());
@@ -623,7 +619,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL,
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
- MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ Align AlignVal = CI->getParamAlign(1).valueOrOne();
unsigned VectorWidth = SrcFVTy->getNumElements();
// Shorten the way if the mask is a vector of constants.
@@ -1125,8 +1121,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
case Intrinsic::masked_load:
// Scalarize unsupported vector masked load
if (TTI.isLegalMaskedLoad(
- CI->getType(),
- cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(),
+ CI->getType(), CI->getParamAlign(0).valueOrOne(),
cast<PointerType>(CI->getArgOperand(0)->getType())
->getAddressSpace()))
return false;
@@ -1135,18 +1130,15 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
case Intrinsic::masked_store:
if (TTI.isLegalMaskedStore(
CI->getArgOperand(0)->getType(),
- cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(),
+ CI->getParamAlign(1).valueOrOne(),
cast<PointerType>(CI->getArgOperand(1)->getType())
->getAddressSpace()))
return false;
scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT);
return true;
case Intrinsic::masked_gather: {
- MaybeAlign MA =
- cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue();
+ Align Alignment = CI->getParamAlign(0).valueOrOne();
Type *LoadTy = CI->getType();
- Align Alignment = DL.getValueOrABITypeAlignment(MA,
- LoadTy->getScalarType());
if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
!TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
return false;
@@ -1154,11 +1146,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
return true;
}
case Intrinsic::masked_scatter: {
- MaybeAlign MA =
- cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue();
+ Align Alignment = CI->getParamAlign(1).valueOrOne();
Type *StoreTy = CI->getArgOperand(0)->getType();
- Align Alignment = DL.getValueOrABITypeAlignment(MA,
- StoreTy->getScalarType());
if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
!TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
Alignment))
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index fa66a03..23e1243 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -227,6 +227,7 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I,
case Instruction::Call:
case Instruction::BitCast:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::AddrSpaceCast:
case Instruction::FPToUI:
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 9693ae6..4947d03 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -634,18 +635,10 @@ private:
/// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
/// changes.
bool mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
+ const ValueLatticeElement &MergeWithV,
ValueLatticeElement::MergeOptions Opts = {
/*MayIncludeUndef=*/false, /*CheckWiden=*/false});
- bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- assert(!V->getType()->isStructTy() &&
- "non-structs should use markConstant");
- return mergeInValue(ValueState[V], V, MergeWithV, Opts);
- }
-
/// getValueState - Return the ValueLatticeElement object that corresponds to
/// the value. This function handles the case when the value hasn't been seen
/// yet by properly seeding constants etc.
@@ -768,6 +761,7 @@ private:
void handleCallArguments(CallBase &CB);
void handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO, unsigned Idx);
+ bool isInstFullyOverDefined(Instruction &Inst);
private:
friend class InstVisitor<SCCPInstVisitor>;
@@ -987,7 +981,7 @@ public:
void trackValueOfArgument(Argument *A) {
if (A->getType()->isStructTy())
return (void)markOverdefined(A);
- mergeInValue(A, getArgAttributeVL(A));
+ mergeInValue(ValueState[A], A, getArgAttributeVL(A));
}
bool isStructLatticeConstant(Function *F, StructType *STy);
@@ -1128,8 +1122,7 @@ bool SCCPInstVisitor::isStructLatticeConstant(Function *F, StructType *STy) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
assert(It != TrackedMultipleRetVals.end());
- ValueLatticeElement LV = It->second;
- if (!SCCPSolver::isConstant(LV))
+ if (!SCCPSolver::isConstant(It->second))
return false;
}
return true;
@@ -1160,7 +1153,7 @@ Constant *SCCPInstVisitor::getConstantOrNull(Value *V) const {
std::vector<Constant *> ConstVals;
auto *ST = cast<StructType>(V->getType());
for (unsigned I = 0, E = ST->getNumElements(); I != E; ++I) {
- ValueLatticeElement LV = LVs[I];
+ const ValueLatticeElement &LV = LVs[I];
ConstVals.push_back(SCCPSolver::isConstant(LV)
? getConstant(LV, ST->getElementType(I))
: UndefValue::get(ST->getElementType(I)));
@@ -1225,7 +1218,7 @@ void SCCPInstVisitor::visitInstruction(Instruction &I) {
}
bool SCCPInstVisitor::mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
+ const ValueLatticeElement &MergeWithV,
ValueLatticeElement::MergeOptions Opts) {
if (IV.mergeIn(MergeWithV, Opts)) {
pushUsersToWorkList(V);
@@ -1264,7 +1257,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
return;
}
- ValueLatticeElement BCValue = getValueState(BI->getCondition());
+ const ValueLatticeElement &BCValue = getValueState(BI->getCondition());
ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType());
if (!CI) {
// Overdefined condition variables, and branches on unfoldable constant
@@ -1326,7 +1319,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
// the target as executable.
if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
// Casts are folded by visitCastInst.
- ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
+ const ValueLatticeElement &IBRValue = getValueState(IBR->getAddress());
BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(
getConstant(IBRValue, IBR->getAddress()->getType()));
if (!Addr) { // Overdefined or unknown condition?
@@ -1383,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
void SCCPInstVisitor::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
- unsigned NumActiveIncoming = 0;
+ if (isInstFullyOverDefined(PN))
+ return;
+ SmallVector<unsigned> FeasibleIncomingIndices;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+ FeasibleIncomingIndices.push_back(i);
+ }
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is a constant
// range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
+ if (StructType *STy = dyn_cast<StructType>(PN.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement PhiState = getStructValueState(&PN, i);
+ if (PhiState.isOverdefined())
+ continue;
+ for (unsigned j : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV =
+ getStructValueState(PN.getIncomingValue(j), i);
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i);
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
+ }
+ } else {
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
}
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- mergeInValue(&PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- ValueLatticeElement &PhiStateRef = getValueState(&PN);
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
void SCCPInstVisitor::visitReturnInst(ReturnInst &I) {
@@ -1481,7 +1491,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
}
}
- ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ const ValueLatticeElement &OpSt = getValueState(I.getOperand(0));
if (OpSt.isUnknownOrUndef())
return;
@@ -1496,9 +1506,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
if (I.getDestTy()->isIntOrIntVectorTy() &&
I.getSrcTy()->isIntOrIntVectorTy() &&
I.getOpcode() != Instruction::BitCast) {
- auto &LV = getValueState(&I);
ConstantRange OpRange =
OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false);
+ auto &LV = getValueState(&I);
Type *DestTy = I.getDestTy();
ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits());
@@ -1516,19 +1526,24 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO,
unsigned Idx) {
Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
- ValueLatticeElement L = getValueState(LHS);
- ValueLatticeElement R = getValueState(RHS);
+ Type *Ty = LHS->getType();
+
addAdditionalUser(LHS, &EVI);
addAdditionalUser(RHS, &EVI);
- if (L.isUnknownOrUndef() || R.isUnknownOrUndef())
- return; // Wait to resolve.
- Type *Ty = LHS->getType();
+ const ValueLatticeElement &L = getValueState(LHS);
+ if (L.isUnknownOrUndef())
+ return; // Wait to resolve.
ConstantRange LR = L.asConstantRange(Ty, /*UndefAllowed=*/false);
+
+ const ValueLatticeElement &R = getValueState(RHS);
+ if (R.isUnknownOrUndef())
+ return; // Wait to resolve.
+
ConstantRange RR = R.asConstantRange(Ty, /*UndefAllowed=*/false);
if (Idx == 0) {
ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR);
- mergeInValue(&EVI, ValueLatticeElement::getRange(Res));
+ mergeInValue(ValueState[&EVI], &EVI, ValueLatticeElement::getRange(Res));
} else {
assert(Idx == 1 && "Index can only be 0 or 1");
ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
@@ -1560,7 +1575,7 @@ void SCCPInstVisitor::visitExtractValueInst(ExtractValueInst &EVI) {
if (auto *WO = dyn_cast<WithOverflowInst>(AggVal))
return handleExtractOfWithOverflow(EVI, WO, i);
ValueLatticeElement EltVal = getStructValueState(AggVal, i);
- mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ mergeInValue(ValueState[&EVI], &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
return (void)markOverdefined(&EVI);
@@ -1616,14 +1631,18 @@ void SCCPInstVisitor::visitSelectInst(SelectInst &I) {
if (ValueState[&I].isOverdefined())
return (void)markOverdefined(&I);
- ValueLatticeElement CondValue = getValueState(I.getCondition());
+ const ValueLatticeElement &CondValue = getValueState(I.getCondition());
if (CondValue.isUnknownOrUndef())
return;
if (ConstantInt *CondCB =
getConstantInt(CondValue, I.getCondition()->getType())) {
Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
- mergeInValue(&I, getValueState(OpVal));
+ const ValueLatticeElement &OpValState = getValueState(OpVal);
+ // Safety: ValueState[&I] doesn't invalidate OpValState since it is already
+ // in the map.
+ assert(ValueState.contains(&I) && "&I is not in ValueState map.");
+ mergeInValue(ValueState[&I], &I, OpValState);
return;
}
@@ -1721,7 +1740,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
// being a special floating value.
ValueLatticeElement NewV;
NewV.markConstant(C, /*MayIncludeUndef=*/true);
- return (void)mergeInValue(&I, NewV);
+ return (void)mergeInValue(ValueState[&I], &I, NewV);
}
}
@@ -1741,7 +1760,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind());
else
R = A.binaryOp(BO->getOpcode(), B);
- mergeInValue(&I, ValueLatticeElement::getRange(R));
+ mergeInValue(ValueState[&I], &I, ValueLatticeElement::getRange(R));
// TODO: Currently we do not exploit special values that produce something
// better than overdefined with an overdefined operand for vector or floating
@@ -1767,7 +1786,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) {
if (C) {
ValueLatticeElement CV;
CV.markConstant(C);
- mergeInValue(&I, CV);
+ mergeInValue(ValueState[&I], &I, CV);
return;
}
@@ -1802,7 +1821,7 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
Operands.reserve(I.getNumOperands());
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
- ValueLatticeElement State = getValueState(I.getOperand(i));
+ const ValueLatticeElement &State = getValueState(I.getOperand(i));
if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
@@ -1881,14 +1900,13 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) {
if (ValueState[&I].isOverdefined())
return (void)markOverdefined(&I);
- ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
+ const ValueLatticeElement &PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknownOrUndef())
return; // The pointer is not resolved yet!
- ValueLatticeElement &IV = ValueState[&I];
-
if (SCCPSolver::isConstant(PtrVal)) {
Constant *Ptr = getConstant(PtrVal, I.getOperand(0)->getType());
+ ValueLatticeElement &IV = ValueState[&I];
// load null is undefined.
if (isa<ConstantPointerNull>(Ptr)) {
@@ -1916,7 +1934,7 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) {
}
// Fall back to metadata.
- mergeInValue(&I, getValueFromMetadata(&I));
+ mergeInValue(ValueState[&I], &I, getValueFromMetadata(&I));
}
void SCCPInstVisitor::visitCallBase(CallBase &CB) {
@@ -1944,7 +1962,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) {
return markOverdefined(&CB); // Can't handle struct args.
if (A.get()->getType()->isMetadataTy())
continue; // Carried in CB, not allowed in Operands.
- ValueLatticeElement State = getValueState(A);
+ const ValueLatticeElement &State = getValueState(A);
if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
@@ -1964,7 +1982,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) {
}
// Fall back to metadata.
- mergeInValue(&CB, getValueFromMetadata(&CB));
+ mergeInValue(ValueState[&CB], &CB, getValueFromMetadata(&CB));
}
void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
@@ -1992,10 +2010,11 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
getMaxWidenStepsOpts());
}
- } else
- mergeInValue(&*AI,
- getValueState(*CAI).intersect(getArgAttributeVL(&*AI)),
- getMaxWidenStepsOpts());
+ } else {
+ ValueLatticeElement CallArg =
+ getValueState(*CAI).intersect(getArgAttributeVL(&*AI));
+ mergeInValue(ValueState[&*AI], &*AI, CallArg, getMaxWidenStepsOpts());
+ }
}
}
}
@@ -2076,7 +2095,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
if (II->getIntrinsicID() == Intrinsic::vscale) {
unsigned BitWidth = CB.getType()->getScalarSizeInBits();
const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth);
- return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
+ return (void)mergeInValue(ValueState[II], II,
+ ValueLatticeElement::getRange(Result));
}
if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
@@ -2094,7 +2114,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
ConstantRange Result =
ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges);
- return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
+ return (void)mergeInValue(ValueState[II], II,
+ ValueLatticeElement::getRange(Result));
}
}
@@ -2121,10 +2142,25 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
return handleCallOverdefined(CB); // Not tracking this callee.
// If so, propagate the return value of the callee into this call result.
- mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
+ mergeInValue(ValueState[&CB], &CB, TFRVI->second, getMaxWidenStepsOpts());
}
}
+bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) {
+ // For structure Type, we handle each member separately.
+ // A structure object won't be considered as overdefined when
+ // there is at least one member that is not overdefined.
+ if (StructType *STy = dyn_cast<StructType>(Inst.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) {
+ if (!getStructValueState(&Inst, i).isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+ return getValueState(&Inst).isOverdefined();
+}
+
void SCCPInstVisitor::solve() {
// Process the work lists until they are empty!
while (!BBWorkList.empty() || !InstWorkList.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 7651ba1..3fed003 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -325,6 +325,8 @@ public:
VPIRFlags Flags;
if (Opcode == Instruction::Trunc)
Flags = VPIRFlags::TruncFlagsTy(false, false);
+ else if (Opcode == Instruction::ZExt)
+ Flags = VPIRFlags::NonNegFlagsTy(false);
return tryInsertInstruction(
new VPWidenCastRecipe(Opcode, Op, ResultTy, Flags));
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 280eb20..adf27be 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1011,6 +1011,10 @@ public:
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
+ // Truncs must truncate at most to their destination type.
+ if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
+ I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
+ return false;
return VF.isVector() && MinBWs.contains(I) &&
!isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
@@ -7192,7 +7196,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
- VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
+ VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks,
+ BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
bool HasBranchWeights =
@@ -7226,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
return DenseMap<const SCEV *, Value *>();
}
- VPlanTransforms::narrowInterleaveGroups(
- BestVPlan, BestVF,
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8197,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
+
+ if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
+ VPlans.push_back(std::move(P));
+
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b62c8f1..cdb9e7e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,49 @@ public:
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+ Align Alignment, const int64_t Diff,
+ const size_t Sz) const;
+
+ /// Return true if an array of scalar loads can be replaced with a strided
+ /// load (with constant stride).
+ ///
+ /// TODO:
+ /// It is possible that the load gets "widened". Suppose that originally each
+ /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
+ /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
+ /// ...
+ /// %b + 0 * %s + (w - 1)
+ ///
+ /// %b + 1 * %s + 0
+ /// %b + 1 * %s + 1
+ /// %b + 1 * %s + 2
+ /// ...
+ /// %b + 1 * %s + (w - 1)
+ /// ...
+ ///
+ /// %b + (n - 1) * %s + 0
+ /// %b + (n - 1) * %s + 1
+ /// %b + (n - 1) * %s + 2
+ /// ...
+ /// %b + (n - 1) * %s + (w - 1)
+ ///
+ /// In this case we will generate a strided load of type `<n x (k * w)>`.
+ ///
+ /// \param PointerOps list of pointer arguments of loads.
+ /// \param ElemTy original scalar type of loads.
+ /// \param Alignment alignment of the first load.
+ /// \param SortedIndices is the order of PointerOps as returned by
+ /// `sortPtrAccesses`
+ /// \param Diff Pointer difference between the lowest and the highes pointer
+ /// in `PointerOps` as returned by `getPointersDiff`.
+ /// \param Ptr0 first pointer in `PointersOps`.
+ /// \param PtrN last pointer in `PointersOps`.
+ /// \param SPtrInfo If the function return `true`, it also sets all the fields
+ /// of `SPtrInfo` necessary to generate the strided load later.
+ bool analyzeConstantStrideCandidate(
+ const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
+ const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+ Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
/// Return true if an array of scalar loads can be replaced with a strided
/// load (with run-time stride).
@@ -5302,7 +5343,7 @@ private:
unsigned &OpCnt =
OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
EdgeInfo EI(TE, U.getOperandNo());
- if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
+ if (!getScheduleCopyableData(EI, Op))
continue;
// Found copyable operand - continue.
++OpCnt;
@@ -5536,62 +5577,79 @@ private:
}
// Decrement the unscheduled counter and insert to ready list if
// ready.
- auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
- unsigned OpIdx) {
- if (!ScheduleCopyableDataMap.empty()) {
- const EdgeInfo EI = {UserTE, OpIdx};
- if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
- DecrUnsched(CD, /*IsControl=*/false);
- return;
- }
- }
- auto It = OperandsUses.find(I);
- assert(It != OperandsUses.end() && "Operand not found");
- if (It->second > 0) {
- --It->getSecond();
- assert(TotalOpCount > 0 && "No more operands to decrement");
- --TotalOpCount;
- if (ScheduleData *OpSD = getScheduleData(I))
- DecrUnsched(OpSD, /*IsControl=*/false);
- }
- };
+ auto DecrUnschedForInst =
+ [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
+ SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
+ &Checked) {
+ if (!ScheduleCopyableDataMap.empty()) {
+ const EdgeInfo EI = {UserTE, OpIdx};
+ if (ScheduleCopyableData *CD =
+ getScheduleCopyableData(EI, I)) {
+ if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
+ return;
+ DecrUnsched(CD, /*IsControl=*/false);
+ return;
+ }
+ }
+ auto It = OperandsUses.find(I);
+ assert(It != OperandsUses.end() && "Operand not found");
+ if (It->second > 0) {
+ --It->getSecond();
+ assert(TotalOpCount > 0 && "No more operands to decrement");
+ --TotalOpCount;
+ if (ScheduleData *OpSD = getScheduleData(I)) {
+ if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
+ return;
+ DecrUnsched(OpSD, /*IsControl=*/false);
+ }
+ }
+ };
for (ScheduleBundle *Bundle : Bundles) {
if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
break;
// Need to search for the lane since the tree entry can be
// reordered.
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
- find(Bundle->getTreeEntry()->Scalars, In));
- assert(Lane >= 0 && "Lane not set");
- if (isa<StoreInst>(In) &&
- !Bundle->getTreeEntry()->ReorderIndices.empty())
- Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
- assert(Lane < static_cast<int>(
- Bundle->getTreeEntry()->Scalars.size()) &&
- "Couldn't find extract lane");
-
- // Since vectorization tree is being built recursively this
- // assertion ensures that the tree entry has all operands set before
- // reaching this code. Couple of exceptions known at the moment are
- // extracts where their second (immediate) operand is not added.
- // Since immediates do not affect scheduler behavior this is
- // considered okay.
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
- In->getNumOperands() ==
- Bundle->getTreeEntry()->getNumOperands() ||
- Bundle->getTreeEntry()->isCopyableElement(In)) &&
- "Missed TreeEntry operands?");
-
- for (unsigned OpIdx :
- seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
- if (auto *I = dyn_cast<Instruction>(
- Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
- LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
- << "\n");
- DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
- }
+ auto *It = find(Bundle->getTreeEntry()->Scalars, In);
+ SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
+ do {
+ int Lane =
+ std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(In) &&
+ !Bundle->getTreeEntry()->ReorderIndices.empty())
+ Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(
+ Bundle->getTreeEntry()->Scalars.size()) &&
+ "Couldn't find extract lane");
+
+ // Since vectorization tree is being built recursively this
+ // assertion ensures that the tree entry has all operands set
+ // before reaching this code. Couple of exceptions known at the
+ // moment are extracts where their second (immediate) operand is
+ // not added. Since immediates do not affect scheduler behavior
+ // this is considered okay.
+ assert(In &&
+ (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
+ In->getNumOperands() ==
+ Bundle->getTreeEntry()->getNumOperands() ||
+ Bundle->getTreeEntry()->isCopyableElement(In)) &&
+ "Missed TreeEntry operands?");
+
+ for (unsigned OpIdx :
+ seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
+ if (auto *I = dyn_cast<Instruction>(
+ Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
+ LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
+ << *I << "\n");
+ DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
+ }
+ // If parent node is schedulable, it will be handle correctly.
+ if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
+ break;
+ It = std::find(std::next(It),
+ Bundle->getTreeEntry()->Scalars.end(), In);
+ } while (It != Bundle->getTreeEntry()->Scalars.end());
}
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
@@ -6849,9 +6907,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const {
- const size_t Sz = PointerOps.size();
+ Align Alignment, const int64_t Diff,
+ const size_t Sz) const {
if (Diff % (Sz - 1) != 0)
return false;
@@ -6875,27 +6932,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
return false;
if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
return false;
+ return true;
+ }
+ return false;
+}
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
- break;
- }
- if (Dists.size() == Sz) {
- Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
- SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
- return true;
- }
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
+ const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+ Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+ const size_t Sz = PointerOps.size();
+ if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
+ return false;
+
+ int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
+
+ // Iterate through all pointers and check if all distances are
+ // unique multiple of Dist.
+ SmallSet<int64_t, 4> Dists;
+ for (Value *Ptr : PointerOps) {
+ int64_t Dist = 0;
+ if (Ptr == PtrN)
+ Dist = Diff;
+ else if (Ptr != Ptr0)
+ Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
+ // If the strides are not the same or repeated, we can't
+ // vectorize.
+ if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+ break;
+ }
+ if (Dists.size() == Sz) {
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+ SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
+ return true;
}
return false;
}
@@ -6995,8 +7065,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
Align Alignment =
cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
->getAlign();
- if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
- SPtrInfo))
+ if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
+ *Diff, Ptr0, PtrN, SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -10493,8 +10563,11 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons/copyable operations are still valid
// for vectorization (div/rem are not allowed).
- if (!S.areInstructionsWithCopyableElements() &&
- !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if ((!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
+ (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
+ (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
+ isa<CallInst>(S.getMainOp())))) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index d167009..c95c887 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -217,32 +217,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
-bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
- const VPDominatorTree &VPDT) {
- auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
- if (!VPBB)
- return false;
-
- // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
- // VPBB as its entry, i.e., free of predecessors.
- if (auto *R = VPBB->getParent())
- return !R->isReplicator() && !VPBB->hasPredecessors();
-
- // A header dominates its second predecessor (the latch), with the other
- // predecessor being the preheader
- return VPB->getPredecessors().size() == 2 &&
- VPDT.dominates(VPB, VPB->getPredecessors()[1]);
-}
-
-bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
- const VPDominatorTree &VPDT) {
- // A latch has a header as its second successor, with its other successor
- // leaving the loop. A preheader OTOH has a header as its first (and only)
- // successor.
- return VPB->getNumSuccessors() == 2 &&
- VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
-}
-
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && It->isPhi())
@@ -768,8 +742,12 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
VPRegionBlock *VPRegionBlock::clone() {
const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
- auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
- getName(), isReplicator());
+ VPlan &Plan = *getPlan();
+ VPRegionBlock *NewRegion =
+ isReplicator()
+ ? Plan.createReplicateRegion(NewEntry, NewExiting, getName())
+ : Plan.createLoopRegion(getName(), NewEntry, NewExiting);
+
for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
Block->setParent(NewRegion);
return NewRegion;
@@ -1213,6 +1191,7 @@ VPlan *VPlan::duplicate() {
}
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
Old2NewVPValues[&VF] = &NewPlan->VF;
+ Old2NewVPValues[&UF] = &NewPlan->UF;
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
if (BackedgeTakenCount) {
NewPlan->BackedgeTakenCount = new VPValue();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..167ba55 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -1003,6 +1007,11 @@ public:
/// Creates a fixed-width vector containing all operands. The number of
/// operands matches the vector element count.
BuildVector,
+ /// Extracts all lanes from its (non-scalable) vector operand. This is an
+ /// abstract VPInstruction whose single defined VPValue represents VF
+ /// scalars extracted from a vector, to be replaced by VF ExtractElement
+ /// VPInstructions.
+ Unpack,
/// Compute the final result of a AnyOf reduction with select(cmp(),x,y),
/// where one of (x,y) is loop invariant, and both x and y are integer type.
ComputeAnyOfResult,
@@ -2711,6 +2720,15 @@ public:
return R && classof(R);
}
+ static inline bool classof(const VPValue *VPV) {
+ const VPRecipeBase *R = VPV->getDefiningRecipe();
+ return R && classof(R);
+ }
+
+ static inline bool classof(const VPSingleDefRecipe *R) {
+ return classof(static_cast<const VPRecipeBase *>(R));
+ }
+
/// Generate the reduction in the loop.
void execute(VPTransformState &State) override;
@@ -3096,6 +3114,9 @@ public:
/// Returns true if this expression contains recipes that may have side
/// effects.
bool mayHaveSideEffects() const;
+
+ /// Returns true if the result of this VPExpressionRecipe is a single-scalar.
+ bool isSingleScalar() const;
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
@@ -4075,6 +4096,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions take
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
@@ -4123,6 +4152,9 @@ class VPlan {
/// Represents the vectorization factor of the loop.
VPValue VF;
+ /// Represents the symbolic unroll factor of the loop.
+ VPValue UF;
+
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;
@@ -4276,6 +4308,9 @@ public:
VPValue &getVF() { return VF; };
const VPValue &getVF() const { return VF; };
+ /// Returns the symbolic UF of the vector loop region.
+ VPValue &getSymbolicUF() { return UF; };
+
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
@@ -4285,6 +4320,12 @@ public:
void addVF(ElementCount VF) { VFs.insert(VF); }
+ /// Remove \p VF from the plan.
+ void removeVF(ElementCount VF) {
+ assert(hasVF(VF) && "tried to remove VF not present in plan");
+ VFs.remove(VF);
+ }
+
void setVF(ElementCount VF) {
assert(hasVF(VF) && "Cannot set VF not already in plan");
VFs.clear();
@@ -4409,22 +4450,24 @@ public:
return VPB;
}
- /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
- /// IsReplicator is true, the region is a replicate region. The returned block
- /// is owned by the VPlan and deleted once the VPlan is destroyed.
- VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
- const std::string &Name = "",
- bool IsReplicator = false) {
- auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+ /// Create a new loop region with \p Name and entry and exiting blocks set
+ /// to \p Entry and \p Exiting respectively, if set. The returned block is
+ /// owned by the VPlan and deleted once the VPlan is destroyed.
+ VPRegionBlock *createLoopRegion(const std::string &Name = "",
+ VPBlockBase *Entry = nullptr,
+ VPBlockBase *Exiting = nullptr) {
+ auto *VPB = Entry ? new VPRegionBlock(Entry, Exiting, Name)
+ : new VPRegionBlock(Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
- /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set
- /// to nullptr. The returned block is owned by the VPlan and deleted once the
- /// VPlan is destroyed.
- VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
- auto *VPB = new VPRegionBlock(Name);
+ /// Create a new replicate region with \p Entry, \p Exiting and \p Name. The
+ /// returned block is owned by the VPlan and deleted once the VPlan is
+ /// destroyed.
+ VPRegionBlock *createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting,
+ const std::string &Name = "") {
+ auto *VPB = new VPRegionBlock(Entry, Exiting, Name, true);
CreatedBlocks.push_back(VPB);
return VPB;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..80a2e4b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::AnyOf:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
+ case VPInstruction::Unpack:
return SetResultTyFromOp();
case VPInstruction::ExtractLane:
return inferScalarType(R->getOperand(1));
@@ -377,7 +378,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 332791a..65688a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -406,7 +406,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
// LatchExitVPB, taking care to preserve the original predecessor & successor
// order of blocks. Set region entry and exiting after both HeaderVPB and
// LatchVPBB have been disconnected from their predecessors/successors.
- auto *R = Plan.createVPRegionBlock();
+ auto *R = Plan.createLoopRegion();
VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
VPBlockUtils::disconnectBlocks(LatchVPBB, R);
VPBlockUtils::connectBlocks(PreheaderVPBB, R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d8203e2..b5b98c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}
+template <typename Op0_t, typename Op1_t>
+inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t>
+m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
m_ExtractLastLanePerPart(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..1f1b42b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
+ case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::VScale:
+ case VPInstruction::Unpack:
return false;
default:
return true;
@@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::WidePtrAdd:
- return Op == getOperand(0);
+ // WidePtrAdd supports scalar and vector base addresses.
+ return false;
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
@@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::Unpack:
+ O << "unpack";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -2352,7 +2358,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const {
return false;
}
+bool VPExpressionRecipe::isSingleScalar() const {
+ // Cannot use vputils::isSingleScalar(), because all external operands
+ // of the expression will be live-ins while bundled.
+ return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
+ !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
@@ -3076,7 +3089,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3149,7 +3162,17 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
- if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur))
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
+ // Skip blends that use V only through a compare by checking if any incoming
+ // value was already visited.
+ if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
+ [&](unsigned I) {
+ return Seen.contains(
+ Blend->getIncomingValue(I)->getDefiningRecipe());
+ }))
continue;
for (VPUser *U : Cur->users()) {
@@ -3170,7 +3193,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
}
}
- append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ // The legacy cost model only supports scalarization loads/stores with phi
+ // addresses, if the phi is directly used as load/store address. Don't
+ // traverse further for Blends.
+ if (Blend)
+ continue;
+
+ append_range(WorkList, Cur->users());
}
return false;
}
@@ -3268,7 +3297,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3313,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..ff25ef5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -106,7 +106,7 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
return false;
NewRecipe = new VPWidenIntrinsicRecipe(
*CI, getVectorIntrinsicIDForCall(CI, &TLI),
- {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
+ drop_end(Ingredient.operands()), CI->getType(),
CI->getDebugLoc());
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
@@ -356,8 +356,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
// Replace predicated replicate recipe with a replicate recipe without a
// mask but in the replicate region.
auto *RecipeWithoutMask = new VPReplicateRecipe(
- PredRecipe->getUnderlyingInstr(),
- make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
+ PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
auto *Pred =
Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
@@ -373,7 +372,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
auto *Exiting =
Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
VPRegionBlock *Region =
- Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
+ Plan.createReplicateRegion(Entry, Exiting, RegionName);
// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -939,7 +938,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
continue;
if (!isDeadRecipe(*R))
continue;
- WorkList.append(R->op_begin(), R->op_end());
+ append_range(WorkList, R->operands());
R->eraseFromParent();
}
}
@@ -1224,6 +1223,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ uint64_t Idx;
+ if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+ auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+ Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
+ return;
+ }
+
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -1472,11 +1478,8 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
if (!Plan.getVectorLoopRegion())
return false;
- if (!Plan.getTripCount()->isLiveIn())
- return false;
- auto *TC = dyn_cast_if_present<ConstantInt>(
- Plan.getTripCount()->getUnderlyingValue());
- if (!TC || !BestVF.isFixed())
+ const APInt *TC;
+ if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
return false;
// Calculate the minimum power-of-2 bit width that can fit the known TC, VF
@@ -1489,7 +1492,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
};
unsigned NewBitWidth =
- ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
+ ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
LLVMContext &Ctx = Plan.getContext();
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
@@ -1858,8 +1861,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2006,7 +2009,7 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
return std::make_pair(true, I->getVectorIntrinsicID());
})
- .Case<VPVectorPointerRecipe>([](auto *I) {
+ .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
// For recipes that do not directly map to LLVM IR instructions,
// assign opcodes after the last VPInstruction opcode (which is also
// after the last IR Instruction opcode), based on the VPDefID.
@@ -2083,6 +2086,15 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
LFlags->getPredicate() !=
cast<VPRecipeWithIRFlags>(R)->getPredicate())
return false;
+ // Recipes in replicate regions implicitly depend on predicate. If either
+ // recipe is in a replicate region, only consider them equal if both have
+ // the same parent.
+ const VPRegionBlock *RegionL = L->getRegion();
+ const VPRegionBlock *RegionR = R->getRegion();
+ if (((RegionL && RegionL->isReplicator()) ||
+ (RegionR && RegionR->isReplicator())) &&
+ L->getParent() != R->getParent())
+ return false;
const VPlan *Plan = L->getParent()->getPlan();
VPTypeAnalysis TypeInfo(*Plan);
return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
@@ -2898,7 +2910,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3780,7 +3792,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
-void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;
@@ -3803,8 +3815,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
@@ -3829,6 +3840,49 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
});
}
}
+
+ // Create explicit VPInstructions to convert vectors to scalars. The current
+ // implementation is conservative - it may miss some cases that may or may not
+ // be vector values. TODO: introduce Unpacks speculatively - remove them later
+ // if they are known to operate on scalar values.
+ for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
+ VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ for (VPValue *Def : R.definedValues()) {
+ // Skip recipes that are single-scalar or only have their first lane
+ // used.
+ // TODO: The Defs skipped here may or may not be vector values.
+ // Introduce Unpacks, and remove them later, if they are guaranteed to
+ // produce scalar values.
+ if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
+ continue;
+
+ // At the moment, we create unpacks only for scalar users outside
+ // replicate regions. Recipes inside replicate regions still extract the
+ // required lanes implicitly.
+ // TODO: Remove once replicate regions are unrolled completely.
+ auto IsCandidateUnpackUser = [Def](VPUser *U) {
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
+ return U->usesScalars(Def) &&
+ (!ParentRegion || !ParentRegion->isReplicator());
+ };
+ if (none_of(Def->users(), IsCandidateUnpackUser))
+ continue;
+
+ auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
+ if (R.isPhi())
+ Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
+ else
+ Unpack->insertAfter(&R);
+ Def->replaceUsesWithIf(Unpack,
+ [&IsCandidateUnpackUser](VPUser &U, unsigned) {
+ return IsCandidateUnpackUser(&U);
+ });
+ }
+ }
+ }
}
void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
@@ -3902,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
// used.
// TODO: Assert that they aren't used.
+ VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
// If there are no users of the runtime VF, compute VFxUF by constant folding
// the multiplication of VF and UF.
if (VF.getNumUsers() == 0) {
@@ -3921,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -3989,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
- unsigned VF, VPTypeAnalysis &TypeInfo,
- unsigned VectorRegWidth) {
+/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
+/// number of members both equal to VF. The interleave group must also access
+/// the full vector width.
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+ VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+ VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
if (!InterleaveR)
- return false;
+ return std::nullopt;
Type *GroupElementTy = nullptr;
if (InterleaveR->getStoredValues().empty()) {
@@ -4005,7 +4061,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
} else {
GroupElementTy =
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4013,13 +4069,27 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
[&TypeInfo, GroupElementTy](VPValue *Op) {
return TypeInfo.inferScalarType(Op) == GroupElementTy;
}))
- return false;
+ return std::nullopt;
}
- unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
- auto IG = InterleaveR->getInterleaveGroup();
- return IG->getFactor() == VF && IG->getNumMembers() == VF &&
- GroupSize == VectorRegWidth;
+ auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+ TypeSize Size = TTI.getRegisterBitWidth(
+ VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+ : TargetTransformInfo::RGK_ScalableVector);
+ assert(Size.isScalable() == VF.isScalable() &&
+ "if Size is scalable, VF must to and vice versa");
+ return Size.getKnownMinValue();
+ };
+
+ for (ElementCount VF : VFs) {
+ unsigned MinVal = VF.getKnownMinValue();
+ unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+ auto IG = InterleaveR->getInterleaveGroup();
+ if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
+ GroupSize == GetVectorWidthForVF(VF))
+ return {VF};
+ }
+ return std::nullopt;
}
/// Returns true if \p VPValue is a narrow VPValue.
@@ -4030,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
return RepR && RepR->isSingleScalar();
}
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+ const TargetTransformInfo &TTI) {
+ using namespace llvm::VPlanPatternMatch;
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
if (!VectorLoop)
- return;
+ return nullptr;
VPTypeAnalysis TypeInfo(Plan);
-
- unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
+ std::optional<ElementCount> VFToOptimize;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
continue;
@@ -4053,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// * recipes writing to memory except interleave groups
// Only support plans with a canonical induction phi.
if (R.isPhi())
- return;
+ return nullptr;
auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
if (R.mayWriteToMemory() && !InterleaveR)
- return;
-
- // Do not narrow interleave groups if there are VectorPointer recipes and
- // the plan was unrolled. The recipe implicitly uses VF from
- // VPTransformState.
- // TODO: Remove restriction once the VF for the VectorPointer offset is
- // modeled explicitly as operand.
- if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
- return;
+ return nullptr;
// All other ops are allowed, but we reject uses that cannot be converted
// when checking all allowed consumers (store interleave groups) below.
if (!InterleaveR)
continue;
- // Bail out on non-consecutive interleave groups.
- if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
- VectorRegWidth))
- return;
-
+ // Try to find a single VF, where all interleave groups are consecutive and
+ // saturate the full vector width. If we already have a candidate VF, check
+ // if it is applicable for the current InterleaveR, otherwise look for a
+ // suitable VF across the Plans VFs.
+ //
+ if (VFToOptimize) {
+ if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
+ TTI))
+ return nullptr;
+ } else {
+ if (auto VF = isConsecutiveInterleaveGroup(
+ InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
+ VFToOptimize = *VF;
+ else
+ return nullptr;
+ }
// Skip read interleave groups.
if (InterleaveR->getStoredValues().empty())
continue;
@@ -4110,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
if (!WideMember0)
- return;
+ return nullptr;
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
R->getNumOperands() > 2)
- return;
+ return nullptr;
if (any_of(enumerate(R->operands()),
[WideMember0, Idx = I](const auto &P) {
const auto &[OpIdx, OpV] = P;
return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
}))
- return;
+ return nullptr;
}
StoreGroups.push_back(InterleaveR);
}
if (StoreGroups.empty())
- return;
+ return nullptr;
+
+ // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+ // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+ // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
+ std::unique_ptr<VPlan> NewPlan;
+ if (size(Plan.vectorFactors()) != 1) {
+ NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+ Plan.setVF(*VFToOptimize);
+ NewPlan->removeVF(*VFToOptimize);
+ }
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4198,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
- VPValue *UF = Plan.getOrAddLiveIn(
- ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
- if (VF.isScalable()) {
+ VPValue *UF = &Plan.getSymbolicUF();
+ if (VFToOptimize->isScalable()) {
VPValue *VScale = PHBuilder.createElementCount(
CanIV->getScalarType(), ElementCount::getScalable(1));
VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4212,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
}
removeDeadRecipes(Plan);
+ assert(none_of(*VectorLoop->getEntryBasicBlock(),
+ IsaPred<VPVectorPointerRecipe>) &&
+ "All VPVectorPointerRecipes should have been removed");
+ return NewPlan;
}
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bb..ca8d956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -325,9 +325,10 @@ struct VPlanTransforms {
static void materializeBackedgeTakenCount(VPlan &Plan,
VPBasicBlock *VectorPH);
- /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
- /// values into single vectors.
- static void materializeBuildVectors(VPlan &Plan);
+ /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
+ /// into vectors and Unpack recipes to extract scalars from vectors as
+ /// needed.
+ static void materializePacksAndUnpacks(VPlan &Plan);
/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
@@ -340,14 +341,20 @@ struct VPlanTransforms {
static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
ScalarEvolution &SE);
- /// Try to convert a plan with interleave groups with VF elements to a plan
- /// with the interleave groups replaced by wide loads and stores processing VF
- /// elements, if all transformed interleave groups access the full vector
- /// width (checked via \o VectorRegWidth). This effectively is a very simple
- /// form of loop-aware SLP, where we use interleave groups to identify
- /// candidates.
- static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
- unsigned VectorRegWidth);
+ /// Try to find a single VF among \p Plan's VFs for which all interleave
+ /// groups (with known minimum VF elements) can be replaced by wide loads and
+ /// stores processing VF elements, if all transformed interleave groups access
+ /// the full vector width (checked via the maximum vector register width). If
+ /// the transformation can be applied, the original \p Plan will be split in
+ /// 2:
+ /// 1. The original Plan with the single VF containing the optimized recipes
+ /// using wide loads instead of interleave groups.
+ /// 2. A new clone which contains all VFs of Plan except the optimized VF.
+ ///
+ /// This effectively is a very simple form of loop-aware SLP, where we use
+ /// interleave groups to identify candidates.
+ static std::unique_ptr<VPlan>
+ narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
/// Predicate and linearize the control-flow in the only loop region of
/// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e..cfd1a74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \DefR.
-static VPRecipeWithIRFlags *
+static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
VPRecipeWithIRFlags *DefR, VPLane Lane,
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
+ VPValue *Op;
+ if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
+ auto LaneDefs = Def2LaneDefs.find(Op);
+ if (LaneDefs != Def2LaneDefs.end())
+ return LaneDefs->second[Lane.getKnownLane()];
+
+ VPValue *Idx =
+ Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+ }
+
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : DefR->operands()) {
@@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ // Look through mandatory Unpack.
+ [[maybe_unused]] bool Matched =
+ match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
+ assert(Matched && "original op must have been Unpack");
NewOps.push_back(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
@@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
- !cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
+ !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
+ cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 8b1b0e5..32e4b88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -8,6 +8,7 @@
#include "VPlanUtils.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -113,12 +114,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return TypeSwitch<const VPRecipeBase *, bool>(R)
.Case<VPDerivedIVRecipe>([](const auto *R) { return true; })
.Case<VPReplicateRecipe>([](const auto *R) {
- // Loads and stores that are uniform across VF lanes are handled by
- // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if
- // all their operands are invariant.
- // TODO: Further relax the restrictions.
+ // Be conservative about side-effects, except for the
+ // known-side-effecting assumes and stores, which we know will be
+ // uniform.
return R->isSingleScalar() &&
- (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
+ (!R->mayHaveSideEffects() ||
+ isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) &&
all_of(R->operands(), isUniformAcrossVFsAndUFs);
})
.Case<VPInstruction>([](const auto *VPI) {
@@ -253,3 +254,29 @@ vputils::getRecipesForUncountableExit(VPlan &Plan,
return UncountableCondition;
}
+
+bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
+ const VPDominatorTree &VPDT) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+ if (!VPBB)
+ return false;
+
+ // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
+ // VPBB as its entry, i.e., free of predecessors.
+ if (auto *R = VPBB->getParent())
+ return !R->isReplicator() && !VPBB->hasPredecessors();
+
+ // A header dominates its second predecessor (the latch), with the other
+ // predecessor being the preheader
+ return VPB->getPredecessors().size() == 2 &&
+ VPDT.dominates(VPB, VPB->getPredecessors()[1]);
+}
+
+bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
+ const VPDominatorTree &VPDT) {
+ // A latch has a header as its second successor, with its other successor
+ // leaving the loop. A preheader OTOH has a header as its first (and only)
+ // successor.
+ return VPB->getNumSuccessors() == 2 &&
+ VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..840a5b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(PreservesUniformity(VPI->getOpcode()) &&
all_of(VPI->operands(), isSingleScalar));
+ if (isa<VPPartialReductionRecipe>(VPV))
+ return false;
+ if (isa<VPReductionRecipe>(VPV))
+ return true;
+ if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
+ return Expr->isSingleScalar();
// VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
return isa<VPExpandSCEVRecipe>(VPV);