Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 23
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 27
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 24
-rw-r--r--  llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/GVN.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Scalar/GVNSink.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/InferAlignment.cpp | 17
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 43
-rw-r--r--  llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Utils/SCCPSolver.cpp | 96
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 29
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 43
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 80
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 20
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.h | 8
27 files changed, 370 insertions(+), 160 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e1e24a9..dab200d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -289,12 +289,11 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
Value *LoadPtr = II.getArgOperand(0);
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ const Align Alignment = II.getParamAlign(0).valueOrOne();
// If the mask is all ones or undefs, this is a plain vector load of the 1st
// argument.
- if (maskIsAllOneOrUndef(II.getArgOperand(2))) {
+ if (maskIsAllOneOrUndef(II.getArgOperand(1))) {
LoadInst *L = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
"unmaskedload");
L->copyMetadata(II);
@@ -308,7 +307,7 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
LoadInst *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
"unmaskedload");
LI->copyMetadata(II);
- return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
+ return Builder.CreateSelect(II.getArgOperand(1), LI, II.getArgOperand(2));
}
return nullptr;
@@ -319,8 +318,8 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
// * Narrow width by halfs excluding zero/undef lanes
Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
Value *StorePtr = II.getArgOperand(1);
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ Align Alignment = II.getParamAlign(1).valueOrOne();
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
if (!ConstMask)
return nullptr;
@@ -356,7 +355,7 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
// * Narrow width by halfs excluding zero/undef lanes
// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(1));
if (!ConstMask)
return nullptr;
@@ -366,8 +365,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
if (ConstMask->isAllOnesValue())
if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) {
auto *VecTy = cast<VectorType>(II.getType());
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ const Align Alignment = II.getParamAlign(0).valueOrOne();
LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr,
Alignment, "load.scalar");
Value *Shuf =
@@ -384,7 +382,7 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
// * Narrow store width by halfs excluding zero/undef lanes
// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
if (!ConstMask)
return nullptr;
@@ -397,8 +395,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
// scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr
if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) {
if (maskContainsAllOneOrUndef(ConstMask)) {
- Align Alignment =
- cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ Align Alignment = II.getParamAlign(1).valueOrOne();
StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false,
Alignment);
S->copyMetadata(II);
@@ -408,7 +405,7 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
// scatter(vector, splat(ptr), splat(true)) -> store extract(vector,
// lastlane), ptr
if (ConstMask->isAllOnesValue()) {
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ Align Alignment = II.getParamAlign(1).valueOrOne();
VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType());
ElementCount VF = WideLoadTy->getElementCount();
Value *RunTimeVF = Builder.CreateElementCount(Builder.getInt32Ty(), VF);
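
The hunks above all reflect the same convention change: the masked load/store/gather/scatter intrinsics no longer carry an explicit i32 alignment operand, so the mask and pass-through operands move down by one position and the alignment is read from the align attribute of the pointer parameter instead. A minimal illustrative sketch of consuming the new llvm.masked.load layout (not part of the patch; the helper name is made up):

    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Support/Alignment.h"
    #include <cassert>

    using namespace llvm;

    // New llvm.masked.load form: (ptr, mask, passthru). Alignment comes from
    // the pointer parameter's align attribute rather than a dedicated operand.
    static void inspectMaskedLoad(IntrinsicInst &II) {
      assert(II.getIntrinsicID() == Intrinsic::masked_load);
      Value *Ptr = II.getArgOperand(0);           // pointer
      Value *Mask = II.getArgOperand(1);          // was operand 2
      Value *PassThru = II.getArgOperand(2);      // was operand 3
      Align A = II.getParamAlign(0).valueOrOne(); // was a ConstantInt operand
      (void)Ptr; (void)Mask; (void)PassThru; (void)A;
    }
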
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a8eb9b9..975498f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4501,24 +4501,24 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Value *V = foldSelectIntoAddConstant(SI, Builder))
return replaceInstUsesWith(SI, V);
- // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
+ // select(mask, mload(ptr,mask,0), 0) -> mload(ptr,mask,0)
// Load inst is intentionally not checked for hasOneUse()
if (match(FalseVal, m_Zero()) &&
- (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+ (match(TrueVal, m_MaskedLoad(m_Value(), m_Specific(CondVal),
m_CombineOr(m_Undef(), m_Zero()))) ||
- match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal),
+ match(TrueVal, m_MaskedGather(m_Value(), m_Specific(CondVal),
m_CombineOr(m_Undef(), m_Zero()))))) {
auto *MaskedInst = cast<IntrinsicInst>(TrueVal);
- if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
- MaskedInst->setArgOperand(3, FalseVal /* Zero */);
+ if (isa<UndefValue>(MaskedInst->getArgOperand(2)))
+ MaskedInst->setArgOperand(2, FalseVal /* Zero */);
return replaceInstUsesWith(SI, MaskedInst);
}
Value *Mask;
if (match(TrueVal, m_Zero()) &&
- (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+ (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(Mask),
m_CombineOr(m_Undef(), m_Zero()))) ||
- match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask),
+ match(FalseVal, m_MaskedGather(m_Value(), m_Value(Mask),
m_CombineOr(m_Undef(), m_Zero())))) &&
(CondVal->getType() == Mask->getType())) {
// We can remove the select by ensuring the load zeros all lanes the
@@ -4531,8 +4531,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (CanMergeSelectIntoLoad) {
auto *MaskedInst = cast<IntrinsicInst>(FalseVal);
- if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
- MaskedInst->setArgOperand(3, TrueVal /* Zero */);
+ if (isa<UndefValue>(MaskedInst->getArgOperand(2)))
+ MaskedInst->setArgOperand(2, TrueVal /* Zero */);
return replaceInstUsesWith(SI, MaskedInst);
}
}
@@ -4671,14 +4671,13 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
}
Value *MaskedLoadPtr;
- const APInt *MaskedLoadAlignment;
if (match(TrueVal, m_OneUse(m_MaskedLoad(m_Value(MaskedLoadPtr),
- m_APInt(MaskedLoadAlignment),
m_Specific(CondVal), m_Value()))))
return replaceInstUsesWith(
- SI, Builder.CreateMaskedLoad(TrueVal->getType(), MaskedLoadPtr,
- Align(MaskedLoadAlignment->getZExtValue()),
- CondVal, FalseVal));
+ SI, Builder.CreateMaskedLoad(
+ TrueVal->getType(), MaskedLoadPtr,
+ cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),
+ CondVal, FalseVal));
return nullptr;
}
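
With the alignment operand gone, the m_MaskedLoad/m_MaskedGather matchers used above take three sub-patterns (pointer, mask, pass-through), and the alignment is queried from the call itself when needed. A hedged sketch of the updated matching idiom (illustrative only; the helper name is made up):

    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/IR/PatternMatch.h"
    #include "llvm/Support/Alignment.h"

    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Recognize a masked load whose mask is exactly Cond and whose pass-through
    // is zero or undef, then read its alignment from the parameter attribute.
    static bool matchMaskedLoadOfCond(Value *V, Value *Cond, Align &A) {
      if (!match(V, m_MaskedLoad(m_Value(), m_Specific(Cond),
                                 m_CombineOr(m_Undef(), m_Zero()))))
        return false;
      A = cast<IntrinsicInst>(V)->getParamAlign(0).valueOrOne();
      return true;
    }
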
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a330bb7..651e305 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1892,7 +1892,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
// segfaults which didn't exist in the original program.
APInt DemandedPtrs(APInt::getAllOnes(VWidth)),
DemandedPassThrough(DemandedElts);
- if (auto *CMask = dyn_cast<Constant>(II->getOperand(2))) {
+ if (auto *CMask = dyn_cast<Constant>(II->getOperand(1))) {
for (unsigned i = 0; i < VWidth; i++) {
if (Constant *CElt = CMask->getAggregateElement(i)) {
if (CElt->isNullValue())
@@ -1905,7 +1905,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
if (II->getIntrinsicID() == Intrinsic::masked_gather)
simplifyAndSetOp(II, 0, DemandedPtrs, PoisonElts2);
- simplifyAndSetOp(II, 3, DemandedPassThrough, PoisonElts3);
+ simplifyAndSetOp(II, 2, DemandedPassThrough, PoisonElts3);
// Output elements are undefined if the element from both sources are.
// TODO: can strengthen via mask as well.
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 2646334..cb6ca72 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1494,11 +1494,8 @@ void AddressSanitizer::getInterestingMemoryOperands(
if (ignoreAccess(I, BasePtr))
return;
Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
+ MaybeAlign Alignment = CI->getParamAlign(0);
+ Value *Mask = CI->getOperand(1 + OpOffset);
Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
break;
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
index 3ae771a..3c0f185 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
@@ -338,7 +338,7 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
}
auto *BasePtr = CI->getOperand(0 + OpOffset);
- Access.MaybeMask = CI->getOperand(2 + OpOffset);
+ Access.MaybeMask = CI->getOperand(1 + OpOffset);
Access.Addr = BasePtr;
}
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index eff6f0c..b6cbecb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4191,10 +4191,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleMaskedGather(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Ptrs = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(0).valueOrOne();
+ Value *Mask = I.getArgOperand(1);
+ Value *PassThru = I.getArgOperand(2);
Type *PtrsShadowTy = getShadowTy(Ptrs);
if (ClCheckAccessAddress) {
@@ -4230,9 +4229,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Values = I.getArgOperand(0);
Value *Ptrs = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(1).valueOrOne();
+ Value *Mask = I.getArgOperand(2);
Type *PtrsShadowTy = getShadowTy(Ptrs);
if (ClCheckAccessAddress) {
@@ -4262,9 +4260,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *V = I.getArgOperand(0);
Value *Ptr = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(1).valueOrOne();
+ Value *Mask = I.getArgOperand(2);
Value *Shadow = getShadow(V);
if (ClCheckAccessAddress) {
@@ -4295,10 +4292,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleMaskedLoad(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Ptr = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
+ const Align Alignment = I.getParamAlign(0).valueOrOne();
+ Value *Mask = I.getArgOperand(1);
+ Value *PassThru = I.getArgOperand(2);
if (ClCheckAccessAddress) {
insertCheckShadowOf(Ptr, &I);
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 6141b6d..4ac1321 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -272,7 +272,7 @@ static OverwriteResult isMaskedStoreOverwrite(const Instruction *KillingI,
if (KillingII->getIntrinsicID() == Intrinsic::masked_store) {
// Masks.
// TODO: check that KillingII's mask is a superset of the DeadII's mask.
- if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3))
+ if (KillingII->getArgOperand(2) != DeadII->getArgOperand(2))
return OW_Unknown;
} else if (KillingII->getIntrinsicID() == Intrinsic::vp_store) {
// Masks.
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 2afa7b7..e30f306 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1017,14 +1017,14 @@ private:
};
auto MaskOp = [](const IntrinsicInst *II) {
if (II->getIntrinsicID() == Intrinsic::masked_load)
- return II->getOperand(2);
+ return II->getOperand(1);
if (II->getIntrinsicID() == Intrinsic::masked_store)
- return II->getOperand(3);
+ return II->getOperand(2);
llvm_unreachable("Unexpected IntrinsicInst");
};
auto ThruOp = [](const IntrinsicInst *II) {
if (II->getIntrinsicID() == Intrinsic::masked_load)
- return II->getOperand(3);
+ return II->getOperand(2);
llvm_unreachable("Unexpected IntrinsicInst");
};
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 42db424..72e1131 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2212,11 +2212,11 @@ bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
if (!DepInst || !Dep.isLocal() || !Dep.isDef())
return false;
- Value *Mask = I->getOperand(2);
- Value *Passthrough = I->getOperand(3);
+ Value *Mask = I->getOperand(1);
+ Value *Passthrough = I->getOperand(2);
Value *StoreVal;
- if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
- m_Specific(Mask))) ||
+ if (!match(DepInst,
+ m_MaskedStore(m_Value(StoreVal), m_Value(), m_Specific(Mask))) ||
StoreVal->getType() != I->getType())
return false;
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index b9534def..a06f832 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -430,6 +430,7 @@ public:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 995b803..39751c0 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -45,25 +45,20 @@ static bool tryToImproveAlign(
switch (II->getIntrinsicID()) {
case Intrinsic::masked_load:
case Intrinsic::masked_store: {
- int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2;
- Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load
- ? II->getArgOperand(0)
- : II->getArgOperand(1);
+ unsigned PtrOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
+ Value *PtrOp = II->getArgOperand(PtrOpIdx);
Type *Type = II->getIntrinsicID() == Intrinsic::masked_load
? II->getType()
: II->getArgOperand(0)->getType();
- Align OldAlign =
- cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue();
+ Align OldAlign = II->getParamAlign(PtrOpIdx).valueOrOne();
Align PrefAlign = DL.getPrefTypeAlign(Type);
Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
- if (NewAlign <= OldAlign ||
- NewAlign.value() > std::numeric_limits<uint32_t>().max())
+ if (NewAlign <= OldAlign)
return false;
- Value *V =
- ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value());
- II->setOperand(AlignOpIdx, V);
+ II->addParamAttr(PtrOpIdx,
+ Attribute::getWithAlignment(II->getContext(), NewAlign));
return true;
}
default:
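
Because the alignment now lives in a parameter attribute rather than an i32 operand, improving it becomes an attribute update on the pointer argument, and the old guard against values that do not fit into the 32-bit operand is no longer needed. A minimal sketch of the update step (illustrative only; mirrors the hunk above):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Support/Alignment.h"

    using namespace llvm;

    // Record a proven-larger alignment as the align attribute of the pointer
    // parameter of a masked load/store intrinsic.
    static bool upgradeParamAlign(IntrinsicInst *II, unsigned PtrOpIdx,
                                  Align NewAlign) {
      Align OldAlign = II->getParamAlign(PtrOpIdx).valueOrOne();
      if (NewAlign <= OldAlign)
        return false;
      II->addParamAttr(PtrOpIdx,
                       Attribute::getWithAlignment(II->getContext(), NewAlign));
      return true;
    }
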
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 28ae4f0..9aaf6a5 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -43,6 +43,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>
#include <utility>
@@ -1872,6 +1873,51 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
}
+/// This deals with a corner case when an LCSSA phi node appears in a non-exit
+/// block: the outer loop latch block does not need to be an exit block of the
+/// inner loop. Consider a loop that was in LCSSA form, but then some
+/// transformation like loop-unswitch comes along and creates an empty block,
+/// where BB5 in this example is the outer loop latch block:
+///
+/// BB4:
+/// br label %BB5
+/// BB5:
+/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
+/// br outer.header
+///
+/// Interchange then brings it into LCSSA form again, resulting in this chain of
+/// single-input phi nodes:
+///
+/// BB4:
+/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
+/// br label %BB5
+/// BB5:
+/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
+///
+/// The problem is that interchange can reorder blocks BB4 and BB5, placing the
+/// use before the def if we don't check this. The solution is to simplify
+/// (i.e. remove) such LCSSA phi nodes if they appear in non-exit blocks.
+///
+static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
+ if (OuterLoopLatch == InnerLoopExit)
+ return;
+
+ // Collect and remove phis in non-exit blocks if they have 1 input.
+ SmallVector<PHINode *, 8> Phis(
+ llvm::make_pointer_range(OuterLoopLatch->phis()));
+ for (PHINode *Phi : Phis) {
+ assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
+ LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
+ << "\n");
+ Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
+ Phi->eraseFromParent();
+ }
+}
+
bool LoopInterchangeTransform::adjustLoopBranches() {
LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
std::vector<DominatorTree::UpdateType> DTUpdates;
@@ -1882,6 +1928,9 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
InnerLoopPreHeader && "Guaranteed by loop-simplify form");
+
+ simplifyLCSSAPhis(OuterLoop, InnerLoop);
+
// Ensure that both preheaders do not contain PHI nodes and have single
// predecessors. This allows us to move them easily. We use
// InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 42d6680..146e7d1 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -111,7 +111,7 @@ static unsigned adjustForEndian(const DataLayout &DL, unsigned VectorWidth,
}
// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr,
// <16 x i1> %mask, <16 x i32> %passthru)
// to a chain of basic blocks, with loading element one-by-one if
// the appropriate mask bit is set
@@ -146,11 +146,10 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence,
CallInst *CI, DomTreeUpdater *DTU,
bool &ModifiedDT) {
Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(1);
+ Value *Src0 = CI->getArgOperand(2);
- const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ const Align AlignVal = CI->getParamAlign(0).valueOrOne();
VectorType *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -290,7 +289,7 @@ static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence,
}
// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr,
// <16 x i1> %mask)
// to a chain of basic blocks, that stores element one-by-one if
// the appropriate mask bit is set
@@ -320,10 +319,9 @@ static void scalarizeMaskedStore(const DataLayout &DL, bool HasBranchDivergence,
bool &ModifiedDT) {
Value *Src = CI->getArgOperand(0);
Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(2);
- const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ const Align AlignVal = CI->getParamAlign(1).valueOrOne();
auto *VecType = cast<VectorType>(Src->getType());
Type *EltTy = VecType->getElementType();
@@ -472,9 +470,8 @@ static void scalarizeMaskedGather(const DataLayout &DL,
bool HasBranchDivergence, CallInst *CI,
DomTreeUpdater *DTU, bool &ModifiedDT) {
Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(1);
+ Value *Src0 = CI->getArgOperand(2);
auto *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -483,7 +480,7 @@ static void scalarizeMaskedGather(const DataLayout &DL,
Instruction *InsertPt = CI;
BasicBlock *IfBlock = CI->getParent();
Builder.SetInsertPoint(InsertPt);
- MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ Align AlignVal = CI->getParamAlign(0).valueOrOne();
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
@@ -608,8 +605,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL,
DomTreeUpdater *DTU, bool &ModifiedDT) {
Value *Src = CI->getArgOperand(0);
Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
+ Value *Mask = CI->getArgOperand(2);
auto *SrcFVTy = cast<FixedVectorType>(Src->getType());
@@ -623,7 +619,7 @@ static void scalarizeMaskedScatter(const DataLayout &DL,
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
- MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ Align AlignVal = CI->getParamAlign(1).valueOrOne();
unsigned VectorWidth = SrcFVTy->getNumElements();
// Shorten the way if the mask is a vector of constants.
@@ -1125,8 +1121,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
case Intrinsic::masked_load:
// Scalarize unsupported vector masked load
if (TTI.isLegalMaskedLoad(
- CI->getType(),
- cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(),
+ CI->getType(), CI->getParamAlign(0).valueOrOne(),
cast<PointerType>(CI->getArgOperand(0)->getType())
->getAddressSpace()))
return false;
@@ -1135,18 +1130,15 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
case Intrinsic::masked_store:
if (TTI.isLegalMaskedStore(
CI->getArgOperand(0)->getType(),
- cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(),
+ CI->getParamAlign(1).valueOrOne(),
cast<PointerType>(CI->getArgOperand(1)->getType())
->getAddressSpace()))
return false;
scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT);
return true;
case Intrinsic::masked_gather: {
- MaybeAlign MA =
- cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue();
+ Align Alignment = CI->getParamAlign(0).valueOrOne();
Type *LoadTy = CI->getType();
- Align Alignment = DL.getValueOrABITypeAlignment(MA,
- LoadTy->getScalarType());
if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
!TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
return false;
@@ -1154,11 +1146,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
return true;
}
case Intrinsic::masked_scatter: {
- MaybeAlign MA =
- cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue();
+ Align Alignment = CI->getParamAlign(1).valueOrOne();
Type *StoreTy = CI->getArgOperand(0)->getType();
- Align Alignment = DL.getValueOrABITypeAlignment(MA,
- StoreTy->getScalarType());
if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
!TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
Alignment))
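
The legality checks in optimizeCallInst likewise take the alignment from the pointer parameter attribute. A hedged sketch of the masked-load query under the new layout (illustrative only; the helper name is made up):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // Decide whether a masked load is legal for the target as-is or should be
    // scalarized, using the attribute-based alignment.
    static bool isMaskedLoadLegal(const TargetTransformInfo &TTI, CallInst *CI) {
      Align Alignment = CI->getParamAlign(0).valueOrOne();
      unsigned AS =
          cast<PointerType>(CI->getArgOperand(0)->getType())->getAddressSpace();
      return TTI.isLegalMaskedLoad(CI->getType(), Alignment, AS);
    }
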
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index fa66a03..23e1243 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -227,6 +227,7 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I,
case Instruction::Call:
case Instruction::BitCast:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::AddrSpaceCast:
case Instruction::FPToUI:
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b80c3c9..4947d03 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -760,6 +761,7 @@ private:
void handleCallArguments(CallBase &CB);
void handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO, unsigned Idx);
+ bool isInstFullyOverDefined(Instruction &Inst);
private:
friend class InstVisitor<SCCPInstVisitor>;
@@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
void SCCPInstVisitor::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
- unsigned NumActiveIncoming = 0;
+ if (isInstFullyOverDefined(PN))
+ return;
+ SmallVector<unsigned> FeasibleIncomingIndices;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+ FeasibleIncomingIndices.push_back(i);
+ }
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is a constant
// range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
+ if (StructType *STy = dyn_cast<StructType>(PN.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement PhiState = getStructValueState(&PN, i);
+ if (PhiState.isOverdefined())
+ continue;
+ for (unsigned j : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV =
+ getStructValueState(PN.getIncomingValue(j), i);
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i);
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
+ }
+ } else {
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
}
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- ValueLatticeElement &PhiStateRef = ValueState[&PN];
- mergeInValue(PhiStateRef, &PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
void SCCPInstVisitor::visitReturnInst(ReturnInst &I) {
@@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
}
}
+bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) {
+ // For structure Type, we handle each member separately.
+ // A structure object won't be considered as overdefined when
+ // there is at least one member that is not overdefined.
+ if (StructType *STy = dyn_cast<StructType>(Inst.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) {
+ if (!getStructValueState(&Inst, i).isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+ return getValueState(&Inst).isOverdefined();
+}
+
void SCCPInstVisitor::solve() {
// Process the work lists until they are empty!
while (!BBWorkList.empty() || !InstWorkList.empty()) {
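
The net effect of the visitPHINode rewrite above: struct-typed PHIs are no longer pessimistically marked overdefined up front. Each struct element keeps its own lattice state and merges the element-wise states of all feasible incoming values, so one overdefined member no longer drags the whole PHI to overdefined. A condensed, illustrative sketch of that per-element merge (uses the same member functions as the hunk above; the widening bookkeeping is elided):

    // Per-element merge for a struct-typed PHI.
    if (auto *STy = dyn_cast<StructType>(PN.getType())) {
      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
        ValueLatticeElement ElemState = getStructValueState(&PN, i);
        for (unsigned j : FeasibleIncomingIndices) {
          ElemState.mergeIn(getStructValueState(PN.getIncomingValue(j), i));
          if (ElemState.isOverdefined())
            break;
        }
        // mergeInValue(...) then commits ElemState with the usual widening cap.
      }
    }
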
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 7651ba1..3fed003 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -325,6 +325,8 @@ public:
VPIRFlags Flags;
if (Opcode == Instruction::Trunc)
Flags = VPIRFlags::TruncFlagsTy(false, false);
+ else if (Opcode == Instruction::ZExt)
+ Flags = VPIRFlags::NonNegFlagsTy(false);
return tryInsertInstruction(
new VPWidenCastRecipe(Opcode, Op, ResultTy, Flags));
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 280eb20..febdc54 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
- VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
+ VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks,
+ BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
bool HasBranchWeights =
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9cd52da..3f18bd7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5343,7 +5343,7 @@ private:
unsigned &OpCnt =
OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
EdgeInfo EI(TE, U.getOperandNo());
- if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
+ if (!getScheduleCopyableData(EI, Op))
continue;
// Found copyable operand - continue.
++OpCnt;
@@ -10546,8 +10546,11 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons/copyable operations are still valid
// for vectorization (div/rem are not allowed).
- if (!S.areInstructionsWithCopyableElements() &&
- !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if ((!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
+ (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
+ (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
+ isa<CallInst>(S.getMainOp())))) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..fed04eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -1003,6 +1007,11 @@ public:
/// Creates a fixed-width vector containing all operands. The number of
/// operands matches the vector element count.
BuildVector,
+ /// Extracts all lanes from its (non-scalable) vector operand. This is an
+ /// abstract VPInstruction whose single defined VPValue represents VF
+ /// scalars extracted from a vector, to be replaced by VF ExtractElement
+ /// VPInstructions.
+ Unpack,
/// Compute the final result of a AnyOf reduction with select(cmp(),x,y),
/// where one of (x,y) is loop invariant, and both x and y are integer type.
ComputeAnyOfResult,
@@ -2711,6 +2720,15 @@ public:
return R && classof(R);
}
+ static inline bool classof(const VPValue *VPV) {
+ const VPRecipeBase *R = VPV->getDefiningRecipe();
+ return R && classof(R);
+ }
+
+ static inline bool classof(const VPSingleDefRecipe *R) {
+ return classof(static_cast<const VPRecipeBase *>(R));
+ }
+
/// Generate the reduction in the loop.
void execute(VPTransformState &State) override;
@@ -3096,6 +3114,9 @@ public:
/// Returns true if this expression contains recipes that may have side
/// effects.
bool mayHaveSideEffects() const;
+
+ /// Returns true if the result of this VPExpressionRecipe is a single-scalar.
+ bool isSingleScalar() const;
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
@@ -4075,6 +4096,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions take
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..80a2e4b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::AnyOf:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
+ case VPInstruction::Unpack:
return SetResultTyFromOp();
case VPInstruction::ExtractLane:
return inferScalarType(R->getOperand(1));
@@ -377,7 +378,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d8203e2..b5b98c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}
+template <typename Op0_t, typename Op1_t>
+inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t>
+m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
m_ExtractLastLanePerPart(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..1f1b42b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
+ case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::VScale:
+ case VPInstruction::Unpack:
return false;
default:
return true;
@@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::WidePtrAdd:
- return Op == getOperand(0);
+ // WidePtrAdd supports scalar and vector base addresses.
+ return false;
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
@@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
+ case VPInstruction::Unpack:
+ O << "unpack";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -2352,7 +2358,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const {
return false;
}
+bool VPExpressionRecipe::isSingleScalar() const {
+ // Cannot use vputils::isSingleScalar(), because all external operands
+ // of the expression will be live-ins while bundled.
+ return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
+ !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
@@ -3076,7 +3089,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3149,7 +3162,17 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
- if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur))
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
+ // Skip blends that use V only through a compare by checking if any incoming
+ // value was already visited.
+ if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
+ [&](unsigned I) {
+ return Seen.contains(
+ Blend->getIncomingValue(I)->getDefiningRecipe());
+ }))
continue;
for (VPUser *U : Cur->users()) {
@@ -3170,7 +3193,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
}
}
- append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ // The legacy cost model only supports scalarization loads/stores with phi
+ // addresses, if the phi is directly used as load/store address. Don't
+ // traverse further for Blends.
+ if (Blend)
+ continue;
+
+ append_range(WorkList, Cur->users());
}
return false;
}
@@ -3268,7 +3297,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3313,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..e060e70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -106,7 +106,7 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
return false;
NewRecipe = new VPWidenIntrinsicRecipe(
*CI, getVectorIntrinsicIDForCall(CI, &TLI),
- {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
+ drop_end(Ingredient.operands()), CI->getType(),
CI->getDebugLoc());
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
@@ -356,8 +356,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
// Replace predicated replicate recipe with a replicate recipe without a
// mask but in the replicate region.
auto *RecipeWithoutMask = new VPReplicateRecipe(
- PredRecipe->getUnderlyingInstr(),
- make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
+ PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
auto *Pred =
Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
@@ -939,7 +938,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
continue;
if (!isDeadRecipe(*R))
continue;
- WorkList.append(R->op_begin(), R->op_end());
+ append_range(WorkList, R->operands());
R->eraseFromParent();
}
}
@@ -1224,6 +1223,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ uint64_t Idx;
+ if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+ auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+ Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
+ return;
+ }
+
if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -1858,8 +1864,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2006,7 +2012,7 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
return std::make_pair(true, I->getVectorIntrinsicID());
})
- .Case<VPVectorPointerRecipe>([](auto *I) {
+ .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
// For recipes that do not directly map to LLVM IR instructions,
// assign opcodes after the last VPInstruction opcode (which is also
// after the last IR Instruction opcode), based on the VPDefID.
@@ -2083,6 +2089,15 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
LFlags->getPredicate() !=
cast<VPRecipeWithIRFlags>(R)->getPredicate())
return false;
+ // Recipes in replicate regions implicitly depend on predicate. If either
+ // recipe is in a replicate region, only consider them equal if both have
+ // the same parent.
+ const VPRegionBlock *RegionL = L->getParent()->getParent();
+ const VPRegionBlock *RegionR = R->getParent()->getParent();
+ if (((RegionL && RegionL->isReplicator()) ||
+ (RegionR && RegionR->isReplicator())) &&
+ L->getParent() != R->getParent())
+ return false;
const VPlan *Plan = L->getParent()->getPlan();
VPTypeAnalysis TypeInfo(*Plan);
return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
@@ -2898,7 +2913,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3780,7 +3795,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
-void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;
@@ -3803,8 +3818,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
@@ -3829,6 +3843,50 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
});
}
}
+
+ // Create explicit VPInstructions to convert vectors to scalars. The current
+ // implementation is conservative - it may miss some cases that may or may not
+ // be vector values. TODO: introduce Unpacks speculatively - remove them later
+ // if they are known to operate on scalar values.
+ for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
+ VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R))
+ continue;
+ for (VPValue *Def : R.definedValues()) {
+ // Skip recipes that are single-scalar or only have their first lane
+ // used.
+ // TODO: The Defs skipped here may or may not be vector values.
+ // Introduce Unpacks, and remove them later, if they are guaranteed to
+ // produce scalar values.
+ if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
+ continue;
+
+ // At the moment, we create unpacks only for scalar users outside
+ // replicate regions. Recipes inside replicate regions still extract the
+ // required lanes implicitly.
+ // TODO: Remove once replicate regions are unrolled completely.
+ auto IsCandidateUnpackUser = [Def](VPUser *U) {
+ VPRegionBlock *ParentRegion =
+ cast<VPRecipeBase>(U)->getParent()->getParent();
+ return U->usesScalars(Def) &&
+ (!ParentRegion || !ParentRegion->isReplicator());
+ };
+ if (none_of(Def->users(), IsCandidateUnpackUser))
+ continue;
+
+ auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
+ if (R.isPhi())
+ Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
+ else
+ Unpack->insertAfter(&R);
+ Def->replaceUsesWithIf(Unpack,
+ [&IsCandidateUnpackUser](VPUser &U, unsigned) {
+ return IsCandidateUnpackUser(&U);
+ });
+ }
+ }
+ }
}
void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bb..b28559b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -325,9 +325,10 @@ struct VPlanTransforms {
static void materializeBackedgeTakenCount(VPlan &Plan,
VPBasicBlock *VectorPH);
- /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
- /// values into single vectors.
- static void materializeBuildVectors(VPlan &Plan);
+ /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
+ /// into vectors and Unpack recipes to extract scalars from vectors as
+ /// needed.
+ static void materializePacksAndUnpacks(VPlan &Plan);
/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e..cfd1a74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \DefR.
-static VPRecipeWithIRFlags *
+static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
VPRecipeWithIRFlags *DefR, VPLane Lane,
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
+ VPValue *Op;
+ if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
+ auto LaneDefs = Def2LaneDefs.find(Op);
+ if (LaneDefs != Def2LaneDefs.end())
+ return LaneDefs->second[Lane.getKnownLane()];
+
+ VPValue *Idx =
+ Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+ }
+
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : DefR->operands()) {
@@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ // Look through mandatory Unpack.
+ [[maybe_unused]] bool Matched =
+ match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
+ assert(Matched && "original op must have been Unpack");
NewOps.push_back(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
@@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
- !cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
+ !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
+ cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 8b1b0e5..10801c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -113,12 +113,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return TypeSwitch<const VPRecipeBase *, bool>(R)
.Case<VPDerivedIVRecipe>([](const auto *R) { return true; })
.Case<VPReplicateRecipe>([](const auto *R) {
- // Loads and stores that are uniform across VF lanes are handled by
- // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if
- // all their operands are invariant.
- // TODO: Further relax the restrictions.
+ // Be conservative about side-effects, except for the
+ // known-side-effecting assumes and stores, which we know will be
+ // uniform.
return R->isSingleScalar() &&
- (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
+ (!R->mayHaveSideEffects() ||
+ isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) &&
all_of(R->operands(), isUniformAcrossVFsAndUFs);
})
.Case<VPInstruction>([](const auto *VPI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..840a5b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
(PreservesUniformity(VPI->getOpcode()) &&
all_of(VPI->operands(), isSingleScalar));
+ if (isa<VPPartialReductionRecipe>(VPV))
+ return false;
+ if (isa<VPReductionRecipe>(VPV))
+ return true;
+ if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
+ return Expr->isSingleScalar();
// VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
return isa<VPExpandSCEVRecipe>(VPV);