Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp            |   2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp            |  42
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h           |   2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp |   2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                 |   7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                           |  30
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  |  22
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp               | 133
8 files changed, 158 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (isPowerOf2_64(AlignMask + 1)) {
uint64_t Offset = 0;
match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
- if (match(A, m_PtrToInt(m_Value(A)))) {
+ if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
/// Note: this doesn't preserve the offset information but merges
/// offset and alignment.
/// TODO: we can generate a GEP instead of merging the alignment with
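A minimal sketch of what the widened matcher accepts (not part of the patch itself; the helper name is hypothetical): m_PtrToIntOrAddr matches a value produced by either a ptrtoint or a ptrtoaddr cast, so the surrounding fold now fires for both cast opcodes.

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical helper: returns the underlying pointer if V is either
// `ptrtoint P` or `ptrtoaddr P`, mirroring the relaxed match used above.
static Value *underlyingPtrOfIntCast(Value *V) {
  Value *P;
  if (match(V, m_PtrToIntOrAddr(m_Value(P))))
    return P;
  return nullptr;
}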
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9b9fe26..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1525,7 +1525,15 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
}
// Try to extend the entire expression tree to the wide destination type.
- if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+ bool ShouldExtendExpression = true;
+ Value *TruncSrc = nullptr;
+ // It is not desirable to extend the expression in the trunc + sext pattern
+ // when the destination type is narrower than the original (pre-trunc) type.
+ if (match(Src, m_Trunc(m_Value(TruncSrc))))
+ if (TruncSrc->getType()->getScalarSizeInBits() > DestBitSize)
+ ShouldExtendExpression = false;
+ if (ShouldExtendExpression && shouldChangeType(SrcTy, DestTy) &&
+ canEvaluateSExtd(Src, DestTy)) {
// Okay, we can transform this! Insert the new expression now.
LLVM_DEBUG(
dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1545,13 +1553,18 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
ShAmt);
}
- Value *X;
- if (match(Src, m_Trunc(m_Value(X)))) {
+ Value *X = TruncSrc;
+ if (X) {
// If the input has more sign bits than bits truncated, then convert
// directly to final type.
unsigned XBitSize = X->getType()->getScalarSizeInBits();
- if (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)
- return CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+ bool HasNSW = cast<TruncInst>(Src)->hasNoSignedWrap();
+ if (HasNSW || (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)) {
+ auto *Res = CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+ if (auto *ResTrunc = dyn_cast<TruncInst>(Res); ResTrunc && HasNSW)
+ ResTrunc->setHasNoSignedWrap(true);
+ return Res;
+ }
// If input is a trunc from the destination type, then convert into shifts.
if (Src->hasOneUse() && X->getType() == DestTy) {
@@ -2135,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
return nullptr;
}
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
// Look through chain of one-use GEPs.
Type *PtrTy = Ptr->getType();
SmallVector<GEPOperator *> GEPs;
@@ -2197,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
Mask->getType() == Ty)
return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
- if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
return replaceInstUsesWith(CI, V);
Value *Vec, *Scalar, *Index;
@@ -2215,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
}
Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+ Value *SrcOp = CI.getPointerOperand();
+ Type *Ty = CI.getType();
+
+ // (ptrtoaddr (ptrmask P, M))
+ // -> (and (ptrtoaddr P), M)
+ // This is generally beneficial as `and` is better supported than `ptrmask`.
+ Value *Ptr, *Mask;
+ if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+ m_Value(Mask)))) &&
+ Mask->getType() == Ty)
+ return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+ return replaceInstUsesWith(CI, V);
+
// FIXME: Implement variants of ptrtoint folds.
return commonCastTransforms(CI);
}
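A minimal sketch of the new sext-of-trunc-nsw fold above, written against the public IRBuilder API (the helper name is hypothetical, not the InstCombine entry point): when the trunc carries nsw, the value already fits in the narrow type with its sign preserved, so the sign-extend collapses into a single cast to the destination type, and nsw is reapplied when that cast is itself a trunc.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch of the rewrite:
//   %t = trunc nsw i64 %x to i16
//   %s = sext i16 %t to i32   -->   %s = trunc nsw i64 %x to i32
static Value *foldSExtOfNSWTrunc(TruncInst *Trunc, Type *DestTy,
                                 IRBuilder<> &B) {
  if (!Trunc->hasNoSignedWrap())
    return nullptr;
  Value *X = Trunc->getOperand(0);
  Value *Res = B.CreateIntCast(X, DestTy, /*isSigned=*/true);
  // If the collapsed cast is still a trunc, nsw continues to hold.
  if (auto *ResTrunc = dyn_cast<TruncInst>(Res))
    ResTrunc->setHasNoSignedWrap(true);
  return Res;
}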
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
/// folded operation.
void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
- Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+ Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
Instruction &I);
Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 651e305..550dfc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -105,6 +105,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
if (~KnownShrBits.Zero != ShlAmt)
return nullptr;
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(I);
Value *ShrAmtZ =
IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
ShrAmt->getName() + ".z");
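The two added lines above rely on IRBuilder's RAII insert-point guard; a small sketch of the pattern (function name hypothetical):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: build new instructions right before I without
// disturbing the insertion point the caller had established.
static void emitBefore(IRBuilderBase &Builder, Instruction *I) {
  IRBuilderBase::InsertPointGuard Guard(Builder); // saves the insert point
  Builder.SetInsertPoint(I);                      // new code goes before I
  // ... Builder.CreateICmpEQ(...), Builder.CreateSelect(...), etc. ...
}                                                 // insert point restored here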
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index facb0fa..f7968ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7543,12 +7543,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Load->getAlign(), VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
- I->getDebugLoc());
+ Reverse, Store->getAlign(),
+ VPIRMetadata(*Store, LVer), I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b9f005..1f10058 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3179,6 +3179,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
protected:
Instruction &Ingredient;
+ /// Alignment information for this memory access.
+ Align Alignment;
+
/// Whether the accessed addresses are consecutive.
bool Consecutive;
@@ -3198,10 +3201,10 @@ protected:
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, Align Alignment,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Consecutive(Consecutive), Reverse(Reverse) {
+ Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -3242,6 +3245,9 @@ public:
return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
}
+ /// Returns the alignment of the memory access.
+ Align getAlign() const { return Alignment; }
+
/// Generate the wide load/store.
void execute(VPTransformState &State) override {
llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
@@ -3259,18 +3265,18 @@ public:
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, Align Alignment,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Metadata, DL),
+ Reverse, Alignment, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, *this,
- getDebugLoc());
+ getMask(), Consecutive, Reverse, getAlign(),
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3301,8 +3307,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
- L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+ L.getAlign(), L, L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3340,16 +3346,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Metadata, DL) {
+ Consecutive, Reverse, Alignment, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, *this, getDebugLoc());
+ Reverse, getAlign(), *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3384,7 +3390,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S, S.getDebugLoc()) {
+ S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
setMask(Mask);
}
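A simplified sketch of the design change in this header (a stand-in struct, not the real recipe class): the alignment is captured once from the scalar load/store at construction and exposed via getAlign(), so later phases no longer have to re-derive it from the underlying IR instruction.

#include "llvm/Support/Alignment.h"

namespace sketch {
// Stand-in for VPWidenMemoryRecipe: the alignment travels with the recipe.
struct WidenMemoryRecipe {
  llvm::Align Alignment; // set from Load->getAlign() / Store->getAlign()
  explicit WidenMemoryRecipe(llvm::Align A) : Alignment(A) {}
  llvm::Align getAlign() const { return Alignment; }
};
} // namespace sketch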
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 931a5b7..9a63c80 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -70,6 +70,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
case VPCanonicalIVPHISC:
case VPBranchOnMaskSC:
+ case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPReductionPHISC:
case VPScalarIVStepsSC:
@@ -86,6 +87,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
+ case VPWidenPointerInductionSC:
case VPWidenSC:
case VPWidenSelectSC: {
const Instruction *I =
@@ -119,6 +121,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
case VPBranchOnMaskSC:
+ case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
case VPScalarIVStepsSC:
@@ -134,6 +137,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
+ case VPWidenPointerInductionSC:
case VPWidenSC:
case VPWidenSelectSC: {
const Instruction *I =
@@ -3358,7 +3362,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
+ unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
@@ -3525,7 +3529,6 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
@@ -3575,7 +3578,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
@@ -3630,7 +3632,6 @@ static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
bool CreateGather = !isConsecutive();
auto &Builder = State.Builder;
@@ -3674,8 +3675,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
// TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
// longer need to compare to the legacy cost model.
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
- unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+ unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+ ->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
@@ -3699,7 +3700,6 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -3742,7 +3742,6 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -3785,8 +3784,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
// TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
// longer need to compare to the legacy cost model.
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
- unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+ unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+ ->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
@@ -4252,7 +4251,8 @@ InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
: getStoredValues()[InsertPosIdx]);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
- unsigned AS = getLoadStoreAddressSpace(InsertPos);
+ unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+ ->getAddressSpace();
unsigned InterleaveFactor = IG->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
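The repeated change above replaces getLoadStoreAddressSpace(&Ingredient) with the address space of the pointer type inferred for the recipe's address operand; a short sketch of that query (the free-function name is hypothetical):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Hypothetical helper: the address space for TTI cost queries now comes from
// the inferred scalar pointer type rather than from the original instruction.
static unsigned addressSpaceOf(Type *ScalarPtrTy) {
  return cast<PointerType>(ScalarPtrTy)->getAddressSpace();
}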
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84817d7..d9ac26bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
- Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
+ VPIRMetadata(*Load), Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- VPIRMetadata(*Store), Ingredient.getDebugLoc());
+ Store->getAlign(), VPIRMetadata(*Store),
+ Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -130,6 +131,24 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
return true;
}
+/// Return true if we do not know how to (mechanically) hoist or sink \p R out
+/// of a loop region.
+static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
+ // Assumes don't alias anything or throw; as long as they're guaranteed to
+ // execute, they're safe to hoist.
+ if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+ return false;
+
+ // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
+ // memory location is not modified in the vector loop.
+ if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+ return true;
+
+ // Allocas cannot be hoisted.
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ return RepR && RepR->getOpcode() == Instruction::Alloca;
+}
+
static bool sinkScalarOperands(VPlan &Plan) {
auto Iter = vp_depth_first_deep(Plan.getEntry());
bool Changed = false;
@@ -1825,7 +1844,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
VPDT.properlyDominates(Previous, SinkCandidate))
return true;
- if (SinkCandidate->mayHaveSideEffects())
+ if (cannotHoistOrSinkRecipe(*SinkCandidate))
return false;
WorkList.push_back(SinkCandidate);
@@ -1865,7 +1884,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
VPRecipeBase *Previous,
VPDominatorTree &VPDT) {
- if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
+ if (cannotHoistOrSinkRecipe(*Previous))
return false;
// Collect recipes that need hoisting.
@@ -1912,11 +1931,6 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
return HoistCandidate;
};
- auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
- // Avoid hoisting candidates with side-effects, as we do not yet analyze
- // associated dependencies.
- return !HoistCandidate->mayHaveSideEffects();
- };
if (!NeedsHoisting(Previous->getVPSingleValue()))
return true;
@@ -1928,7 +1942,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
VPRecipeBase *Current = HoistCandidates[I];
assert(Current->getNumDefinedValues() == 1 &&
"only recipes with a single defined value expected");
- if (!CanHoist(Current))
+ if (cannotHoistOrSinkRecipe(*Current))
return false;
for (VPValue *Op : Current->operands()) {
@@ -2143,24 +2157,6 @@ void VPlanTransforms::cse(VPlan &Plan) {
static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
- // Return true if we do not know how to (mechanically) hoist a given recipe
- // out of a loop region.
- auto CannotHoistRecipe = [](VPRecipeBase &R) {
- // Assumes don't alias anything or throw; as long as they're guaranteed to
- // execute, they're safe to hoist.
- if (match(&R, m_Intrinsic<Intrinsic::assume>()))
- return false;
-
- // TODO: Relax checks in the future, e.g. we could also hoist reads, if
- // their memory location is not modified in the vector loop.
- if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
- return true;
-
- // Allocas cannot be hoisted.
- auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
- return RepR && RepR->getOpcode() == Instruction::Alloca;
- };
-
// Hoist any loop invariant recipes from the vector loop region to the
// preheader. Perform a shallow traversal of the vector loop region, to
// exclude recipes in replicate regions. Since the top-level blocks in the
@@ -2172,7 +2168,7 @@ static void licm(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(LoopRegion->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (CannotHoistRecipe(R))
+ if (cannotHoistOrSinkRecipe(R))
continue;
if (any_of(R.operands(), [](VPValue *Op) {
return !Op->isDefinedOutsideLoopRegions();
@@ -3652,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Sub = VecOp->getDefiningRecipe();
VecOp = Tmp;
}
+
+ // If ValB is a constant and can be safely extended, truncate it to the same
+ // type as ExtA's operand, then extend it to the same type as ExtA. This
+ // creates two uniform extends that can more easily be matched by the rest of
+ // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+ // replaced with the new extend of the constant.
+ auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+ VPWidenCastRecipe *&ExtB,
+ VPValue *&ValB, VPWidenRecipe *Mul) {
+ if (!ExtA || ExtB || !ValB->isLiveIn())
+ return;
+ Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+ Instruction::CastOps ExtOpc = ExtA->getOpcode();
+ const APInt *Const;
+ if (!match(ValB, m_APInt(Const)) ||
+ !llvm::canConstantBeExtended(
+ Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+ return;
+ // The truncate ensures that the type of each extended operand is the
+ // same, and it's been proven that the constant can be extended from
+ // NarrowTy safely. Necessary since ExtA's extended operand would be
+ // e.g. an i8, while the const will likely be an i32. This will be
+ // elided by later optimisations.
+ VPBuilder Builder(Mul);
+ auto *Trunc =
+ Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+ Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+ ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+ Mul->setOperand(1, ExtB);
+ };
+
// Try to match reduce.add(mul(...)).
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
auto *RecipeA =
@@ -3660,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+ // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+ ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
// Match reduce.add/sub(mul(ext, ext)).
if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3669,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenRecipe>(Sub), Red);
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
}
- // Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
@@ -3678,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// variants.
if (Sub)
return nullptr;
- // Match reduce.add(ext(mul(ext(A), ext(B)))).
- // All extend recipes must have same opcode or A == B
- // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
- if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
- m_ZExtOrSExt(m_VPValue()))))) {
+
+ // Match reduce.add(ext(mul(A, B))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
- auto *Ext0 =
- cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
- auto *Ext1 =
- cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
- if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+ // reduce.add(ext(mul(ext, const)))
+ // -> reduce.add(ext(mul(ext, ext(const))))
+ ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+ // reduce.add(ext(mul(ext(A), ext(B))))
+ // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+ // The inner extends must either have the same opcode as the outer extend,
+ // or be the same recipe, in which case the multiply can never produce a
+ // negative value, so the outer extend can be folded away by extending the
+ // operands of the mul directly to the wider type.
+ if (Ext0 && Ext1 &&
+ (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
@@ -4234,10 +4271,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
// Narrow interleave group to wide load, as transformed VPlan will only
// process one original iteration.
+ auto *LI =
+ cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
auto *L = new VPWidenLoadRecipe(
- *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
- LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+ /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
NarrowedOps.insert(L);
return L;
@@ -4280,10 +4318,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
Res = NarrowOp(Member0);
}
+ auto *SI =
+ cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
- *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
- StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+ *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
+ /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
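To summarise the constant-operand handling added in tryToMatchAndCreateMulAccumulateReduction, a sketch of the soundness check it depends on (comments show the intended IR-level rewrite; the helper below is a hypothetical re-statement of what llvm::canConstantBeExtended verifies, not its actual implementation):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Intended rewrite, expressed on the IR the recipes model:
//   reduce.add(mul(zext i8 %a to i32, i32 C))
//     --> reduce.add(mul(zext i8 %a to i32, zext (trunc i32 C to i8) to i32))
// which is only sound when C survives the round trip through the narrow type
// under the chosen extension kind.
static bool constantFitsNarrowExtend(const APInt &C, unsigned NarrowBits,
                                     Instruction::CastOps ExtOpc) {
  return ExtOpc == Instruction::SExt ? C.isSignedIntN(NarrowBits)
                                     : C.isIntN(NarrowBits);
}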