Diffstat (limited to 'llvm/lib/Transforms')
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp            |   2
 llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp            |  42
 llvm/lib/Transforms/InstCombine/InstCombineInternal.h           |   2
 llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp |   2
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                 |   7
 llvm/lib/Transforms/Vectorize/VPlan.h                           |  30
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  |  22
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp               | 133
 8 files changed, 158 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (isPowerOf2_64(AlignMask + 1)) {
       uint64_t Offset = 0;
       match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
-      if (match(A, m_PtrToInt(m_Value(A)))) {
+      if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
         /// Note: this doesn't preserve the offset information but merges
         /// offset and alignment.
         /// TODO: we can generate a GEP instead of merging the alignment with
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9b9fe26..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1525,7 +1525,15 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
   }
 
   // Try to extend the entire expression tree to the wide destination type.
-  if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+  bool ShouldExtendExpression = true;
+  Value *TruncSrc = nullptr;
+  // It is not desirable to extend expression in the trunc + sext pattern when
+  // destination type is narrower than original (pre-trunc) type.
+  if (match(Src, m_Trunc(m_Value(TruncSrc))))
+    if (TruncSrc->getType()->getScalarSizeInBits() > DestBitSize)
+      ShouldExtendExpression = false;
+  if (ShouldExtendExpression && shouldChangeType(SrcTy, DestTy) &&
+      canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this! Insert the new expression now.
     LLVM_DEBUG(
         dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1545,13 +1553,18 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
                                     ShAmt);
   }
 
-  Value *X;
-  if (match(Src, m_Trunc(m_Value(X)))) {
+  Value *X = TruncSrc;
+  if (X) {
     // If the input has more sign bits than bits truncated, then convert
     // directly to final type.
     unsigned XBitSize = X->getType()->getScalarSizeInBits();
-    if (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)
-      return CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+    bool HasNSW = cast<TruncInst>(Src)->hasNoSignedWrap();
+    if (HasNSW || (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)) {
+      auto *Res = CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+      if (auto *ResTrunc = dyn_cast<TruncInst>(Res); ResTrunc && HasNSW)
+        ResTrunc->setHasNoSignedWrap(true);
+      return Res;
+    }
 
     // If input is a trunc from the destination type, then convert into shifts.
     if (Src->hasOneUse() && X->getType() == DestTy) {
@@ -2135,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
   return nullptr;
 }
 
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
   // Look through chain of one-use GEPs.
   Type *PtrTy = Ptr->getType();
   SmallVector<GEPOperator *> GEPs;
@@ -2197,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
 
-  if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
     return replaceInstUsesWith(CI, V);
 
   Value *Vec, *Scalar, *Index;
@@ -2215,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
 }
 
 Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+  Value *SrcOp = CI.getPointerOperand();
+  Type *Ty = CI.getType();
+
+  // (ptrtoaddr (ptrmask P, M))
+  //    -> (and (ptrtoaddr P), M)
+  // This is generally beneficial as `and` is better supported than `ptrmask`.
+  Value *Ptr, *Mask;
+  if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+                                                            m_Value(Mask)))) &&
+      Mask->getType() == Ty)
+    return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+  if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+    return replaceInstUsesWith(CI, V);
+
   // FIXME: Implement variants of ptrtoint folds.
   return commonCastTransforms(CI);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
   /// folded operation.
   void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
 
-  Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+  Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
   Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
                            Instruction &I);
   Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 651e305..550dfc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -105,6 +105,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
   if (~KnownShrBits.Zero != ShlAmt)
     return nullptr;
 
+  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+  IC.Builder.SetInsertPoint(I);
   Value *ShrAmtZ =
       IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
                               ShrAmt->getName() + ".z");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index facb0fa..f7968ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7543,12 +7543,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());
+                                 Load->getAlign(), VPIRMetadata(*Load, LVer),
+                                 I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, VPIRMetadata(*Store, LVer),
-                                I->getDebugLoc());
+                                Reverse, Store->getAlign(),
+                                VPIRMetadata(*Store, LVer), I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5b9f005..1f10058 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3179,6 +3179,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 protected:
   Instruction &Ingredient;
 
+  /// Alignment information for this memory access.
+  Align Alignment;
+
   /// Whether the accessed addresses are consecutive.
   bool Consecutive;
 
@@ -3198,10 +3201,10 @@ protected:
 
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse,
+                      bool Consecutive, bool Reverse, Align Alignment,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Consecutive(Consecutive), Reverse(Reverse) {
+        Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
   }
 
@@ -3242,6 +3245,9 @@ public:
     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
   }
 
+  /// Returns the alignment of the memory access.
+  Align getAlign() const { return Alignment; }
+
   /// Generate the wide load/store.
   void execute(VPTransformState &State) override {
     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
@@ -3259,18 +3265,18 @@ public:
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse,
+                    bool Consecutive, bool Reverse, Align Alignment,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Metadata, DL),
+                            Reverse, Alignment, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, *this,
-                                 getDebugLoc());
+                                 getMask(), Consecutive, Reverse, getAlign(),
+                                 *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3301,8 +3307,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
-                            L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+                            L.getAlign(), L, L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3340,16 +3346,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     const VPIRMetadata &Metadata, DebugLoc DL)
+                     Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Metadata, DL) {
+                            Consecutive, Reverse, Alignment, Metadata, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, *this, getDebugLoc());
+                                  Reverse, getAlign(), *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3384,7 +3390,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S, S.getDebugLoc()) {
+                            S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
     setMask(Mask);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 931a5b7..9a63c80 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -70,6 +70,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPCanonicalIVPHISC:
   case VPBranchOnMaskSC:
+  case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPReductionPHISC:
   case VPScalarIVStepsSC:
@@ -86,6 +87,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:
+  case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
@@ -119,6 +121,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
   case VPBranchOnMaskSC:
+  case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
@@ -134,6 +137,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
+  case VPWidenPointerInductionSC:
   case VPWidenSC:
   case VPWidenSelectSC: {
     const Instruction *I =
@@ -3358,7 +3362,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
     Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
     const Align Alignment = getLoadStoreAlignment(UI);
-    unsigned AS = getLoadStoreAddressSpace(UI);
+    unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
     InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
         UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
@@ -3525,7 +3529,6 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
   unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
@@ -3575,7 +3578,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   bool CreateGather = !isConsecutive();
 
   auto &Builder = State.Builder;
@@ -3630,7 +3632,6 @@ static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
   bool CreateGather = !isConsecutive();
 
   auto &Builder = State.Builder;
@@ -3674,8 +3675,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
   // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
   // don't need to compare to the legacy cost model.
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
       Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
@@ -3699,7 +3700,6 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
 
@@ -3742,7 +3742,6 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
 
   auto &Builder = State.Builder;
 
@@ -3785,8 +3784,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
   // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
   // don't need to compare to the legacy cost model.
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
       Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
@@ -4252,7 +4251,8 @@ InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
       getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                 : getStoredValues()[InsertPosIdx]);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  unsigned AS = getLoadStoreAddressSpace(InsertPos);
+  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
+                    ->getAddressSpace();
   unsigned InterleaveFactor = IG->getFactor();
 
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84817d7..d9ac26bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
       if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
         NewRecipe = new VPWidenLoadRecipe(
             *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-            false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
-            Ingredient.getDebugLoc());
+            false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
+            VPIRMetadata(*Load), Ingredient.getDebugLoc());
       } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
         NewRecipe = new VPWidenStoreRecipe(
             *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
             nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-            VPIRMetadata(*Store), Ingredient.getDebugLoc());
+            Store->getAlign(), VPIRMetadata(*Store),
+            Ingredient.getDebugLoc());
       } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
         NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
       } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -130,6 +131,24 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
   return true;
 }
 
+/// Return true if we do not know how to (mechanically) hoist or sink \p R out
+/// of a loop region.
+static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
+  // Assumes don't alias anything or throw; as long as they're guaranteed to
+  // execute, they're safe to hoist.
+  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+    return false;
+
+  // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
+  // memory location is not modified in the vector loop.
+  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+    return true;
+
+  // Allocas cannot be hoisted.
+  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+  return RepR && RepR->getOpcode() == Instruction::Alloca;
+}
+
 static bool sinkScalarOperands(VPlan &Plan) {
   auto Iter = vp_depth_first_deep(Plan.getEntry());
   bool Changed = false;
@@ -1825,7 +1844,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
         VPDT.properlyDominates(Previous, SinkCandidate))
       return true;
 
-    if (SinkCandidate->mayHaveSideEffects())
+    if (cannotHoistOrSinkRecipe(*SinkCandidate))
       return false;
 
     WorkList.push_back(SinkCandidate);
@@ -1865,7 +1884,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
 static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                         VPRecipeBase *Previous,
                                         VPDominatorTree &VPDT) {
-  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
+  if (cannotHoistOrSinkRecipe(*Previous))
     return false;
 
   // Collect recipes that need hoisting.
@@ -1912,11 +1931,6 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
       return nullptr;
     return HoistCandidate;
   };
-  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
-    // Avoid hoisting candidates with side-effects, as we do not yet analyze
-    // associated dependencies.
-    return !HoistCandidate->mayHaveSideEffects();
-  };
 
   if (!NeedsHoisting(Previous->getVPSingleValue()))
     return true;
@@ -1928,7 +1942,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
     VPRecipeBase *Current = HoistCandidates[I];
     assert(Current->getNumDefinedValues() == 1 &&
            "only recipes with a single defined value expected");
-    if (!CanHoist(Current))
+    if (cannotHoistOrSinkRecipe(*Current))
      return false;
 
    for (VPValue *Op : Current->operands()) {
@@ -2143,24 +2157,6 @@ void VPlanTransforms::cse(VPlan &Plan) {
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
 
-  // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region.
-  auto CannotHoistRecipe = [](VPRecipeBase &R) {
-    // Assumes don't alias anything or throw; as long as they're guaranteed to
-    // execute, they're safe to hoist.
-    if (match(&R, m_Intrinsic<Intrinsic::assume>()))
-      return false;
-
-    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-    // their memory location is not modified in the vector loop.
-    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
-      return true;
-
-    // Allocas cannot be hoisted.
-    auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
-    return RepR && RepR->getOpcode() == Instruction::Alloca;
-  };
-
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Preform a shallow traversal of the vector loop region, to
   // exclude recipes in replicate regions. Since the top-level blocks in the
@@ -2172,7 +2168,7 @@ static void licm(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (CannotHoistRecipe(R))
+      if (cannotHoistOrSinkRecipe(R))
        continue;
       if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
@@ -3652,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Sub = VecOp->getDefiningRecipe();
     VecOp = Tmp;
   }
+
+  // If ValB is a constant and can be safely extended, truncate it to the same
+  // type as ExtA's operand, then extend it to the same type as ExtA. This
+  // creates two uniform extends that can more easily be matched by the rest of
+  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
+  // replaced with the new extend of the constant.
+  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+                                           VPWidenCastRecipe *&ExtB,
+                                           VPValue *&ValB, VPWidenRecipe *Mul) {
+    if (!ExtA || ExtB || !ValB->isLiveIn())
+      return;
+    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+    Instruction::CastOps ExtOpc = ExtA->getOpcode();
+    const APInt *Const;
+    if (!match(ValB, m_APInt(Const)) ||
+        !llvm::canConstantBeExtended(
+            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+      return;
+    // The truncate ensures that the type of each extended operand is the
+    // same, and it's been proven that the constant can be extended from
+    // NarrowTy safely. Necessary since ExtA's extended operand would be
+    // e.g. an i8, while the const will likely be an i32. This will be
+    // elided by later optimisations.
+    VPBuilder Builder(Mul);
+    auto *Trunc =
+        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+    Mul->setOperand(1, ExtB);
+  };
+
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3660,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
 
+    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
     // Match reduce.add/sub(mul(ext, ext)).
     if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3669,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                        cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
-    // Match reduce.add(mul).
     // TODO: Add an expression type for this variant with a negated mul
     if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
       return new VPExpressionRecipe(Mul, Red);
@@ -3678,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // variants.
   if (Sub)
     return nullptr;
-  // Match reduce.add(ext(mul(ext(A), ext(B)))).
-  // All extend recipes must have same opcode or A == B
-  // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
-  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
-                                      m_ZExtOrSExt(m_VPValue()))))) {
+
+  // Match reduce.add(ext(mul(A, B))).
+  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
     auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
-    auto *Ext0 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
-    auto *Ext1 =
-        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
-    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+    auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+    auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+    // reduce.add(ext(mul(ext, const)))
+    //   -> reduce.add(ext(mul(ext, ext(const))))
+    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+    // reduce.add(ext(mul(ext(A), ext(B))))
+    //   -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+    // The inner extends must either have the same opcode as the outer extend or
+    // be the same, in which case the multiply can never result in a negative
+    // value and the outer extend can be folded away by doing wider
+    // extends for the operands of the mul.
+    if (Ext0 && Ext1 &&
+        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
       auto *NewExt0 = new VPWidenCastRecipe(
@@ -4234,10 +4271,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
       // process one original iteration.
+      auto *LI =
+          cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
       auto *L = new VPWidenLoadRecipe(
-          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-          /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+          *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+          /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
       L->insertBefore(LoadGroup);
       NarrowedOps.insert(L);
       return L;
@@ -4280,10 +4318,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       Res = NarrowOp(Member0);
     }
 
+    auto *SI =
+        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
     auto *S = new VPWidenStoreRecipe(
-        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
-        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
-        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
+        /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
   }
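
Illustrative sketch (editorial, not taken from the patch or its tests): the new visitPtrToAddr fold in InstCombineCasts.cpp above rewrites ptrtoaddr-of-ptrmask into an and of ptrtoaddr, provided the ptrmask has a single use and the mask type matches the ptrtoaddr result type. Assuming a target whose address width for address space 0 is 64 bits, IR like the following is the kind of input that should now simplify:

  define i64 @ptrtoaddr_of_ptrmask(ptr %p, i64 %m) {
    %masked = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 %m)
    %addr = ptrtoaddr ptr %masked to i64
    ret i64 %addr
  }
  declare ptr @llvm.ptrmask.p0.i64(ptr, i64)

  ; expected after InstCombine (roughly):
  ;   %addr = ptrtoaddr ptr %p to i64
  ;   %and  = and i64 %addr, %m
  ;   ret i64 %and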
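
Similarly, a sketch of the strengthened sext-of-trunc handling in visitSExt (hypothetical input): when the trunc carries the nsw flag, the truncation is known not to change the signed value, so the trunc/sext pair can be replaced with one direct cast even when ComputeNumSignBits cannot prove enough sign bits:

  define i64 @sext_of_trunc_nsw(i32 %x) {
    %t = trunc nsw i32 %x to i16
    %s = sext i16 %t to i64
    ret i64 %s
  }

  ; expected after InstCombine (roughly):
  ;   %s = sext i32 %x to i64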
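
Finally, a conceptual sketch of what the new ExtendAndReplaceConstantOp handling in VPlanTransforms.cpp enables. The transform operates on VPlan recipes during vectorization, but the vector-loop computation it matches corresponds to IR of roughly the following shape (hypothetical fragment, VF of 16 chosen arbitrarily), where one multiply operand is a widened i8 value and the other is a live-in constant that fits in i8:

  %a.ext = zext <16 x i8> %a to <16 x i32>
  %mul   = mul <16 x i32> %a.ext, splat (i32 63)
  %sum   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)

Conceptually, the constant operand is rewritten as an extend of its truncation, i.e. zext of splat (i8 63) back to <16 x i32>, so that both multiply operands become matching uniform extends from i8 and the whole reduce.add(mul(ext, ext)) can be bundled into a mul-accumulate / partial-reduction expression.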
