Diffstat (limited to 'llvm/lib')
39 files changed, 898 insertions, 318 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 805b682..853bd66 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -122,11 +122,61 @@ static cl::opt<unsigned> MIVMaxLevelThreshold( cl::desc("Maximum depth allowed for the recursive algorithm used to " "explore MIV direction vectors.")); -static cl::opt<bool> RunSIVRoutinesOnly( - "da-run-siv-routines-only", cl::init(false), cl::ReallyHidden, - cl::desc("Run only SIV routines and disable others (ZIV, RDIV, and MIV). " - "The purpose is mainly to exclude the influence of those routines " - "in regression tests for SIV routines.")); +namespace { + +/// Types of dependence test routines. +enum class DependenceTestType { + All, + StrongSIV, + WeakCrossingSIV, + ExactSIV, + WeakZeroSIV, + ExactRDIV, + SymbolicRDIV, + GCDMIV, + BanerjeeMIV, +}; + +} // anonymous namespace + +static cl::opt<DependenceTestType> EnableDependenceTest( + "da-enable-dependence-test", cl::init(DependenceTestType::All), + cl::ReallyHidden, + cl::desc("Run only specified dependence test routine and disable others. " + "The purpose is mainly to exclude the influence of other " + "dependence test routines in regression tests. If set to All, all " + "dependence test routines are enabled."), + cl::values(clEnumValN(DependenceTestType::All, "all", + "Enable all dependence test routines."), + clEnumValN(DependenceTestType::StrongSIV, "strong-siv", + "Enable only Strong SIV test."), + clEnumValN(DependenceTestType::WeakCrossingSIV, + "weak-crossing-siv", + "Enable only Weak-Crossing SIV test."), + clEnumValN(DependenceTestType::ExactSIV, "exact-siv", + "Enable only Exact SIV test."), + clEnumValN(DependenceTestType::WeakZeroSIV, "weak-zero-siv", + "Enable only Weak-Zero SIV test."), + clEnumValN(DependenceTestType::ExactRDIV, "exact-rdiv", + "Enable only Exact RDIV test."), + clEnumValN(DependenceTestType::SymbolicRDIV, "symbolic-rdiv", + "Enable only Symbolic RDIV test."), + clEnumValN(DependenceTestType::GCDMIV, "gcd-miv", + "Enable only GCD MIV test."), + clEnumValN(DependenceTestType::BanerjeeMIV, "banerjee-miv", + "Enable only Banerjee MIV test."))); + +// TODO: This flag is disabled by default because it is still under development. +// Enable it or delete this flag when the feature is ready. +static cl::opt<bool> EnableMonotonicityCheck( + "da-enable-monotonicity-check", cl::init(false), cl::Hidden, + cl::desc("Check if the subscripts are monotonic. If it's not, dependence " + "is reported as unknown.")); + +static cl::opt<bool> DumpMonotonicityReport( + "da-dump-monotonicity-report", cl::init(false), cl::Hidden, + cl::desc( + "When printing analysis, dump the results of monotonicity checks.")); //===----------------------------------------------------------------------===// // basics @@ -177,13 +227,196 @@ void DependenceAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive<LoopInfoWrapperPass>(); } +namespace { + +/// The property of monotonicity of a SCEV. To define the monotonicity, assume +/// a SCEV defined within N-nested loops. Let i_k denote the iteration number +/// of the k-th loop. Then we can regard the SCEV as an N-ary function: +/// +/// F(i_1, i_2, ..., i_N) +/// +/// The domain of i_k is the closed range [0, BTC_k], where BTC_k is the +/// backedge-taken count of the k-th loop. 
+/// +/// A function F is said to be "monotonically increasing with respect to the +/// k-th loop" if x <= y implies the following condition: +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) <= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// where i_1, ..., i_{k-1}, i_{k+1}, ..., i_N, x, and y are elements of their +/// respective domains. +/// +/// Likewise F is "monotonically decreasing with respect to the k-th loop" +/// if x <= y implies +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) >= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// A function F that is monotonically increasing or decreasing with respect to +/// the k-th loop is simply called "monotonic with respect to k-th loop". +/// +/// A function F is said to be "multivariate monotonic" when it is monotonic +/// with respect to all of the N loops. +/// +/// Since integer comparison can be either signed or unsigned, we need to +/// distinguish monotonicity in the signed sense from that in the unsigned +/// sense. Note that the inequality "x <= y" merely indicates loop progression +/// and is not affected by the difference between signed and unsigned order. +/// +/// Currently we only consider monotonicity in a signed sense. +enum class SCEVMonotonicityType { + /// We don't know anything about the monotonicity of the SCEV. + Unknown, + + /// The SCEV is loop-invariant with respect to the outermost loop. In other + /// words, the function F corresponding to the SCEV is a constant function. + Invariant, + + /// The function F corresponding to the SCEV is multivariate monotonic in a + /// signed sense. Note that the multivariate monotonic function may also be a + /// constant function. The order employed in the definition of monotonicity + /// is not strict order. + MultivariateSignedMonotonic, +}; + +struct SCEVMonotonicity { + SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint = nullptr); + + SCEVMonotonicityType getType() const { return Type; } + + const SCEV *getFailurePoint() const { return FailurePoint; } + + bool isUnknown() const { return Type == SCEVMonotonicityType::Unknown; } + + void print(raw_ostream &OS, unsigned Depth) const; + +private: + SCEVMonotonicityType Type; + + /// The subexpression that caused Unknown. Mainly for debugging purpose. + const SCEV *FailurePoint; +}; + +/// Check the monotonicity of a SCEV. Since dependence tests (SIV, MIV, etc.) +/// assume that subscript expressions are (multivariate) monotonic, we need to +/// verify this property before applying those tests. Violating this assumption +/// may cause them to produce incorrect results. +struct SCEVMonotonicityChecker + : public SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity> { + + SCEVMonotonicityChecker(ScalarEvolution *SE) : SE(SE) {} + + /// Check the monotonicity of \p Expr. \p Expr must be integer type. If \p + /// OutermostLoop is not null, \p Expr must be defined in \p OutermostLoop or + /// one of its nested loops. + SCEVMonotonicity checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop); + +private: + ScalarEvolution *SE; + + /// The outermost loop that DA is analyzing. + const Loop *OutermostLoop; + + /// A helper to classify \p Expr as either Invariant or Unknown. + SCEVMonotonicity invariantOrUnknown(const SCEV *Expr); + + /// Return true if \p Expr is loop-invariant with respect to the outermost + /// loop. + bool isLoopInvariant(const SCEV *Expr) const; + + /// A helper to create an Unknown SCEVMonotonicity. 
+ SCEVMonotonicity createUnknown(const SCEV *FailurePoint) { + return SCEVMonotonicity(SCEVMonotonicityType::Unknown, FailurePoint); + } + + SCEVMonotonicity visitAddRecExpr(const SCEVAddRecExpr *Expr); + + SCEVMonotonicity visitConstant(const SCEVConstant *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + SCEVMonotonicity visitVScale(const SCEVVScale *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + + // TODO: Handle more cases. + SCEVMonotonicity visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitAddExpr(const SCEVAddExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitMulExpr(const SCEVMulExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitTruncateExpr(const SCEVTruncateExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUDivExpr(const SCEVUDivExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMaxExpr(const SCEVSMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMaxExpr(const SCEVUMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMinExpr(const SCEVSMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMinExpr(const SCEVUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUnknown(const SCEVUnknown *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { + return invariantOrUnknown(Expr); + } + + friend struct SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity>; +}; + +} // anonymous namespace + // Used to test the dependence analyzer. // Looks through the function, noting instructions that may access memory. // Calls depends() on every possible pair and prints out the result. // Ignores all other instructions. 
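As a standalone illustration of the shape of the checker above (plain C++ with invented names such as ToyExpr/classify standing in for SCEV; not part of this patch): every node kind the checker does not understand falls back to "Invariant if it cannot vary in the loop nest, otherwise Unknown", and only the affine no-signed-wrap recurrence is promoted to the monotonic class.

// --- illustrative sketch, not part of the patch ---
#include <cassert>

// Toy stand-in for a SCEV node: a constant, a value invariant in the loop
// nest, an affine recurrence {Start,+,Step}, or anything else.
enum class ToyKind { Constant, LoopInvariant, AffineAddRec, Other };

struct ToyExpr {
  ToyKind Kind;
  bool HasNoSignedWrap = false; // the <nsw> flag on the recurrence
  bool StepIsInvariant = false; // step does not vary inside the nest
};

enum class Monotonicity { Unknown, Invariant, MultivariateSignedMonotonic };

// Roughly mirrors the visitor above: conservative fallback everywhere, with
// a special case only for the affine <nsw> recurrence with invariant step.
Monotonicity classify(const ToyExpr &E) {
  switch (E.Kind) {
  case ToyKind::Constant:
  case ToyKind::LoopInvariant:
    return Monotonicity::Invariant;
  case ToyKind::AffineAddRec:
    if (E.HasNoSignedWrap && E.StepIsInvariant)
      return Monotonicity::MultivariateSignedMonotonic;
    return Monotonicity::Unknown;
  case ToyKind::Other:
    return Monotonicity::Unknown;
  }
  return Monotonicity::Unknown;
}

int main() {
  assert(classify({ToyKind::Constant}) == Monotonicity::Invariant);
  assert(classify({ToyKind::AffineAddRec, /*nsw=*/true, /*inv step=*/true}) ==
         Monotonicity::MultivariateSignedMonotonic);
  assert(classify({ToyKind::AffineAddRec, /*nsw=*/false, /*inv step=*/true}) ==
         Monotonicity::Unknown);
  return 0;
}
// --- end sketch ---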
static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, - ScalarEvolution &SE, bool NormalizeResults) { + ScalarEvolution &SE, LoopInfo &LI, + bool NormalizeResults) { auto *F = DA->getFunction(); + + if (DumpMonotonicityReport) { + SCEVMonotonicityChecker Checker(&SE); + OS << "Monotonicity check:\n"; + for (Instruction &Inst : instructions(F)) { + if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst)) + continue; + Value *Ptr = getLoadStorePointerOperand(&Inst); + const Loop *L = LI.getLoopFor(Inst.getParent()); + const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); + const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + OS.indent(2) << "Inst: " << Inst << "\n"; + OS.indent(4) << "Expr: " << *AccessFn << "\n"; + Mon.print(OS, 4); + } + OS << "\n"; + } + for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { @@ -235,7 +468,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { dumpExampleDependence( - OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), false); } PreservedAnalyses @@ -244,7 +478,7 @@ DependenceAnalysisPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) { << "':\n"; dumpExampleDependence(OS, &FAM.getResult<DependenceAnalysis>(F), FAM.getResult<ScalarEvolutionAnalysis>(F), - NormalizeResults); + FAM.getResult<LoopAnalysis>(F), NormalizeResults); return PreservedAnalyses::all(); } @@ -671,6 +905,81 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { } //===----------------------------------------------------------------------===// +// SCEVMonotonicity + +SCEVMonotonicity::SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint) + : Type(Type), FailurePoint(FailurePoint) { + assert( + ((Type == SCEVMonotonicityType::Unknown) == (FailurePoint != nullptr)) && + "FailurePoint must be provided iff Type is Unknown"); +} + +void SCEVMonotonicity::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Monotonicity: "; + switch (Type) { + case SCEVMonotonicityType::Unknown: + assert(FailurePoint && "FailurePoint must be provided for Unknown"); + OS << "Unknown\n"; + OS.indent(Depth) << "Reason: " << *FailurePoint << "\n"; + break; + case SCEVMonotonicityType::Invariant: + OS << "Invariant\n"; + break; + case SCEVMonotonicityType::MultivariateSignedMonotonic: + OS << "MultivariateSignedMonotonic\n"; + break; + } +} + +bool SCEVMonotonicityChecker::isLoopInvariant(const SCEV *Expr) const { + return !OutermostLoop || SE->isLoopInvariant(Expr, OutermostLoop); +} + +SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { + if (isLoopInvariant(Expr)) + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + return createUnknown(Expr); +} + +SCEVMonotonicity +SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop) { + assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); + this->OutermostLoop = OutermostLoop; + return visit(Expr); +} + +/// We only care about an affine AddRec at the moment. For an affine AddRec, +/// the monotonicity can be inferred from its nowrap property. For example, let +/// X and Y be loop-invariant, and assume Y is non-negative. 
An AddRec +/// {X,+.Y}<nsw> implies: +/// +/// X <=s (X + Y) <=s ((X + Y) + Y) <=s ... +/// +/// Thus, we can conclude that the AddRec is monotonically increasing with +/// respect to the associated loop in a signed sense. The similar reasoning +/// applies when Y is non-positive, leading to a monotonically decreasing +/// AddRec. +SCEVMonotonicity +SCEVMonotonicityChecker::visitAddRecExpr(const SCEVAddRecExpr *Expr) { + if (!Expr->isAffine() || !Expr->hasNoSignedWrap()) + return createUnknown(Expr); + + const SCEV *Start = Expr->getStart(); + const SCEV *Step = Expr->getStepRecurrence(*SE); + + SCEVMonotonicity StartMon = visit(Start); + if (StartMon.isUnknown()) + return StartMon; + + if (!isLoopInvariant(Step)) + return createUnknown(Expr); + + return SCEVMonotonicity(SCEVMonotonicityType::MultivariateSignedMonotonic); +} + +//===----------------------------------------------------------------------===// // DependenceInfo methods // For debugging purposes. Dumps a dependence to OS. @@ -1273,6 +1582,13 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns true iff \p Test is enabled. +static bool isDependenceTestEnabled(DependenceTestType Test) { + if (EnableDependenceTest == DependenceTestType::All) + return true; + return EnableDependenceTest == Test; +} + // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1334,6 +1650,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::StrongSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tStrong SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff); LLVM_DEBUG(dbgs() << ", " << *Coeff->getType() << "\n"); @@ -1468,6 +1787,9 @@ bool DependenceInfo::weakCrossingSIVtest( const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint, const SCEV *&SplitIter) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakCrossingSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n"); LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n"); @@ -1726,6 +2048,9 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::ExactSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tExact SIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -1905,6 +2230,9 @@ bool DependenceInfo::weakZeroSrcSIVtest( const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to // the Src and Dst loops. If it isn't, then there's no need to // record a direction. 
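To make the {X,+,Y}&lt;nsw&gt; argument above concrete, here is a small standalone check using plain integers instead of SCEV (illustrative only): with a loop-invariant step and no signed wrap, the recurrence's values form a monotone sequence in signed order.

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>
#include <vector>

// Evaluate the affine recurrence {Start,+,Step} for TripCount iterations,
// stopping if signed overflow would occur (the analogue of requiring <nsw>).
// __builtin_add_overflow is a GCC/Clang builtin.
std::vector<int64_t> evalAddRec(int64_t Start, int64_t Step, int TripCount) {
  std::vector<int64_t> Vals;
  int64_t Cur = Start;
  for (int I = 0; I < TripCount; ++I) {
    Vals.push_back(Cur);
    int64_t Next;
    if (__builtin_add_overflow(Cur, Step, &Next))
      break; // would wrap: the <nsw> precondition is violated
    Cur = Next;
  }
  return Vals;
}

int main() {
  // Step >= 0 and no signed wrap: X <=s X+Y <=s X+2Y <=s ...
  auto V = evalAddRec(/*Start=*/-10, /*Step=*/3, /*TripCount=*/8);
  for (size_t I = 1; I < V.size(); ++I)
    assert(V[I - 1] <= V[I]);

  // Step <= 0 gives the mirror image: a monotonically decreasing sequence.
  auto W = evalAddRec(/*Start=*/100, /*Step=*/-7, /*TripCount=*/8);
  for (size_t I = 1; I < W.size(); ++I)
    assert(W[I - 1] >= W[I]);
  return 0;
}
// --- end sketch ---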
@@ -2013,6 +2341,9 @@ bool DependenceInfo::weakZeroDstSIVtest( const SCEV *SrcCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to the // Src and Dst loops. If it isn't, then there's no need to record a direction. LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n"); @@ -2096,8 +2427,9 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *SrcLoop, const Loop *DstLoop, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::ExactRDIV)) return false; + LLVM_DEBUG(dbgs() << "\tExact RDIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -2242,8 +2574,9 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, const SCEV *C1, const SCEV *C2, const Loop *Loop1, const Loop *Loop2) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::SymbolicRDIV)) return false; + ++SymbolicRDIVapplications; LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n"); LLVM_DEBUG(dbgs() << "\t A1 = " << *A1); @@ -2557,8 +2890,9 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, // to "a common divisor". bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::GCDMIV)) return false; + LLVM_DEBUG(dbgs() << "starting gcd\n"); ++GCDapplications; unsigned BitWidth = SE->getTypeSizeInBits(Src->getType()); @@ -2725,8 +3059,9 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::BanerjeeMIV)) return false; + LLVM_DEBUG(dbgs() << "starting Banerjee\n"); ++BanerjeeApplications; LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n'); @@ -3488,10 +3823,19 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, // resize Pair to contain as many pairs of subscripts as the delinearization // has found, and then initialize the pairs following the delinearization. Pair.resize(Size); + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? SrcLoop->getOutermostLoop() : nullptr; for (int I = 0; I < Size; ++I) { Pair[I].Src = SrcSubscripts[I]; Pair[I].Dst = DstSubscripts[I]; unifySubscriptType(&Pair[I]); + + if (EnableMonotonicityCheck) { + if (MonChecker.checkMonotonicity(Pair[I].Src, OutermostLoop).isUnknown()) + return false; + if (MonChecker.checkMonotonicity(Pair[I].Dst, OutermostLoop).isUnknown()) + return false; + } } return true; @@ -3824,6 +4168,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[0].Src = SrcEv; Pair[0].Dst = DstEv; + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? 
SrcLoop->getOutermostLoop() : nullptr; + if (EnableMonotonicityCheck) + if (MonChecker.checkMonotonicity(Pair[0].Src, OutermostLoop).isUnknown() || + MonChecker.checkMonotonicity(Pair[0].Dst, OutermostLoop).isUnknown()) + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); + if (Delinearize) { if (tryDelinearize(Src, Dst, Pair)) { LLVM_DEBUG(dbgs() << " delinearized\n"); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index dc813f6..8da51d0 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5106,32 +5106,33 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr, return Ptr; // The following transforms are only safe if the ptrtoint cast - // doesn't truncate the pointers. - if (Indices[0]->getType()->getScalarSizeInBits() == - Q.DL.getPointerSizeInBits(AS)) { + // doesn't truncate the address of the pointers. The non-address bits + // must be the same, as the underlying objects are the same. + if (Indices[0]->getType()->getScalarSizeInBits() >= + Q.DL.getAddressSizeInBits(AS)) { auto CanSimplify = [GEPTy, &P, Ptr]() -> bool { return P->getType() == GEPTy && getUnderlyingObject(P) == getUnderlyingObject(Ptr); }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && - match(Indices[0], - m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) && + match(Indices[0], m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr)))) && CanSimplify()) return P; // getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of // size 1 << C. - if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_AShr(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C && CanSimplify()) return P; // getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of // size C. - if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_SDiv(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_SpecificInt(TyAllocSize))) && CanSimplify()) return P; diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a60a4bb..6cdf51a 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -1100,12 +1100,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (auto &GlobalList : Index) { // Ignore entries for references that are undefined in the current module. 
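The InstructionSimplify hunk above relies on a simple pointer identity: for byte-sized elements, "getelementptr V, (sub P, V)" is V + (P - V), which is P, provided the index type is wide enough to hold the full address difference. A standalone sketch of that arithmetic (ordinary C++, not the simplifier itself):

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstddef>

int main() {
  char Buf[64] = {};
  char *V = Buf + 8;
  char *P = Buf + 40;

  // The fold: "getelementptr i8, ptr V, (sub P, V)" is just V + (P - V) == P,
  // as long as the index covers the whole address difference.
  std::ptrdiff_t Diff = P - V; // full-width pointer difference
  char *Q = V + Diff;
  assert(Q == P);

  // If the difference were computed in a type narrower than the address
  // (i.e. the ptrtoint/ptrtoaddr truncated), high bits could be lost and Q
  // would no longer equal P in general; that is why the simplification now
  // compares the index width against the address size of the address space.
  return 0;
}
// --- end sketch ---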
- if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected module's index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; if (!IsThinLTO) { Summary->setNotEligibleToImport(); continue; diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 5e94e0b..5e92ca1 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -1136,7 +1136,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { if (!AreStatisticsEnabled()) return; for (auto &GVS : Index) - for (auto &GV : GVS.second.SummaryList) + for (auto &GV : GVS.second.getSummaryList()) if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get())) Stat += FS->paramAccesses().size(); }; @@ -1147,7 +1147,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { // Convert the ModuleSummaryIndex to a FunctionMap for (auto &GVS : Index) { - for (auto &GV : GVS.second.SummaryList) { + for (auto &GV : GVS.second.getSummaryList()) { FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get()); if (!FS || FS->paramAccesses().empty()) continue; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 8ff3aa9..61aa7c2f5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -235,7 +235,7 @@ public: return; for (const auto &GUIDSummaryLists : *Index) // Examine all summaries for this GUID. - for (auto &Summary : GUIDSummaryLists.second.SummaryList) + for (auto &Summary : GUIDSummaryLists.second.getSummaryList()) if (auto FS = dyn_cast<FunctionSummary>(Summary.get())) { // For each call in the function summary, see if the call // is to a GUID (which means it is for an indirect call, @@ -587,7 +587,7 @@ public: } } else { for (auto &Summaries : Index) - for (auto &Summary : Summaries.second.SummaryList) + for (auto &Summary : Summaries.second.getSummaryList()) Callback({Summaries.first, Summary.get()}, false); } } diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 72bb98c..64cbe9d 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -836,6 +836,7 @@ uint64_t DataRecordHandle::getDataSize() const { case DataSizeFlags::Uses8B: return support::endian::read64le(DataSizePtr); } + llvm_unreachable("Unknown DataSizeFlags enum"); } void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const { @@ -863,6 +864,7 @@ uint32_t DataRecordHandle::getNumRefs() const { case NumRefsFlags::Uses8B: return support::endian::read64le(NumRefsPtr); } + llvm_unreachable("Unknown NumRefsFlags enum"); } void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const { @@ -1270,6 +1272,7 @@ Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) { return FaultInResult.takeError(); return true; } + llvm_unreachable("Unknown ObjectPresence enum"); } Expected<OnDiskGraphDB::ObjectPresence> diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index f0f0861..c7d45897 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -566,32 +566,54 @@ bool DwarfExpression::addExpression( case 
dwarf::DW_OP_LLVM_extract_bits_zext: { unsigned SizeInBits = Op->getArg(1); unsigned BitOffset = Op->getArg(0); + unsigned DerefSize = 0; + // Operations are done in the DWARF "generic type" whose size + // is the size of a pointer. + unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); // If we have a memory location then dereference to get the value, though // we have to make sure we don't dereference any bytes past the end of the // object. if (isMemoryLocation()) { - emitOp(dwarf::DW_OP_deref_size); - emitUnsigned(alignTo(BitOffset + SizeInBits, 8) / 8); + DerefSize = alignTo(BitOffset + SizeInBits, 8) / 8; + if (DerefSize == PtrSizeInBytes) { + emitOp(dwarf::DW_OP_deref); + } else { + emitOp(dwarf::DW_OP_deref_size); + emitUnsigned(DerefSize); + } } - // Extract the bits by a shift left (to shift out the bits after what we - // want to extract) followed by shift right (to shift the bits to position - // 0 and also sign/zero extend). These operations are done in the DWARF - // "generic type" whose size is the size of a pointer. - unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); - unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); - unsigned RightShift = LeftShift + BitOffset; - if (LeftShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(LeftShift); - emitOp(dwarf::DW_OP_shl); - } - if (RightShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(RightShift); - emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra - : dwarf::DW_OP_shr); + // If a dereference was emitted for an unsigned value, and + // there's no bit offset, then a bit of optimization is + // possible. + if (OpNum == dwarf::DW_OP_LLVM_extract_bits_zext && BitOffset == 0) { + if (8 * DerefSize == SizeInBits) { + // The correct value is already on the stack. + } else { + // No need to shift, we can just mask off the desired bits. + emitOp(dwarf::DW_OP_constu); + emitUnsigned((1u << SizeInBits) - 1); + emitOp(dwarf::DW_OP_and); + } + } else { + // Extract the bits by a shift left (to shift out the bits after what we + // want to extract) followed by shift right (to shift the bits to + // position 0 and also sign/zero extend). + unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); + unsigned RightShift = LeftShift + BitOffset; + if (LeftShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(LeftShift); + emitOp(dwarf::DW_OP_shl); + } + if (RightShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(RightShift); + emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext + ? dwarf::DW_OP_shra + : dwarf::DW_OP_shr); + } } // The value is now at the top of the stack, so set the location to diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index f80e1e8..3ac6d2a 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -1498,7 +1498,7 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) { // Before stepping forward past MI, remember which regs were live // before MI. This is needed to set the Undef flag only when reg is // dead. 
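For the DW_OP_LLVM_extract_bits lowering above, the shortcut taken when the field is unsigned and starts at bit 0 rests on a bit-twiddling equivalence: masking with (1 << Size) - 1 extracts the same bits as the general shift-left/shift-right pair. A standalone check of that equivalence (plain C++, not the DWARF emitter):

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>

// General path: shift the field to the top of the word, then shift it back
// down. For the zero-extending variant this is a logical right shift.
uint64_t extractShiftPair(uint64_t Word, unsigned Offset, unsigned Size) {
  unsigned Left = 64 - (Size + Offset);
  unsigned Right = Left + Offset;
  return (Word << Left) >> Right;
}

// Shortcut for an unsigned field at bit offset 0: a single mask.
uint64_t extractMask(uint64_t Word, unsigned Size) {
  return Word & ((uint64_t(1) << Size) - 1);
}

int main() {
  uint64_t W = 0xDEADBEEFCAFEBABEull;
  for (unsigned Size = 1; Size < 64; ++Size)
    assert(extractShiftPair(W, /*Offset=*/0, Size) == extractMask(W, Size));
  // With a non-zero offset (or a sign-extending extract) the shift pair is
  // still needed, which is why the patch keeps it as the fallback path.
  assert(extractShiftPair(W, /*Offset=*/8, /*Size=*/16) == ((W >> 8) & 0xFFFF));
  return 0;
}
// --- end sketch ---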
- SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI; + SparseSet<MCPhysReg, MCPhysReg> LiveBeforeMI; LiveBeforeMI.setUniverse(TRI->getNumRegs()); for (unsigned Reg : Redefs) LiveBeforeMI.insert(Reg); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 804480c..72b364c 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private: unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; - using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>; + using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6bf9008..310d35d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16433,7 +16433,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { case ISD::OR: case ISD::XOR: if (!LegalOperations && N0.hasOneUse() && - (isConstantOrConstantVector(N0.getOperand(0), true) || + (N0.getOperand(0) == N0.getOperand(1) || + isConstantOrConstantVector(N0.getOperand(0), true) || isConstantOrConstantVector(N0.getOperand(1), true))) { // TODO: We already restricted this to pre-legalization, but for vectors // we are extra cautious to not create an unsupported operation. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 212a0c0..db5cc37 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -107,6 +107,28 @@ static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); } +static llvm::StringRef +prettyLanguageVersionString(const DWARFAttribute &AttrValue, + const DWARFDie &Die) { + if (AttrValue.Attr != DW_AT_language_version) + return {}; + + auto NameForm = Die.find(DW_AT_language_name); + if (!NameForm) + return {}; + + auto LName = NameForm->getAsUnsignedConstant(); + if (!LName) + return {}; + + auto LVersion = AttrValue.Value.getAsUnsignedConstant(); + if (!LVersion) + return {}; + + return llvm::dwarf::LanguageDescription( + static_cast<SourceLanguageName>(*LName), *LVersion); +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -146,15 +168,28 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, } else if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) Name = AttributeValueString(Attr, *Val); - if (!Name.empty()) - WithColor(OS, Color) << Name; - else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || - Attr == DW_AT_call_line || Attr == DW_AT_call_column || - Attr == DW_AT_language_version) { + auto DumpUnsignedConstant = [&OS, + &DumpOpts](const DWARFFormValue &FormValue) { if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) OS << *Val; else FormValue.dump(OS, DumpOpts); + }; + + llvm::StringRef PrettyVersionName = + prettyLanguageVersionString(AttrValue, Die); + bool ShouldDumpRawLanguageVersion = + Attr == DW_AT_language_version && + (DumpOpts.Verbose || PrettyVersionName.empty()); + + if (!Name.empty()) + WithColor(OS, Color) << Name; + else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || + Attr == DW_AT_call_line || Attr == 
DW_AT_call_column) { + DumpUnsignedConstant(FormValue); + } else if (Attr == DW_AT_language_version) { + if (ShouldDumpRawLanguageVersion) + DumpUnsignedConstant(FormValue); } else if (Attr == DW_AT_low_pc && (FormValue.getAsAddress() == dwarf::computeTombstoneAddress(U->getAddressByteSize()))) { @@ -226,6 +261,10 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DumpOpts.RecoverableErrorHandler(createStringError( errc::invalid_argument, "decoding address ranges: %s", toString(RangesOrError.takeError()).c_str())); + } else if (Attr == DW_AT_language_version) { + if (!PrettyVersionName.empty()) + WithColor(OS, Color) << (ShouldDumpRawLanguageVersion ? " " : "") + << PrettyVersionName; } OS << ")\n"; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 3908a78..488b078 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3196,7 +3196,7 @@ void AssemblyWriter::printModuleSummaryIndex() { // for aliasee (then update BitcodeWriter.cpp and remove get/setAliaseeGUID). for (auto &GlobalList : *TheIndex) { auto GUID = GlobalList.first; - for (auto &Summary : GlobalList.second.SummaryList) + for (auto &Summary : GlobalList.second.getSummaryList()) SummaryToGUIDMap[Summary.get()] = GUID; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 9060a89..3b8fde8 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2878,7 +2878,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp, { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | - { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr | { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index dc55b63..a6353664 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -162,7 +162,7 @@ void ModuleSummaryIndex::collectDefinedFunctionsForModule( StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const { for (auto &GlobalList : *this) { auto GUID = GlobalList.first; - for (auto &GlobSummary : GlobalList.second.SummaryList) { + for (auto &GlobSummary : GlobalList.second.getSummaryList()) { auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get()); if (!Summary) // Ignore global variable, focus on functions @@ -263,7 +263,7 @@ void ModuleSummaryIndex::propagateAttributes( DenseSet<ValueInfo> MarkedNonReadWriteOnly; for (auto &P : *this) { bool IsDSOLocal = true; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!isGlobalValueLive(S.get())) { // computeDeadSymbolsAndUpdateIndirectCalls should have marked all // copies live. Note that it is possible that there is a GUID collision @@ -273,7 +273,7 @@ void ModuleSummaryIndex::propagateAttributes( // all copies live we can assert here that all are dead if any copy is // dead. 
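A large share of this patch is the mechanical switch from touching the SummaryList member directly to calling getSummaryList(). A minimal sketch of the underlying pattern (hypothetical Summary/SummaryInfo types, not the actual GlobalValueSummaryInfo definition): keep the container private and hand out read-only access, so the representation or its invariants can change later without revisiting every caller.

// --- illustrative sketch, not part of the patch ---
#include <memory>
#include <string>
#include <vector>

struct Summary { // stand-in for GlobalValueSummary
  std::string Name;
  bool Live = false;
};

class SummaryInfo { // stand-in for the per-GUID summary-list holder
  std::vector<std::unique_ptr<Summary>> SummaryList;

public:
  // Read-only view for the many iteration sites seen in this diff.
  const std::vector<std::unique_ptr<Summary>> &getSummaryList() const {
    return SummaryList;
  }

  // Mutation goes through a dedicated entry point, so invariants (ordering,
  // deduplication, ...) can be enforced in one place later.
  void addSummary(std::unique_ptr<Summary> S) {
    SummaryList.push_back(std::move(S));
  }
};

int main() {
  SummaryInfo Info;
  Info.addSummary(std::make_unique<Summary>(Summary{"foo", true}));
  for (const auto &S : Info.getSummaryList())
    (void)S->Name; // callers read exactly as before, minus the field access
  return Info.getSummaryList().empty() ? 1 : 0;
}
// --- end sketch ---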
assert(llvm::none_of( - P.second.SummaryList, + P.second.getSummaryList(), [&](const std::unique_ptr<GlobalValueSummary> &Summary) { return isGlobalValueLive(Summary.get()); })); @@ -308,16 +308,16 @@ void ModuleSummaryIndex::propagateAttributes( // Mark the flag in all summaries false so that we can do quick check // without going through the whole list. for (const std::unique_ptr<GlobalValueSummary> &Summary : - P.second.SummaryList) + P.second.getSummaryList()) Summary->setDSOLocal(false); } setWithAttributePropagation(); setWithDSOLocalPropagation(); if (llvm::AreStatisticsEnabled()) for (auto &P : *this) - if (P.second.SummaryList.size()) + if (P.second.getSummaryList().size()) if (auto *GVS = dyn_cast<GlobalVarSummary>( - P.second.SummaryList[0]->getBaseObject())) + P.second.getSummaryList()[0]->getBaseObject())) if (isGlobalValueLive(GVS)) { if (GVS->maybeReadOnly()) ReadOnlyLiveGVars++; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index aec8891..cbc0b1d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -457,7 +457,7 @@ void llvm::thinLTOResolvePrevailingInIndex( // when needed. DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias; for (auto &I : Index) - for (auto &S : I.second.SummaryList) + for (auto &S : I.second.getSummaryList()) if (auto AS = dyn_cast<AliasSummary>(S.get())) GlobalInvolvedWithAlias.insert(&AS->getAliasee()); @@ -1182,7 +1182,7 @@ Error LTO::checkPartiallySplit() { // Otherwise check if there are any recorded in the combined summary from the // ThinLTO modules. for (auto &P : ThinLTO.CombinedIndex) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 280c3d1..93118be 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -770,11 +770,11 @@ bool lto::initImportList(const Module &M, // via a WriteIndexesThinBackend. for (const auto &GlobalList : CombinedIndex) { // Ignore entries for undefined references. - if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - for (const auto &Summary : GlobalList.second.SummaryList) { + for (const auto &Summary : GlobalList.second.getSummaryList()) { // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. if (Summary->modulePath() == M.getModuleIdentifier()) diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b333cd..ff94c54 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -101,8 +101,8 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir, WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); } -static const GlobalValueSummary * -getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { +static const GlobalValueSummary *getFirstDefinitionForLinker( + ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { // If there is any strong definition anywhere, get it. 
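The ThinLTO helper above (now taking an ArrayRef of summaries rather than the concrete list type) boils down to "prefer a strong definition, otherwise fall back". The exact fallback rules are not shown in this hunk, so the selection below is an assumed simplification with invented linkage kinds, only meant to show the find_if shape:

// --- illustrative sketch, not part of the patch ---
#include <algorithm>
#include <cassert>
#include <vector>

enum class Linkage { Strong, WeakODR, LinkOnceODR, Available };

struct Summary {
  Linkage L;
};

// Simplified model: pick a strong definition if any copy has one, otherwise
// fall back to the first ODR copy (assumed fallback, not the real rule set).
const Summary *firstDefinitionForLinker(const std::vector<Summary> &List) {
  auto Strong = std::find_if(List.begin(), List.end(), [](const Summary &S) {
    return S.L == Linkage::Strong;
  });
  if (Strong != List.end())
    return &*Strong;
  auto ODR = std::find_if(List.begin(), List.end(), [](const Summary &S) {
    return S.L == Linkage::WeakODR || S.L == Linkage::LinkOnceODR;
  });
  return ODR != List.end() ? &*ODR : nullptr;
}

int main() {
  std::vector<Summary> L = {{Linkage::LinkOnceODR}, {Linkage::Strong}};
  assert(firstDefinitionForLinker(L)->L == Linkage::Strong);
  std::vector<Summary> M = {{Linkage::Available}, {Linkage::WeakODR}};
  assert(firstDefinitionForLinker(M)->L == Linkage::WeakODR);
  return 0;
}
// --- end sketch ---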
auto StrongDefForLinker = llvm::find_if( GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) { @@ -131,14 +131,15 @@ getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { static void computePrevailingCopies( const ModuleSummaryIndex &Index, DenseMap<GlobalValue::GUID, const GlobalValueSummary *> &PrevailingCopy) { - auto HasMultipleCopies = [&](const GlobalValueSummaryList &GVSummaryList) { - return GVSummaryList.size() > 1; - }; + auto HasMultipleCopies = + [&](ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { + return GVSummaryList.size() > 1; + }; for (auto &I : Index) { - if (HasMultipleCopies(I.second.SummaryList)) + if (HasMultipleCopies(I.second.getSummaryList())) PrevailingCopy[I.first] = - getFirstDefinitionForLinker(I.second.SummaryList); + getFirstDefinitionForLinker(I.second.getSummaryList()); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b..a81de5c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N, static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc DL(N); + + // If a DUP(Op0) already exists, reuse it for the scalar_to_vector. + if (DCI.isAfterLegalizeDAG()) { + if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(), + N->getOperand(0))) + return SDValue(LN, 0); + } + // Let's do below transform. // // t34: v4i32 = AArch64ISD::UADDLV t2 @@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Let's generate new sequence with AArch64ISD::NVCAST. - SDLoc DL(N); SDValue EXTRACT_SUBVEC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV, DAG.getConstant(0, DL, MVT::i64)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp new file mode 100644 index 0000000..30a1f05 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -0,0 +1,73 @@ +//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to add latency to +/// barrier edges between ATOMIC_FENCE instructions and preceding +/// memory accesses potentially affected by the fence. +/// This encourages the scheduling of more instructions before +/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may +/// introduce wait counting or indicate an impending S_BARRIER +/// wait. Having more instructions in-flight across these +/// constructs improves latency hiding. 
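The \file comment above describes the new AMDGPUBarrierLatency mutation: stretch the barrier edges from plain loads into ATOMIC_FENCE so the scheduler keeps more independent work in flight before the fence. A standalone sketch of that effect on a toy DAG (invented Node/Edge types, not ScheduleDAGInstrs):

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <vector>

// Toy model of the scheduling DAG: predecessor edges carry a latency and a
// barrier flag.
struct Edge {
  int Pred;             // index of the predecessor node
  unsigned Latency = 0;
  bool IsBarrier = false;
};

struct Node {
  enum Kind { Fence, Load, Other } K = Other;
  std::vector<Edge> Preds;
};

// Mirrors the shape of BarrierLatency::apply: bump the latency of barrier
// edges from pure loads into a fence (the real pass also updates the
// matching successor edge and marks depths dirty).
void addBarrierLatency(std::vector<Node> &DAG, unsigned SyntheticLatency) {
  for (Node &N : DAG) {
    if (N.K != Node::Fence)
      continue;
    for (Edge &E : N.Preds)
      if (E.IsBarrier && DAG[E.Pred].K == Node::Load)
        E.Latency += SyntheticLatency;
  }
}

int main() {
  std::vector<Node> DAG(3);
  DAG[0].K = Node::Load;
  DAG[1].K = Node::Other;
  DAG[2].K = Node::Fence;
  DAG[2].Preds = {{/*Pred=*/0, /*Latency=*/0, /*IsBarrier=*/true},
                  {/*Pred=*/1, /*Latency=*/0, /*IsBarrier=*/true}};
  addBarrierLatency(DAG, 2000);
  assert(DAG[2].Preds[0].Latency == 2000); // load -> fence edge stretched
  assert(DAG[2].Preds[1].Latency == 0);    // non-load predecessor untouched
  return 0;
}
// --- end sketch ---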
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUBarrierLatency.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" + +using namespace llvm; + +namespace { + +class BarrierLatency : public ScheduleDAGMutation { +public: + BarrierLatency() = default; + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { + constexpr unsigned SyntheticLatency = 2000; + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) + continue; + + // Update latency on barrier edges of ATOMIC_FENCE. + // We don't consider the scope of the fence or type of instruction + // involved in the barrier edge. + for (SDep &PredDep : SU.Preds) { + if (!PredDep.isBarrier()) + continue; + SUnit *PredSU = PredDep.getSUnit(); + MachineInstr *MI = PredSU->getInstr(); + // Only consider memory loads + if (!MI->mayLoad() || MI->mayStore()) + continue; + SDep ForwardD = PredDep; + ForwardD.setSUnit(&SU); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency); + break; + } + } + PredDep.setLatency(PredDep.getLatency() + SyntheticLatency); + PredSU->setDepthDirty(); + SU.setDepthDirty(); + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUBarrierLatencyDAGMutation() { + return std::make_unique<BarrierLatency>(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h new file mode 100644 index 0000000..c23f0b9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -0,0 +1,21 @@ +//===- AMDGPUBarrierLatency.h - AMDGPU Export Clustering --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..996b55f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" @@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..a1e0e52 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 74d4153..6f1feb1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2223,8 +2223,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index bdbc000..07264d9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -397,12 +397,6 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order 
to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; - /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -583,12 +577,8 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order, bool AtomicsOnly) const override; - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -2069,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. + if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2225,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2419,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. 
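The SIMemoryLegalizer change above makes the workgroup-scope wait in CU mode depend on whether the access is a release (on GFX12 the GFX12.5 case is also always-wait). A small standalone sketch of that decision with toy enums, not the real SICacheControl interface:

// --- illustrative sketch, not part of the patch ---
#include <cassert>

enum class Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

bool isReleaseOrStronger(Ordering O) {
  return O == Ordering::Release || O == Ordering::AcquireRelease ||
         O == Ordering::SeqCst;
}

// In WGP mode we always wait, because the two CUs of a WGP have separate L0
// caches. In CU mode the wait used to be skipped entirely; after this patch
// a release still waits, so another wave of the workgroup can later release
// the same memory at a wider scope (transitivity of happens-before).
bool needsWorkgroupVMCnt(bool CuModeEnabled, Ordering O) {
  return !CuModeEnabled || isReleaseOrStronger(O);
}

int main() {
  assert(needsWorkgroupVMCnt(/*CuMode=*/false, Ordering::Monotonic)); // WGP mode
  assert(!needsWorkgroupVMCnt(/*CuMode=*/true, Ordering::Acquire));   // old fast path kept
  assert(needsWorkgroupVMCnt(/*CuMode=*/true, Ordering::Release));    // new behaviour
  return 0;
}
// --- end sketch ---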
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -3017,11 +2999,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; - } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 96ee69c..406f4c1 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI, continue; // Skip the lr predicate reg int PIdx = llvm::findFirstVPTPredOperandIdx(MI); - if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) + if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg) continue; // Check that this instruction will produce zeros in its false lanes: diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 5eeb4fe..413e844 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, Register LR = LoopPhi->getOperand(0).getReg(); for (MachineInstr *MI : MVEInstrs) { int Idx = findFirstVPTPredOperandIdx(*MI); - MI->getOperand(Idx + 2).setReg(LR); + MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8bf0d11..d477522 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && - Subtarget.hasFRE())) + if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && - Subtarget.hasFRES())) + !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { @@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::LRINT, MVT::f64, Legal); - setOperationAction(ISD::LRINT, MVT::f32, Legal); - setOperationAction(ISD::LLRINT, MVT::f64, Legal); - setOperationAction(ISD::LLRINT, MVT::f32, Legal); - setOperationAction(ISD::LROUND, MVT::f64, Legal); - setOperationAction(ISD::LROUND, MVT::f32, Legal); - setOperationAction(ISD::LLROUND, MVT::f64, Legal); - setOperationAction(ISD::LLROUND, MVT::f32, Legal); - } + + setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom); + 
setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); @@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); // The nearbyint variants are not allowed to raise the inexact exception - // so we can only code-gen them with unsafe math. - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - } + // so we can only code-gen them with fpexcept.ignore. + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // be lost at this stage, but is below the single-precision rounding // position. // - // However, if -enable-unsafe-fp-math is in effect, accept double + // However, if afn is in effect, accept double // rounding to avoid the extra overhead. - if (Op.getValueType() == MVT::f32 && - !Subtarget.hasFPCVT() && - !DAG.getTarget().Options.UnsafeFPMath) { + // FIXME: Currently INT_TO_FP can't support fast math flags because + // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always + // false. + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && + !Op->getFlags().hasApproximateFuncs()) { // Twiddle input to make sure the low 11 bits are zero. 
(If this // is the case, we are guaranteed the value will fit into the 53 bit @@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerADDSUBO_CARRY(Op, DAG); case ISD::UCMP: return LowerUCMP(Op, DAG); + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_FNEARBYINT: + if (Op->getFlags().hasNoFPExcept()) + return Op; + return SDValue(); } } @@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR0) + .addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); @@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( .addReg(ZeroReg) .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(dest) .addReg(oldval); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(CrReg) .addMBB(exitMBB); BB->addSuccessor(loop2MBB); @@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(ptrA) .addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { const Function *F = I->getFunction(); const DataLayout &DL = F->getDataLayout(); Type *Ty = User->getOperand(0)->getType(); + bool AllowContract = I->getFastMathFlags().allowContract() && + User->getFastMathFlags().allowContract(); - return !( - isFMAFasterThanFMulAndFAdd(*F, Ty) && - isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast)); } case Instruction::Load: { // Don't break "store (load float*)" pattern, this pattern will be combined diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 979ba31..885bed6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, // these need to be defined after the any_frint versions so ISEL will correctly // add the chain to the strict versions. -def : Pat<(f32 (fnearbyint f32:$S)), +// TODO: Match strict fp rounding intrinsics with instructions like xsrdpiz when +// rounding mode is propagated to CodeGen part. 
+def : Pat<(f32 (strict_fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f64 (fnearbyint f64:$S)), +def : Pat<(f64 (strict_fnearbyint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (fnearbyint v2f64:$S)), +def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)), (v2f64 (XVRDPIC $S))>; -def : Pat<(v4f32 (fnearbyint v4f32:$S)), +def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Materialize a zero-vector of long long @@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; // Rounding to integer. -def : Pat<(i64 (lrint f64:$S)), +def : Pat<(i64 (strict_lrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (lrint f32:$S)), +def : Pat<(i64 (strict_lrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (llrint f64:$S)), +def : Pat<(i64 (strict_llrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (llrint f32:$S)), +def : Pat<(i64 (strict_llrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (lround f64:$S)), +def : Pat<(i64 (strict_lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (lround f32:$S)), +def : Pat<(i64 (strict_lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i32 (lround f64:$S)), +def : Pat<(i32 (strict_lround f64:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; -def : Pat<(i32 (lround f32:$S)), +def : Pat<(i32 (strict_lround f32:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i64 (llround f64:$S)), +def : Pat<(i64 (strict_llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (llround f32:$S)), +def : Pat<(i64 (strict_llround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0a53ba9..a77d765 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24044,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - std::pair<Register, const TargetRegisterClass *> Res = - TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - - // If we picked one of the Zfinx register classes, remap it to the GPR class. - // FIXME: When Zfinx is supported in CodeGen this will need to take the - // Subtarget into account. - if (Res.second == &RISCV::GPRF16RegClass || - Res.second == &RISCV::GPRF32RegClass || - Res.second == &RISCV::GPRPairRegClass) - return std::make_pair(Res.first, &RISCV::GPRRegClass); - - return Res; + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } InlineAsm::ConstraintCode diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 28ee444..a29faab 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1368,13 +1368,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest( FunctionImporter::ImportMapTy &ImportList) { for (const auto &GlobalList : Index) { // Ignore entries for undefined references. 
- if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected individual combined index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. if (Summary->modulePath() == ModulePath) @@ -1423,7 +1423,7 @@ void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index, void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) { for (const auto &Entry : Index) { - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); } @@ -1456,7 +1456,7 @@ void llvm::computeDeadSymbolsAndUpdateIndirectCalls( // Add values flagged in the index as live roots to the worklist. for (const auto &Entry : Index) { auto VI = Index.getValueInfo(Entry); - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); if (S->isLive()) { @@ -2094,7 +2094,7 @@ static bool doImportingForModuleForTest( // is only enabled when testing importing via the 'opt' tool, which does // not do the ThinLink that would normally determine what values to promote. for (auto &I : *Index) { - for (auto &S : I.second.SummaryList) { + for (auto &S : I.second.getSummaryList()) { if (GlobalValue::isLocalLinkage(S->linkage())) S->setLinkage(GlobalValue::ExternalLinkage); } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index be6cba3..46fb567 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -2130,7 +2130,7 @@ bool LowerTypeTestsModule::lower() { // A set of all functions that are address taken by a live global object. DenseSet<GlobalValue::GUID> AddressTaken; for (auto &I : *ExportSummary) - for (auto &GVS : I.second.SummaryList) + for (auto &GVS : I.second.getSummaryList()) if (GVS->isLive()) for (const auto &Ref : GVS->refs()) { AddressTaken.insert(Ref.getGUID()); @@ -2409,7 +2409,7 @@ bool LowerTypeTestsModule::lower() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!ExportSummary->isGlobalValueLive(S.get())) continue; if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject())) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2d5cb82..76e588b 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -928,7 +928,7 @@ void llvm::updateVCallVisibilityInIndex( // linker, as we have no information on their eventual use. 
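The hunks above mechanically migrate callers from the public SummaryList member to the getSummaryList() accessor. A minimal sketch of the resulting iteration idiom, assuming an LLVM tree that already provides the accessor; countFunctionSummaries is an invented helper name, not from the patch:

    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Support/Casting.h"

    using namespace llvm;

    // Walk every GUID in a combined index and count its function summaries,
    // going through getSummaryList() rather than the raw SummaryList member.
    static unsigned countFunctionSummaries(const ModuleSummaryIndex &Index) {
      unsigned NumFunctions = 0;
      for (const auto &Entry : Index)
        for (const auto &S : Entry.second.getSummaryList())
          if (isa<FunctionSummary>(S.get()))
            ++NumFunctions;
      return NumFunctions;
    }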
if (DynamicExportSymbols.count(P.first)) continue; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *GVar = dyn_cast<GlobalVarSummary>(S.get()); if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) @@ -2413,7 +2413,7 @@ bool DevirtModule::run() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; @@ -2564,7 +2564,7 @@ void DevirtIndex::run() { // Collect information from summary about which calls to try to devirtualize. for (auto &P : ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index cdc559b..9b9fe26 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1643,33 +1643,46 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) { /// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. -static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static bool fitsInFPType(APFloat F, const fltSemantics &Sem) { bool losesInfo; - APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { - if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) - return nullptr; // No constant folding of this. +static Type *shrinkFPConstant(LLVMContext &Ctx, const APFloat &F, + bool PreferBFloat) { // See if the value can be truncated to bfloat and then reextended. - if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) - return Type::getBFloatTy(CFP->getContext()); + if (PreferBFloat && fitsInFPType(F, APFloat::BFloat())) + return Type::getBFloatTy(Ctx); // See if the value can be truncated to half and then reextended. - if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) - return Type::getHalfTy(CFP->getContext()); + if (!PreferBFloat && fitsInFPType(F, APFloat::IEEEhalf())) + return Type::getHalfTy(Ctx); // See if the value can be truncated to float and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEsingle())) - return Type::getFloatTy(CFP->getContext()); - if (CFP->getType()->isDoubleTy()) - return nullptr; // Won't shrink. - if (fitsInFPType(CFP, APFloat::IEEEdouble())) - return Type::getDoubleTy(CFP->getContext()); + if (fitsInFPType(F, APFloat::IEEEsingle())) + return Type::getFloatTy(Ctx); + if (&F.getSemantics() == &APFloat::IEEEdouble()) + return nullptr; // Won't shrink. + // See if the value can be truncated to double and then reextended. + if (fitsInFPType(F, APFloat::IEEEdouble())) + return Type::getDoubleTy(Ctx); // Don't try to shrink to various long double types. return nullptr; } +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { + Type *Ty = CFP->getType(); + if (Ty->getScalarType()->isPPC_FP128Ty()) + return nullptr; // No constant folding of this. 
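The refactored fitsInFPType above decides shrinkability purely from an APFloat value and the target semantics, via APFloat::convert and its losesInfo flag. A small self-contained sketch of that check (the helper name fitsIn and the constants are illustrative, not from the patch):

    #include "llvm/ADT/APFloat.h"
    #include <cstdio>

    using namespace llvm;

    // Convert a copy of the value to the narrower semantics and ask APFloat
    // whether any information was lost, mirroring fitsInFPType().
    static bool fitsIn(APFloat F, const fltSemantics &Sem) {
      bool LosesInfo;
      (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &LosesInfo);
      return !LosesInfo;
    }

    int main() {
      APFloat A(0.5); // exactly representable even as half
      APFloat B(0.1); // needs the full double mantissa
      std::printf("0.5 fits in half:  %d\n", fitsIn(A, APFloat::IEEEhalf()));
      std::printf("0.1 fits in float: %d\n", fitsIn(B, APFloat::IEEEsingle()));
      return 0;
    }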
+ + Type *ShrinkTy = + shrinkFPConstant(CFP->getContext(), CFP->getValueAPF(), PreferBFloat); + if (ShrinkTy) + if (auto *VecTy = dyn_cast<VectorType>(Ty)) + ShrinkTy = VectorType::get(ShrinkTy, VecTy); + + return ShrinkTy; +} + // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { @@ -1720,10 +1733,10 @@ static Type *getMinimumFPType(Value *V, bool PreferBFloat) { // Try to shrink scalable and fixed splat vectors. if (auto *FPC = dyn_cast<Constant>(V)) - if (isa<VectorType>(V->getType())) + if (auto *VTy = dyn_cast<VectorType>(V->getType())) if (auto *Splat = dyn_cast_or_null<ConstantFP>(FPC->getSplatValue())) if (Type *T = shrinkFPConstant(Splat, PreferBFloat)) - return T; + return VectorType::get(T, VTy); // Try to shrink a vector of FP constants. This returns nullptr on scalable // vectors @@ -1796,10 +1809,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = - getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); - Type *RHSMinType = - getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); + bool PreferBFloat = Ty->getScalarType()->isBFloatTy(); + Type *LHSMinType = getMinimumFPType(BO->getOperand(0), PreferBFloat); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1), PreferBFloat); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1cc9173..adf27be 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7231,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( return DenseMap<const SCEV *, Value *>(); } - VPlanTransforms::narrowInterleaveGroups( - BestVPlan, BestVF, - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -8202,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan, CM.getMaxSafeElements()); + + if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI)) + VPlans.push_back(std::move(P)); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3f18bd7..cdb9e7e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5577,62 +5577,79 @@ private: } // Decrement the unscheduled counter and insert to ready list if // ready. 
- auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, - unsigned OpIdx) { - if (!ScheduleCopyableDataMap.empty()) { - const EdgeInfo EI = {UserTE, OpIdx}; - if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { - DecrUnsched(CD, /*IsControl=*/false); - return; - } - } - auto It = OperandsUses.find(I); - assert(It != OperandsUses.end() && "Operand not found"); - if (It->second > 0) { - --It->getSecond(); - assert(TotalOpCount > 0 && "No more operands to decrement"); - --TotalOpCount; - if (ScheduleData *OpSD = getScheduleData(I)) - DecrUnsched(OpSD, /*IsControl=*/false); - } - }; + auto DecrUnschedForInst = + [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx, + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> + &Checked) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = + getScheduleCopyableData(EI, I)) { + if (!Checked.insert(std::make_pair(CD, OpIdx)).second) + return; + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) { + if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second) + return; + DecrUnsched(OpSD, /*IsControl=*/false); + } + } + }; for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; // Need to search for the lane since the tree entry can be // reordered. - int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - if (isa<StoreInst>(In) && - !Bundle->getTreeEntry()->ReorderIndices.empty()) - Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; - assert(Lane < static_cast<int>( - Bundle->getTreeEntry()->Scalars.size()) && - "Couldn't find extract lane"); - - // Since vectorization tree is being built recursively this - // assertion ensures that the tree entry has all operands set before - // reaching this code. Couple of exceptions known at the moment are - // extracts where their second (immediate) operand is not added. - // Since immediates do not affect scheduler behavior this is - // considered okay. 
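The new Checked set above guards DecrUnsched so that, when the same scalar occurs in several lanes of a tree entry, each (schedule entity, operand index) pair is only decremented once. A stripped-down sketch of that guard, using a plain int as a placeholder for the real ScheduleData/ScheduleCopyableData nodes:

    #include "llvm/ADT/DenseSet.h"
    #include <cstdio>
    #include <utility>

    using namespace llvm;

    int main() {
      int EntityA = 0; // stand-in for a schedule entity

      SmallDenseSet<std::pair<const int *, unsigned>> Checked;
      unsigned Decrements = 0;

      auto DecrOnce = [&](const int *Entity, unsigned OpIdx) {
        if (!Checked.insert(std::make_pair(Entity, OpIdx)).second)
          return;     // this operand of this entity was already handled
        ++Decrements; // the real code would call DecrUnsched(...) here
      };

      DecrOnce(&EntityA, 0);
      DecrOnce(&EntityA, 0); // duplicate lane: ignored
      DecrOnce(&EntityA, 1);
      std::printf("decrements = %u\n", Decrements); // prints 2
      return 0;
    }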
- assert(In && - (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands() || - Bundle->getTreeEntry()->isCopyableElement(In)) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast<Instruction>( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I - << "\n"); - DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); - } + auto *It = find(Bundle->getTreeEntry()->Scalars, In); + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked; + do { + int Lane = + std::distance(Bundle->getTreeEntry()->Scalars.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast<int>( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set + // before reaching this code. Couple of exceptions known at the + // moment are extracts where their second (immediate) operand is + // not added. Since immediates do not affect scheduler behavior + // this is considered okay. + assert(In && + (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast<Instruction>( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " + << *I << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); + } + // If parent node is schedulable, it will be handle correctly. + if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + break; + It = std::find(std::next(It), + Bundle->getTreeEntry()->Scalars.end(), In); + } while (It != Bundle->getTreeEntry()->Scalars.end()); } } else { // If BundleMember is a stand-alone instruction, no operand reordering diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d167009..b4e4dc2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1213,6 +1213,7 @@ VPlan *VPlan::duplicate() { } Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount; Old2NewVPValues[&VF] = &NewPlan->VF; + Old2NewVPValues[&UF] = &NewPlan->UF; Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF; if (BackedgeTakenCount) { NewPlan->BackedgeTakenCount = new VPValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fed04eb..8274431 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4152,6 +4152,9 @@ class VPlan { /// Represents the vectorization factor of the loop. VPValue VF; + /// Represents the symbolic unroll factor of the loop. + VPValue UF; + /// Represents the loop-invariant VF * UF of the vector loop region. VPValue VFxUF; @@ -4305,6 +4308,9 @@ public: VPValue &getVF() { return VF; }; const VPValue &getVF() const { return VF; }; + /// Returns the symbolic UF of the vector loop region. + VPValue &getSymbolicUF() { return UF; }; + /// Returns VF * UF of the vector loop region. 
VPValue &getVFxUF() { return VFxUF; } @@ -4314,6 +4320,12 @@ public: void addVF(ElementCount VF) { VFs.insert(VF); } + /// Remove \p VF from the plan. + void removeVF(ElementCount VF) { + assert(hasVF(VF) && "tried to remove VF not present in plan"); + VFs.remove(VF); + } + void setVF(ElementCount VF) { assert(hasVF(VF) && "Cannot set VF not already in plan"); VFs.clear(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e060e70..7bf8d83 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1478,11 +1478,8 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, if (!Plan.getVectorLoopRegion()) return false; - if (!Plan.getTripCount()->isLiveIn()) - return false; - auto *TC = dyn_cast_if_present<ConstantInt>( - Plan.getTripCount()->getUnderlyingValue()); - if (!TC || !BestVF.isFixed()) + const APInt *TC; + if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC))) return false; // Calculate the minimum power-of-2 bit width that can fit the known TC, VF @@ -1495,7 +1492,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8); }; unsigned NewBitWidth = - ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF); + ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF); LLVMContext &Ctx = Plan.getContext(); auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth); @@ -2092,8 +2089,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { // Recipes in replicate regions implicitly depend on predicate. If either // recipe is in a replicate region, only consider them equal if both have // the same parent. - const VPRegionBlock *RegionL = L->getParent()->getParent(); - const VPRegionBlock *RegionR = R->getParent()->getParent(); + const VPRegionBlock *RegionL = L->getRegion(); + const VPRegionBlock *RegionR = R->getRegion(); if (((RegionL && RegionL->isReplicator()) || (RegionR && RegionR->isReplicator())) && L->getParent() != R->getParent()) @@ -3867,8 +3864,7 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) { // required lanes implicitly. // TODO: Remove once replicate regions are unrolled completely. auto IsCandidateUnpackUser = [Def](VPUser *U) { - VPRegionBlock *ParentRegion = - cast<VPRecipeBase>(U)->getParent()->getParent(); + VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion(); return U->usesScalars(Def) && (!ParentRegion || !ParentRegion->isReplicator()); }; @@ -3960,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, // used. // TODO: Assert that they aren't used. + VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + Plan.getSymbolicUF().replaceAllUsesWith(UF); + // If there are no users of the runtime VF, compute VFxUF by constant folding // the multiplication of VF and UF. 
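For the narrowed induction width in the hunk above: with a compile-time trip count, the visible part of ComputeBitWidth rounds the active bit count up to a power of two and clamps at i8. A self-contained sketch of that arithmetic (the 1000-iteration trip count is just an example value):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/MathExtras.h"
    #include <algorithm>
    #include <cstdio>

    using namespace llvm;

    int main() {
      // A loop with a known trip count of 1000 needs 10 bits for its canonical
      // IV; rounding up to a power of two and clamping to at least 8 gives i16.
      APInt TC(64, 1000);
      unsigned Width = std::max<unsigned>(PowerOf2Ceil(TC.getActiveBits()), 8);
      std::printf("narrowed IV type: i%u\n", Width); // i16
      return 0;
    }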
if (VF.getNumUsers() == 0) { @@ -3979,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4047,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, return false; } -/// Returns true if \p IR is a full interleave group with factor and number of -/// members both equal to \p VF. The interleave group must also access the full -/// vector width \p VectorRegWidth. -static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, - unsigned VF, VPTypeAnalysis &TypeInfo, - unsigned VectorRegWidth) { +/// Returns VF from \p VFs if \p IR is a full interleave group with factor and +/// number of members both equal to VF. The interleave group must also access +/// the full vector width. +static std::optional<ElementCount> isConsecutiveInterleaveGroup( + VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs, + VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) { if (!InterleaveR) - return false; + return std::nullopt; Type *GroupElementTy = nullptr; if (InterleaveR->getStoredValues().empty()) { @@ -4063,7 +4061,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, [&TypeInfo, GroupElementTy](VPValue *Op) { return TypeInfo.inferScalarType(Op) == GroupElementTy; })) - return false; + return std::nullopt; } else { GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]); @@ -4071,13 +4069,27 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, [&TypeInfo, GroupElementTy](VPValue *Op) { return TypeInfo.inferScalarType(Op) == GroupElementTy; })) - return false; + return std::nullopt; } - unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF; - auto IG = InterleaveR->getInterleaveGroup(); - return IG->getFactor() == VF && IG->getNumMembers() == VF && - GroupSize == VectorRegWidth; + auto GetVectorWidthForVF = [&TTI](ElementCount VF) { + TypeSize Size = TTI.getRegisterBitWidth( + VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector + : TargetTransformInfo::RGK_ScalableVector); + assert(Size.isScalable() == VF.isScalable() && + "if Size is scalable, VF must to and vice versa"); + return Size.getKnownMinValue(); + }; + + for (ElementCount VF : VFs) { + unsigned MinVal = VF.getKnownMinValue(); + unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal; + auto IG = InterleaveR->getInterleaveGroup(); + if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal && + GroupSize == GetVectorWidthForVF(VF)) + return {VF}; + } + return std::nullopt; } /// Returns true if \p VPValue is a narrow VPValue. 
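The per-VF loop above accepts an interleave group only when factor and member count equal the VF and the group exactly fills one vector register for that VF. A plain-arithmetic version of that size check, with an assumed 128-bit fixed-width register and i32 members (both values are illustrative, not taken from any target):

    #include <cstdio>

    int main() {
      const unsigned ElementBits = 32;   // i32 group members (example)
      const unsigned RegisterBits = 128; // assumed fixed-width register size
      for (unsigned VF : {2u, 4u, 8u}) {
        // A group qualifies only if factor == members == VF (assumed here) and
        // the group spans exactly one register.
        unsigned GroupBits = ElementBits * VF;
        std::printf("VF=%u: group=%u bits, candidate=%d\n", VF, GroupBits,
                    GroupBits == RegisterBits);
      }
      return 0;
    }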
@@ -4088,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } -void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth) { +std::unique_ptr<VPlan> +VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, + const TargetTransformInfo &TTI) { + using namespace llvm::VPlanPatternMatch; VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); + if (!VectorLoop) - return; + return nullptr; VPTypeAnalysis TypeInfo(Plan); - - unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; + std::optional<ElementCount> VFToOptimize; for (auto &R : *VectorLoop->getEntryBasicBlock()) { if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) continue; @@ -4111,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // * recipes writing to memory except interleave groups // Only support plans with a canonical induction phi. if (R.isPhi()) - return; + return nullptr; auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R); if (R.mayWriteToMemory() && !InterleaveR) - return; - - // Do not narrow interleave groups if there are VectorPointer recipes and - // the plan was unrolled. The recipe implicitly uses VF from - // VPTransformState. - // TODO: Remove restriction once the VF for the VectorPointer offset is - // modeled explicitly as operand. - if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1) - return; + return nullptr; // All other ops are allowed, but we reject uses that cannot be converted // when checking all allowed consumers (store interleave groups) below. if (!InterleaveR) continue; - // Bail out on non-consecutive interleave groups. - if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, - VectorRegWidth)) - return; - + // Try to find a single VF, where all interleave groups are consecutive and + // saturate the full vector width. If we already have a candidate VF, check + // if it is applicable for the current InterleaveR, otherwise look for a + // suitable VF across the Plans VFs. + // + if (VFToOptimize) { + if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo, + TTI)) + return nullptr; + } else { + if (auto VF = isConsecutiveInterleaveGroup( + InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI)) + VFToOptimize = *VF; + else + return nullptr; + } // Skip read interleave groups. if (InterleaveR->getStoredValues().empty()) continue; @@ -4168,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( InterleaveR->getStoredValues()[0]->getDefiningRecipe()); if (!WideMember0) - return; + return nullptr; for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); if (!R || R->getOpcode() != WideMember0->getOpcode() || R->getNumOperands() > 2) - return; + return nullptr; if (any_of(enumerate(R->operands()), [WideMember0, Idx = I](const auto &P) { const auto &[OpIdx, OpV] = P; return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx); })) - return; + return nullptr; } StoreGroups.push_back(InterleaveR); } if (StoreGroups.empty()) - return; + return nullptr; + + // All interleave groups in Plan can be narrowed for VFToOptimize. Split the + // original Plan into 2: a) a new clone which contains all VFs of Plan, except + // VFToOptimize, and b) the original Plan with VFToOptimize as single VF. 
+ std::unique_ptr<VPlan> NewPlan; + if (size(Plan.vectorFactors()) != 1) { + NewPlan = std::unique_ptr<VPlan>(Plan.duplicate()); + Plan.setVF(*VFToOptimize); + NewPlan->removeVF(*VFToOptimize); + } // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; @@ -4256,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); VPBuilder PHBuilder(Plan.getVectorPreheader()); - VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); - if (VF.isScalable()) { + VPValue *UF = &Plan.getSymbolicUF(); + if (VFToOptimize->isScalable()) { VPValue *VScale = PHBuilder.createElementCount( CanIV->getScalarType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); @@ -4270,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); } removeDeadRecipes(Plan); + assert(none_of(*VectorLoop->getEntryBasicBlock(), + IsaPred<VPVectorPointerRecipe>) && + "All VPVectorPointerRecipes should have been removed"); + return NewPlan; } /// Add branch weight metadata, if the \p Plan's middle block is terminated by a diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b28559b..ca8d956 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -341,14 +341,20 @@ struct VPlanTransforms { static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan, ScalarEvolution &SE); - /// Try to convert a plan with interleave groups with VF elements to a plan - /// with the interleave groups replaced by wide loads and stores processing VF - /// elements, if all transformed interleave groups access the full vector - /// width (checked via \o VectorRegWidth). This effectively is a very simple - /// form of loop-aware SLP, where we use interleave groups to identify - /// candidates. - static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth); + /// Try to find a single VF among \p Plan's VFs for which all interleave + /// groups (with known minimum VF elements) can be replaced by wide loads and + /// stores processing VF elements, if all transformed interleave groups access + /// the full vector width (checked via the maximum vector register width). If + /// the transformation can be applied, the original \p Plan will be split in + /// 2: + /// 1. The original Plan with the single VF containing the optimized recipes + /// using wide loads instead of interleave groups. + /// 2. A new clone which contains all VFs of Plan except the optimized VF. + /// + /// This effectively is a very simple form of loop-aware SLP, where we use + /// interleave groups to identify candidates. + static std::unique_ptr<VPlan> + narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI); /// Predicate and linearize the control-flow in the only loop region of /// \p Plan. If \p FoldTail is true, create a mask guarding the loop |
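To summarize the new contract documented above: when a profitable VF is found and the plan carries more than one VF, the plan is duplicated, the original keeps only the optimized VF, and the clone (with that VF removed) is returned to the caller, which adds it to the candidate list (see the LoopVectorize.cpp hunk earlier in this patch). A toy, LLVM-free sketch of that ownership scheme; Plan and splitOutFactor here are invented names standing in for VPlan and narrowInterleaveGroups:

    #include <cstdio>
    #include <memory>
    #include <set>

    // Toy stand-in for a VPlan that just tracks its candidate factors.
    struct Plan {
      std::set<unsigned> Factors;
      std::unique_ptr<Plan> clone() const { return std::make_unique<Plan>(*this); }
    };

    // If the transform applies to Factor, keep it as the only factor of the
    // original plan and return a clone holding the remaining factors
    // (nullptr when nothing needs to be split off).
    static std::unique_ptr<Plan> splitOutFactor(Plan &P, unsigned Factor) {
      if (!P.Factors.count(Factor))
        return nullptr;
      std::unique_ptr<Plan> Rest;
      if (P.Factors.size() > 1) {
        Rest = P.clone();
        Rest->Factors.erase(Factor);
        P.Factors = {Factor};
      }
      return Rest;
    }

    int main() {
      Plan P{{2, 4, 8}};
      if (auto Rest = splitOutFactor(P, 4))
        std::printf("original keeps 1 factor, clone keeps %zu\n",
                    Rest->Factors.size()); // clone keeps 2
      return 0;
    }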