Diffstat (limited to 'llvm/lib')
95 files changed, 2667 insertions, 1441 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 805b682..853bd66 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -122,11 +122,61 @@ static cl::opt<unsigned> MIVMaxLevelThreshold( cl::desc("Maximum depth allowed for the recursive algorithm used to " "explore MIV direction vectors.")); -static cl::opt<bool> RunSIVRoutinesOnly( - "da-run-siv-routines-only", cl::init(false), cl::ReallyHidden, - cl::desc("Run only SIV routines and disable others (ZIV, RDIV, and MIV). " - "The purpose is mainly to exclude the influence of those routines " - "in regression tests for SIV routines.")); +namespace { + +/// Types of dependence test routines. +enum class DependenceTestType { + All, + StrongSIV, + WeakCrossingSIV, + ExactSIV, + WeakZeroSIV, + ExactRDIV, + SymbolicRDIV, + GCDMIV, + BanerjeeMIV, +}; + +} // anonymous namespace + +static cl::opt<DependenceTestType> EnableDependenceTest( + "da-enable-dependence-test", cl::init(DependenceTestType::All), + cl::ReallyHidden, + cl::desc("Run only the specified dependence test routine and disable others. " + "The purpose is mainly to exclude the influence of other " + "dependence test routines in regression tests. If set to All, all " + "dependence test routines are enabled."), + cl::values(clEnumValN(DependenceTestType::All, "all", + "Enable all dependence test routines."), + clEnumValN(DependenceTestType::StrongSIV, "strong-siv", + "Enable only Strong SIV test."), + clEnumValN(DependenceTestType::WeakCrossingSIV, + "weak-crossing-siv", + "Enable only Weak-Crossing SIV test."), + clEnumValN(DependenceTestType::ExactSIV, "exact-siv", + "Enable only Exact SIV test."), + clEnumValN(DependenceTestType::WeakZeroSIV, "weak-zero-siv", + "Enable only Weak-Zero SIV test."), + clEnumValN(DependenceTestType::ExactRDIV, "exact-rdiv", + "Enable only Exact RDIV test."), + clEnumValN(DependenceTestType::SymbolicRDIV, "symbolic-rdiv", + "Enable only Symbolic RDIV test."), + clEnumValN(DependenceTestType::GCDMIV, "gcd-miv", + "Enable only GCD MIV test."), + clEnumValN(DependenceTestType::BanerjeeMIV, "banerjee-miv", + "Enable only Banerjee MIV test."))); + +// TODO: This flag is disabled by default because it is still under development. +// Enable it or delete this flag when the feature is ready. +static cl::opt<bool> EnableMonotonicityCheck( + "da-enable-monotonicity-check", cl::init(false), cl::Hidden, + cl::desc("Check if the subscripts are monotonic. If they are not, the " + "dependence is reported as unknown.")); + +static cl::opt<bool> DumpMonotonicityReport( + "da-dump-monotonicity-report", cl::init(false), cl::Hidden, + cl::desc( + "When printing analysis, dump the results of monotonicity checks.")); //===----------------------------------------------------------------------===// // basics @@ -177,13 +227,196 @@ void DependenceAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive<LoopInfoWrapperPass>(); } +namespace { + +/// The property of monotonicity of a SCEV. To define monotonicity, assume +/// a SCEV defined within N nested loops. Let i_k denote the iteration number +/// of the k-th loop. Then we can regard the SCEV as an N-ary function: +/// +/// F(i_1, i_2, ..., i_N) +/// +/// The domain of i_k is the closed range [0, BTC_k], where BTC_k is the +/// backedge-taken count of the k-th loop. 
+/// +/// A function F is said to be "monotonically increasing with respect to the +/// k-th loop" if x <= y implies the following condition: +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) <= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// where i_1, ..., i_{k-1}, i_{k+1}, ..., i_N, x, and y are elements of their +/// respective domains. +/// +/// Likewise F is "monotonically decreasing with respect to the k-th loop" +/// if x <= y implies +/// +/// F(i_1, ..., i_{k-1}, x, i_{k+1}, ..., i_N) >= +/// F(i_1, ..., i_{k-1}, y, i_{k+1}, ..., i_N) +/// +/// A function F that is monotonically increasing or decreasing with respect to +/// the k-th loop is simply called "monotonic with respect to the k-th loop". +/// +/// A function F is said to be "multivariate monotonic" when it is monotonic +/// with respect to all of the N loops. +/// +/// Since integer comparison can be either signed or unsigned, we need to +/// distinguish monotonicity in the signed sense from that in the unsigned +/// sense. Note that the inequality "x <= y" merely indicates loop progression +/// and is not affected by the difference between signed and unsigned order. +/// +/// Currently we only consider monotonicity in a signed sense. enum class SCEVMonotonicityType { + /// We don't know anything about the monotonicity of the SCEV. + Unknown, + + /// The SCEV is loop-invariant with respect to the outermost loop. In other + /// words, the function F corresponding to the SCEV is a constant function. + Invariant, + + /// The function F corresponding to the SCEV is multivariate monotonic in a + /// signed sense. Note that a multivariate monotonic function may also be a + /// constant function, since the order employed in the definition of + /// monotonicity is not a strict order. + MultivariateSignedMonotonic, +}; + +struct SCEVMonotonicity { + SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint = nullptr); + + SCEVMonotonicityType getType() const { return Type; } + + const SCEV *getFailurePoint() const { return FailurePoint; } + + bool isUnknown() const { return Type == SCEVMonotonicityType::Unknown; } + + void print(raw_ostream &OS, unsigned Depth) const; + +private: + SCEVMonotonicityType Type; + + /// The subexpression that caused Unknown. Mainly for debugging purposes. + const SCEV *FailurePoint; +}; + +/// Check the monotonicity of a SCEV. Since dependence tests (SIV, MIV, etc.) +/// assume that subscript expressions are (multivariate) monotonic, we need to +/// verify this property before applying those tests. Violating this assumption +/// may cause them to produce incorrect results. +struct SCEVMonotonicityChecker + : public SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity> { + + SCEVMonotonicityChecker(ScalarEvolution *SE) : SE(SE) {} + + /// Check the monotonicity of \p Expr. \p Expr must be of integer type. If \p + /// OutermostLoop is not null, \p Expr must be defined in \p OutermostLoop or + /// one of its nested loops. + SCEVMonotonicity checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop); + +private: + ScalarEvolution *SE; + + /// The outermost loop that DA is analyzing. + const Loop *OutermostLoop; + + /// A helper to classify \p Expr as either Invariant or Unknown. + SCEVMonotonicity invariantOrUnknown(const SCEV *Expr); + + /// Return true if \p Expr is loop-invariant with respect to the outermost + /// loop. + bool isLoopInvariant(const SCEV *Expr) const; + + /// A helper to create an Unknown SCEVMonotonicity. 
+ SCEVMonotonicity createUnknown(const SCEV *FailurePoint) { + return SCEVMonotonicity(SCEVMonotonicityType::Unknown, FailurePoint); + } + + SCEVMonotonicity visitAddRecExpr(const SCEVAddRecExpr *Expr); + + SCEVMonotonicity visitConstant(const SCEVConstant *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + SCEVMonotonicity visitVScale(const SCEVVScale *) { + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + } + + // TODO: Handle more cases. + SCEVMonotonicity visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitAddExpr(const SCEVAddExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitMulExpr(const SCEVMulExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitTruncateExpr(const SCEVTruncateExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUDivExpr(const SCEVUDivExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMaxExpr(const SCEVSMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMaxExpr(const SCEVUMaxExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSMinExpr(const SCEVSMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUMinExpr(const SCEVUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitUnknown(const SCEVUnknown *Expr) { + return invariantOrUnknown(Expr); + } + SCEVMonotonicity visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { + return invariantOrUnknown(Expr); + } + + friend struct SCEVVisitor<SCEVMonotonicityChecker, SCEVMonotonicity>; +}; + +} // anonymous namespace + // Used to test the dependence analyzer. // Looks through the function, noting instructions that may access memory. // Calls depends() on every possible pair and prints out the result. // Ignores all other instructions. 
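To make the monotonicity definition above concrete, here is a small self-contained sketch (plain C++, illustrative only and not part of the patch; the function F and the bounds are made-up stand-ins) that brute-forces the property for a two-loop affine function F(i_1, i_2) = X + A*i_1 + B*i_2 over the closed ranges [0, BTC_k]:

#include <cassert>
#include <cstdint>

// F models a SCEV over two nested loops: F(i1, i2) = X + A*i1 + B*i2.
static int64_t F(int64_t X, int64_t A, int64_t B, int64_t i1, int64_t i2) {
  return X + A * i1 + B * i2;
}

// Check "monotonic with respect to the 1st loop" by the definition above:
// for every fixed i2, x <= y must imply F(x, i2) <= F(y, i2) (increasing)
// or F(x, i2) >= F(y, i2) (decreasing). BTC1/BTC2 play the role of the
// backedge-taken counts.
static bool isMonotonicInLoop1(int64_t X, int64_t A, int64_t B, int64_t BTC1,
                               int64_t BTC2) {
  bool NonDecreasing = true, NonIncreasing = true;
  for (int64_t i2 = 0; i2 <= BTC2; ++i2)
    for (int64_t x = 0; x <= BTC1; ++x)
      for (int64_t y = x; y <= BTC1; ++y) {
        if (F(X, A, B, x, i2) > F(X, A, B, y, i2))
          NonDecreasing = false;
        if (F(X, A, B, x, i2) < F(X, A, B, y, i2))
          NonIncreasing = false;
      }
  return NonDecreasing || NonIncreasing;
}

int main() {
  // An affine function with loop-invariant steps is monotonic in each loop,
  // regardless of the signs of the steps.
  assert(isMonotonicInLoop1(/*X=*/7, /*A=*/3, /*B=*/-2, /*BTC1=*/8, /*BTC2=*/8));
  return 0;
}

This exhaustive check is exactly the intuition that the SCEVMonotonicityChecker formalizes symbolically; the dump routine described next exercises those checks over real load/store subscripts.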
static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, - ScalarEvolution &SE, bool NormalizeResults) { + ScalarEvolution &SE, LoopInfo &LI, + bool NormalizeResults) { auto *F = DA->getFunction(); + + if (DumpMonotonicityReport) { + SCEVMonotonicityChecker Checker(&SE); + OS << "Monotonicity check:\n"; + for (Instruction &Inst : instructions(F)) { + if (!isa<LoadInst>(Inst) && !isa<StoreInst>(Inst)) + continue; + Value *Ptr = getLoadStorePointerOperand(&Inst); + const Loop *L = LI.getLoopFor(Inst.getParent()); + const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); + const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + OS.indent(2) << "Inst: " << Inst << "\n"; + OS.indent(4) << "Expr: " << *AccessFn << "\n"; + Mon.print(OS, 4); + } + OS << "\n"; + } + for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { @@ -235,7 +468,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { dumpExampleDependence( - OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), false); } PreservedAnalyses @@ -244,7 +478,7 @@ DependenceAnalysisPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) { << "':\n"; dumpExampleDependence(OS, &FAM.getResult<DependenceAnalysis>(F), FAM.getResult<ScalarEvolutionAnalysis>(F), - NormalizeResults); + FAM.getResult<LoopAnalysis>(F), NormalizeResults); return PreservedAnalyses::all(); } @@ -671,6 +905,81 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { } //===----------------------------------------------------------------------===// +// SCEVMonotonicity + +SCEVMonotonicity::SCEVMonotonicity(SCEVMonotonicityType Type, + const SCEV *FailurePoint) + : Type(Type), FailurePoint(FailurePoint) { + assert( + ((Type == SCEVMonotonicityType::Unknown) == (FailurePoint != nullptr)) && + "FailurePoint must be provided iff Type is Unknown"); +} + +void SCEVMonotonicity::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Monotonicity: "; + switch (Type) { + case SCEVMonotonicityType::Unknown: + assert(FailurePoint && "FailurePoint must be provided for Unknown"); + OS << "Unknown\n"; + OS.indent(Depth) << "Reason: " << *FailurePoint << "\n"; + break; + case SCEVMonotonicityType::Invariant: + OS << "Invariant\n"; + break; + case SCEVMonotonicityType::MultivariateSignedMonotonic: + OS << "MultivariateSignedMonotonic\n"; + break; + } +} + +bool SCEVMonotonicityChecker::isLoopInvariant(const SCEV *Expr) const { + return !OutermostLoop || SE->isLoopInvariant(Expr, OutermostLoop); +} + +SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { + if (isLoopInvariant(Expr)) + return SCEVMonotonicity(SCEVMonotonicityType::Invariant); + return createUnknown(Expr); +} + +SCEVMonotonicity +SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, + const Loop *OutermostLoop) { + assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); + this->OutermostLoop = OutermostLoop; + return visit(Expr); +} + +/// We only care about an affine AddRec at the moment. For an affine AddRec, +/// the monotonicity can be inferred from its nowrap property. For example, let +/// X and Y be loop-invariant, and assume Y is non-negative. 
An AddRec +/// {X,+,Y}<nsw> implies: +/// +/// X <=s (X + Y) <=s ((X + Y) + Y) <=s ... +/// +/// Thus, we can conclude that the AddRec is monotonically increasing with +/// respect to the associated loop in a signed sense. Similar reasoning +/// applies when Y is non-positive, leading to a monotonically decreasing +/// AddRec. +SCEVMonotonicity +SCEVMonotonicityChecker::visitAddRecExpr(const SCEVAddRecExpr *Expr) { + if (!Expr->isAffine() || !Expr->hasNoSignedWrap()) + return createUnknown(Expr); + + const SCEV *Start = Expr->getStart(); + const SCEV *Step = Expr->getStepRecurrence(*SE); + + SCEVMonotonicity StartMon = visit(Start); + if (StartMon.isUnknown()) + return StartMon; + + if (!isLoopInvariant(Step)) + return createUnknown(Expr); + + return SCEVMonotonicity(SCEVMonotonicityType::MultivariateSignedMonotonic); +} + +//===----------------------------------------------------------------------===// // DependenceInfo methods // For debugging purposes. Dumps a dependence to OS. @@ -1273,6 +1582,13 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns true iff \p Test is enabled. +static bool isDependenceTestEnabled(DependenceTestType Test) { + if (EnableDependenceTest == DependenceTestType::All) + return true; + return EnableDependenceTest == Test; +} + // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1334,6 +1650,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::StrongSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tStrong SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff); LLVM_DEBUG(dbgs() << ", " << *Coeff->getType() << "\n"); @@ -1468,6 +1787,9 @@ bool DependenceInfo::weakCrossingSIVtest( const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint, const SCEV *&SplitIter) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakCrossingSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n"); LLVM_DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n"); LLVM_DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n"); @@ -1726,6 +2048,9 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::ExactSIV)) + return false; + LLVM_DEBUG(dbgs() << "\tExact SIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -1905,6 +2230,9 @@ bool DependenceInfo::weakZeroSrcSIVtest( const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to // the Src and Dst loops. If it isn't, then there's no need to // record a direction. 
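A quick way to see why visitAddRecExpr above insists on the <nsw> flag: once a signed wrap occurs, the chain X <=s (X + Y) <=s ... breaks. The sketch below (plain C++ emulating an i8 AddRec {X,+,Y} with X = 100, Y = 20; values are arbitrary and only for illustration) prints where the wrap destroys monotonicity:

#include <cstdint>
#include <cstdio>

int main() {
  // Emulate {X,+,Y} on i8: Val_{n+1} = Val_n + Y.
  int8_t Val = 100;
  for (int It = 0; It < 3; ++It) {
    int8_t Next = (int8_t)(Val + 20); // i8 addition; may wrap
    bool Wrapped = (Next < Val);      // a signed wrap breaks monotonicity
    printf("iter %d: %4d -> %4d %s\n", It, Val, Next,
           Wrapped ? "(signed wrap; not monotonic)" : "(still non-decreasing)");
    Val = Next;
  }
  // With <nsw>, the wrap above would be poison, so SCEV may assume it never
  // happens. That assumption is what licenses classifying an affine AddRec
  // with a loop-invariant step as MultivariateSignedMonotonic.
  return 0;
}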
@@ -2013,6 +2341,9 @@ bool DependenceInfo::weakZeroDstSIVtest( const SCEV *SrcCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level, FullDependence &Result, Constraint &NewConstraint) const { + if (!isDependenceTestEnabled(DependenceTestType::WeakZeroSIV)) + return false; + // For the WeakSIV test, it's possible the loop isn't common to the // Src and Dst loops. If it isn't, then there's no need to record a direction. LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n"); @@ -2096,8 +2427,9 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst, const Loop *SrcLoop, const Loop *DstLoop, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::ExactRDIV)) return false; + LLVM_DEBUG(dbgs() << "\tExact RDIV test\n"); LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n"); LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n"); @@ -2242,8 +2574,9 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, const SCEV *C1, const SCEV *C2, const Loop *Loop1, const Loop *Loop2) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::SymbolicRDIV)) return false; + ++SymbolicRDIVapplications; LLVM_DEBUG(dbgs() << "\ttry symbolic RDIV test\n"); LLVM_DEBUG(dbgs() << "\t A1 = " << *A1); @@ -2557,8 +2890,9 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, // to "a common divisor". bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::GCDMIV)) return false; + LLVM_DEBUG(dbgs() << "starting gcd\n"); ++GCDapplications; unsigned BitWidth = SE->getTypeSizeInBits(Src->getType()); @@ -2725,8 +3059,9 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, const SmallBitVector &Loops, FullDependence &Result) const { - if (RunSIVRoutinesOnly) + if (!isDependenceTestEnabled(DependenceTestType::BanerjeeMIV)) return false; + LLVM_DEBUG(dbgs() << "starting Banerjee\n"); ++BanerjeeApplications; LLVM_DEBUG(dbgs() << " Src = " << *Src << '\n'); @@ -3488,10 +3823,19 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, // resize Pair to contain as many pairs of subscripts as the delinearization // has found, and then initialize the pairs following the delinearization. Pair.resize(Size); + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? SrcLoop->getOutermostLoop() : nullptr; for (int I = 0; I < Size; ++I) { Pair[I].Src = SrcSubscripts[I]; Pair[I].Dst = DstSubscripts[I]; unifySubscriptType(&Pair[I]); + + if (EnableMonotonicityCheck) { + if (MonChecker.checkMonotonicity(Pair[I].Src, OutermostLoop).isUnknown()) + return false; + if (MonChecker.checkMonotonicity(Pair[I].Dst, OutermostLoop).isUnknown()) + return false; + } } return true; @@ -3824,6 +4168,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[0].Src = SrcEv; Pair[0].Dst = DstEv; + SCEVMonotonicityChecker MonChecker(SE); + const Loop *OutermostLoop = SrcLoop ? 
SrcLoop->getOutermostLoop() : nullptr; + if (EnableMonotonicityCheck) + if (MonChecker.checkMonotonicity(Pair[0].Src, OutermostLoop).isUnknown() || + MonChecker.checkMonotonicity(Pair[0].Dst, OutermostLoop).isUnknown()) + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); + if (Delinearize) { if (tryDelinearize(Src, Dst, Pair)) { LLVM_DEBUG(dbgs() << " delinearized\n"); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index dc813f6..b573023 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4866,6 +4866,89 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, return nullptr; } +/// Look for the following pattern and simplify %to_fold to %identicalPhi. +/// Here %phi, %to_fold, and %phi.next compute the same values as +/// %identicalPhi, and hence the select instruction %to_fold can be folded +/// into %identicalPhi. +/// +/// BB1: +/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ] +/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ] +/// ... +/// %identicalPhi.next = select %cmp, %val, %identicalPhi +/// (or select %cmp, %identicalPhi, %val) +/// %to_fold = select %cmp2, %identicalPhi, %phi +/// %phi.next = select %cmp, %val, %to_fold +/// (or select %cmp, %to_fold, %val) +/// +/// Prove that %phi and %identicalPhi are the same by induction: +/// +/// Base case: Both %phi and %identicalPhi are equal on entry to the loop. +/// Inductive case: +/// Suppose %phi and %identicalPhi are equal at iteration i. +/// We look at their values at iteration i+1, which are %phi.next and +/// %identicalPhi.next. They could become different only if %cmp is +/// false and the corresponding values %to_fold and %identicalPhi differ +/// (similar reasoning applies to the parenthesized "or" forms). +/// +/// The only case in which %to_fold and %identicalPhi could differ is when +/// %cmp2 is false and %to_fold is %phi; but then %phi would have to differ +/// from %identicalPhi, contradicting our inductive hypothesis that %phi and +/// %identicalPhi are equal. Thus %phi and %identicalPhi are always equal at +/// iteration i+1. +bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) { + if (PN.getParent() != IdenticalPN.getParent()) + return false; + if (PN.getNumIncomingValues() != 2) + return false; + + // Check that only the backedge incoming value is different. + unsigned DiffVals = 0; + BasicBlock *DiffValBB = nullptr; + for (unsigned i = 0; i < 2; i++) { + BasicBlock *PredBB = PN.getIncomingBlock(i); + if (PN.getIncomingValueForBlock(PredBB) != + IdenticalPN.getIncomingValueForBlock(PredBB)) { + DiffVals++; + DiffValBB = PredBB; + } + } + if (DiffVals != 1) + return false; + // Now check that the backedge incoming values are two select + // instructions with the same condition. Either their true + // values are the same, or their false values are the same. 
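The induction argument above can also be sanity-checked by direct simulation. Below is a standalone sketch (plain C++; the predicates and values are random stand-ins for %cmp, %cmp2, and %val, and the variable names are hypothetical) that runs the select network from the comment and asserts %phi == %identicalPhi on every iteration:

#include <cassert>
#include <cstdlib>

int main() {
  // Entry values: both PHIs start at X (base case of the induction).
  int X = 42;
  int IdenticalPhi = X, Phi = X;

  for (int It = 0; It < 1000; ++It) {
    bool Cmp = std::rand() & 1;  // %cmp  (arbitrary predicate)
    bool Cmp2 = std::rand() & 1; // %cmp2 (arbitrary predicate)
    int Val = std::rand() % 100; // %val

    int IdenticalPhiNext = Cmp ? Val : IdenticalPhi; // %identicalPhi.next
    int ToFold = Cmp2 ? IdenticalPhi : Phi;          // %to_fold
    int PhiNext = Cmp ? Val : ToFold;                // %phi.next

    IdenticalPhi = IdenticalPhiNext;
    Phi = PhiNext;
    assert(Phi == IdenticalPhi && "inductive step holds");
  }
  return 0;
}

Note how the inductive step shows up in the code: whenever Phi == IdenticalPhi at the top of the loop, ToFold collapses to IdenticalPhi regardless of Cmp2, so PhiNext and IdenticalPhiNext are computed identically.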
+ auto *SI = dyn_cast<SelectInst>(PN.getIncomingValueForBlock(DiffValBB)); + auto *IdenticalSI = + dyn_cast<SelectInst>(IdenticalPN.getIncomingValueForBlock(DiffValBB)); + if (!SI || !IdenticalSI) + return false; + if (SI->getCondition() != IdenticalSI->getCondition()) + return false; + + SelectInst *SIOtherVal = nullptr; + Value *IdenticalSIOtherVal = nullptr; + if (SI->getTrueValue() == IdenticalSI->getTrueValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getFalseValue()); + IdenticalSIOtherVal = IdenticalSI->getFalseValue(); + } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getTrueValue()); + IdenticalSIOtherVal = IdenticalSI->getTrueValue(); + } else { + return false; + } + + // Now check that the other values in select, i.e., %to_fold and + // %identicalPhi, are essentially the same value. + if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN) + return false; + if (!(SIOtherVal->getTrueValue() == &IdenticalPN && + SIOtherVal->getFalseValue() == &PN) && + !(SIOtherVal->getTrueValue() == &PN && + SIOtherVal->getFalseValue() == &IdenticalPN)) + return false; + return true; +} + /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -5041,7 +5124,14 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, std::optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; - + // Look for same PHIs in the true and false values. + if (auto *TruePHI = dyn_cast<PHINode>(TrueVal)) + if (auto *FalsePHI = dyn_cast<PHINode>(FalseVal)) { + if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI)) + return FalseVal; + if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI)) + return TrueVal; + } return nullptr; } @@ -5106,32 +5196,33 @@ static Value *simplifyGEPInst(Type *SrcTy, Value *Ptr, return Ptr; // The following transforms are only safe if the ptrtoint cast - // doesn't truncate the pointers. - if (Indices[0]->getType()->getScalarSizeInBits() == - Q.DL.getPointerSizeInBits(AS)) { + // doesn't truncate the address of the pointers. The non-address bits + // must be the same, as the underlying objects are the same. + if (Indices[0]->getType()->getScalarSizeInBits() >= + Q.DL.getAddressSizeInBits(AS)) { auto CanSimplify = [GEPTy, &P, Ptr]() -> bool { return P->getType() == GEPTy && getUnderlyingObject(P) == getUnderlyingObject(Ptr); }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && - match(Indices[0], - m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) && + match(Indices[0], m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr)))) && CanSimplify()) return P; // getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of // size 1 << C. - if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_AShr(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C && CanSimplify()) return P; // getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of // size C. 
- if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ptr))), + if (match(Indices[0], m_SDiv(m_Sub(m_PtrToIntOrAddr(m_Value(P)), + m_PtrToIntOrAddr(m_Specific(Ptr))), m_SpecificInt(TyAllocSize))) && CanSimplify()) return P; diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a60a4bb..6cdf51a 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -1100,12 +1100,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (auto &GlobalList : Index) { // Ignore entries for references that are undefined in the current module. - if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected module's index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; if (!IsThinLTO) { Summary->setNotEligibleToImport(); continue; diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 5e94e0b..5e92ca1 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -1136,7 +1136,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { if (!AreStatisticsEnabled()) return; for (auto &GVS : Index) - for (auto &GV : GVS.second.SummaryList) + for (auto &GV : GVS.second.getSummaryList()) if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get())) Stat += FS->paramAccesses().size(); }; @@ -1147,7 +1147,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { // Convert the ModuleSummaryIndex to a FunctionMap for (auto &GVS : Index) { - for (auto &GV : GVS.second.SummaryList) { + for (auto &GV : GVS.second.getSummaryList()) { FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get()); if (!FS || FS->paramAccesses().empty()) continue; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index cf63285..f71a534 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -451,6 +451,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { UpgradeModuleFlags(*M); UpgradeNVVMAnnotations(*M); UpgradeSectionAttributes(*M); + copyModuleAttrToFunctions(*M); if (!Slots) return false; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index aaee1f0..cf7efbfa 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7143,6 +7143,8 @@ Error BitcodeReader::materializeModule() { UpgradeARCRuntime(*TheModule); + copyModuleAttrToFunctions(*TheModule); + return Error::success(); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 8ff3aa9..61aa7c2f5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -235,7 +235,7 @@ public: return; for (const auto &GUIDSummaryLists : *Index) // Examine all summaries for this GUID. 
- for (auto &Summary : GUIDSummaryLists.second.SummaryList) + for (auto &Summary : GUIDSummaryLists.second.getSummaryList()) if (auto FS = dyn_cast<FunctionSummary>(Summary.get())) { // For each call in the function summary, see if the call // is to a GUID (which means it is for an indirect call, @@ -587,7 +587,7 @@ public: } } else { for (auto &Summaries : Index) - for (auto &Summary : Summaries.second.SummaryList) + for (auto &Summary : Summaries.second.getSummaryList()) Callback({Summaries.first, Summary.get()}, false); } } diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 72bb98c..64cbe9d 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -836,6 +836,7 @@ uint64_t DataRecordHandle::getDataSize() const { case DataSizeFlags::Uses8B: return support::endian::read64le(DataSizePtr); } + llvm_unreachable("Unknown DataSizeFlags enum"); } void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const { @@ -863,6 +864,7 @@ uint32_t DataRecordHandle::getNumRefs() const { case NumRefsFlags::Uses8B: return support::endian::read64le(NumRefsPtr); } + llvm_unreachable("Unknown NumRefsFlags enum"); } void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const { @@ -1270,6 +1272,7 @@ Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) { return FaultInResult.takeError(); return true; } + llvm_unreachable("Unknown ObjectPresence enum"); } Expected<OnDiskGraphDB::ObjectPresence> diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e2af0c5..fefde64f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1438,6 +1438,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, BBFreqEnabled, BrProbEnabled, MF.hasBBSections() && NumMBBSectionRanges > 1, + // Use static_cast to avoid breakage of tests on Windows. static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, false}; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index f0f0861..c7d45897 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -566,32 +566,54 @@ bool DwarfExpression::addExpression( case dwarf::DW_OP_LLVM_extract_bits_zext: { unsigned SizeInBits = Op->getArg(1); unsigned BitOffset = Op->getArg(0); + unsigned DerefSize = 0; + // Operations are done in the DWARF "generic type" whose size + // is the size of a pointer. + unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); // If we have a memory location then dereference to get the value, though // we have to make sure we don't dereference any bytes past the end of the // object. if (isMemoryLocation()) { - emitOp(dwarf::DW_OP_deref_size); - emitUnsigned(alignTo(BitOffset + SizeInBits, 8) / 8); + DerefSize = alignTo(BitOffset + SizeInBits, 8) / 8; + if (DerefSize == PtrSizeInBytes) { + emitOp(dwarf::DW_OP_deref); + } else { + emitOp(dwarf::DW_OP_deref_size); + emitUnsigned(DerefSize); + } } - // Extract the bits by a shift left (to shift out the bits after what we - // want to extract) followed by shift right (to shift the bits to position - // 0 and also sign/zero extend). These operations are done in the DWARF - // "generic type" whose size is the size of a pointer. 
- unsigned PtrSizeInBytes = CU.getAsmPrinter()->MAI->getCodePointerSize(); - unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); - unsigned RightShift = LeftShift + BitOffset; - if (LeftShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(LeftShift); - emitOp(dwarf::DW_OP_shl); - } - if (RightShift) { - emitOp(dwarf::DW_OP_constu); - emitUnsigned(RightShift); - emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra - : dwarf::DW_OP_shr); + // If a dereference was emitted for an unsigned value, and + // there's no bit offset, then a bit of optimization is + // possible. + if (OpNum == dwarf::DW_OP_LLVM_extract_bits_zext && BitOffset == 0) { + if (8 * DerefSize == SizeInBits) { + // The correct value is already on the stack. + } else { + // No need to shift, we can just mask off the desired bits. + emitOp(dwarf::DW_OP_constu); + emitUnsigned((1u << SizeInBits) - 1); + emitOp(dwarf::DW_OP_and); + } + } else { + // Extract the bits by a shift left (to shift out the bits after what we + // want to extract) followed by shift right (to shift the bits to + // position 0 and also sign/zero extend). + unsigned LeftShift = PtrSizeInBytes * 8 - (SizeInBits + BitOffset); + unsigned RightShift = LeftShift + BitOffset; + if (LeftShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(LeftShift); + emitOp(dwarf::DW_OP_shl); + } + if (RightShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(RightShift); + emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext + ? dwarf::DW_OP_shra + : dwarf::DW_OP_shr); + } } // The value is now at the top of the stack, so set the location to diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index c438eae..9795a0b 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -98,6 +98,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineUniformityAnalysisPassPass(Registry); initializeMIR2VecVocabLegacyAnalysisPass(Registry); initializeMIR2VecVocabPrinterLegacyPassPass(Registry); + initializeMIR2VecPrinterLegacyPassPass(Registry); initializeMachineUniformityInfoPrinterPassPass(Registry); initializeMachineVerifierLegacyPassPass(Registry); initializeObjCARCContractLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 0522698..c1365f4 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -64,7 +64,6 @@ CGOPT_EXP(uint64_t, LargeDataThreshold) CGOPT(ExceptionHandling, ExceptionModel) CGOPT_EXP(CodeGenFileType, FileType) CGOPT(FramePointerKind, FramePointerUsage) -CGOPT(bool, EnableUnsafeFPMath) CGOPT(bool, EnableNoInfsFPMath) CGOPT(bool, EnableNoNaNsFPMath) CGOPT(bool, EnableNoSignedZerosFPMath) @@ -219,12 +218,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { "Enable frame pointer elimination"))); CGBINDOPT(FramePointerUsage); - static cl::opt<bool> EnableUnsafeFPMath( - "enable-unsafe-fp-math", - cl::desc("Enable optimizations that may decrease FP precision"), - cl::init(false)); - CGBINDOPT(EnableUnsafeFPMath); - static cl::opt<bool> EnableNoInfsFPMath( "enable-no-infs-fp-math", cl::desc("Enable FP math optimizations that assume no +-Infs"), @@ -552,7 +545,6 @@ TargetOptions codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { TargetOptions Options; Options.AllowFPOpFusion = getFuseFPOps(); - Options.UnsafeFPMath = getEnableUnsafeFPMath(); Options.NoInfsFPMath = getEnableNoInfsFPMath(); Options.NoNaNsFPMath = getEnableNoNaNsFPMath(); 
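Stepping back to the DwarfExpression.cpp change above: both the mask strategy (unsigned extract at bit offset 0) and the shl/shr(a) strategy reduce to ordinary integer arithmetic, so they can be cross-checked against a reference bitfield extract. A self-contained sketch (plain C++, assuming a 64-bit generic type; illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// Reference: extract SizeInBits bits starting at BitOffset, zero-extended.
static uint64_t RefExtractZExt(uint64_t V, unsigned BitOffset,
                               unsigned SizeInBits) {
  return (V >> BitOffset) &
         ((SizeInBits == 64) ? ~0ULL : ((1ULL << SizeInBits) - 1));
}

int main() {
  const unsigned GenericBits = 64; // DWARF generic type = pointer size here
  uint64_t V = 0xDEADBEEFCAFEF00DULL;

  // Strategy 1 (zext, BitOffset == 0): just mask off the desired bits.
  unsigned SizeInBits = 12;
  uint64_t Masked = V & ((1ULL << SizeInBits) - 1);
  assert(Masked == RefExtractZExt(V, 0, SizeInBits));

  // Strategy 2: shift left to drop the bits above the field, then shift
  // right (logical for zext) to move the field to position 0.
  unsigned BitOffset = 5;
  unsigned LeftShift = GenericBits - (SizeInBits + BitOffset);
  unsigned RightShift = LeftShift + BitOffset;
  uint64_t Shifted = (V << LeftShift) >> RightShift;
  assert(Shifted == RefExtractZExt(V, BitOffset, SizeInBits));
  return 0;
}

For the sext variant, the lowering uses an arithmetic right shift (DW_OP_shra) in strategy 2, which is why the masking shortcut is reserved for the zext case.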
Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath(); @@ -706,7 +698,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, if (getStackRealign()) NewAttrs.addAttribute("stackrealign"); - HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math"); HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math"); HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math"); HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index f80e1e8..3ac6d2a 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -1498,7 +1498,7 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) { // Before stepping forward past MI, remember which regs were live // before MI. This is needed to set the Undef flag only when reg is // dead. - SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI; + SparseSet<MCPhysReg, MCPhysReg> LiveBeforeMI; LiveBeforeMI.setUniverse(TRI->getNumRegs()); for (unsigned Reg : Redefs) LiveBeforeMI.insert(Reg); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 5c78d98..99be1fc0 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MIR2Vec.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Module.h" @@ -29,20 +30,30 @@ using namespace mir2vec; STATISTIC(MIRVocabMissCounter, "Number of lookups to MIR entities not present in the vocabulary"); -cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options"); +namespace llvm { +namespace mir2vec { +cl::OptionCategory MIR2VecCategory("MIR2Vec Options"); // FIXME: Use a default vocab when not specified static cl::opt<std::string> VocabFile("mir2vec-vocab-path", cl::Optional, cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""), cl::cat(MIR2VecCategory)); -cl::opt<float> - llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), - cl::desc("Weight for machine opcode embeddings"), - cl::cat(MIR2VecCategory)); +cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), + cl::desc("Weight for machine opcode embeddings"), + cl::cat(MIR2VecCategory)); +cl::opt<MIR2VecKind> MIR2VecEmbeddingKind( + "mir2vec-kind", cl::Optional, + cl::values(clEnumValN(MIR2VecKind::Symbolic, "symbolic", + "Generate symbolic embeddings for MIR")), + cl::init(MIR2VecKind::Symbolic), cl::desc("MIR2Vec embedding kind"), + cl::cat(MIR2VecCategory)); + +} // namespace mir2vec +} // namespace llvm //===----------------------------------------------------------------------===// -// Vocabulary Implementation +// Vocabulary //===----------------------------------------------------------------------===// MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, @@ -188,6 +199,28 @@ void MIRVocabulary::buildCanonicalOpcodeMapping() { << " unique base opcodes\n"); } +Expected<MIRVocabulary> +MIRVocabulary::createDummyVocabForTest(const TargetInstrInfo &TII, + unsigned Dim) { + assert(Dim > 0 && "Dimension must be greater than zero"); + + float DummyVal = 0.1f; + + // Create dummy embeddings for all canonical opcode names + VocabMap DummyVocabMap; + for (unsigned Opcode = 0; Opcode < TII.getNumOpcodes(); ++Opcode) { + std::string BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); + if 
(DummyVocabMap.count(BaseOpcode) == 0) { + // Only add if not already present + DummyVocabMap[BaseOpcode] = Embedding(Dim, DummyVal); + DummyVal += 0.1f; + } + } + + // Create and return vocabulary with dummy embeddings + return MIRVocabulary::create(std::move(DummyVocabMap), TII); +} + //===----------------------------------------------------------------------===// // MIR2VecVocabLegacyAnalysis Implementation //===----------------------------------------------------------------------===// @@ -258,7 +291,73 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { } //===----------------------------------------------------------------------===// -// Printer Passes Implementation +// MIREmbedder and its subclasses +//===----------------------------------------------------------------------===// + +std::unique_ptr<MIREmbedder> MIREmbedder::create(MIR2VecKind Mode, + const MachineFunction &MF, + const MIRVocabulary &Vocab) { + switch (Mode) { + case MIR2VecKind::Symbolic: + return std::make_unique<SymbolicMIREmbedder>(MF, Vocab); + } + return nullptr; +} + +Embedding MIREmbedder::computeEmbeddings(const MachineBasicBlock &MBB) const { + Embedding MBBVector(Dimension, 0); + + // Get instruction info for opcode name resolution + const auto &Subtarget = MF.getSubtarget(); + const auto *TII = Subtarget.getInstrInfo(); + if (!TII) { + MF.getFunction().getContext().emitError( + "MIR2Vec: No TargetInstrInfo available; cannot compute embeddings"); + return MBBVector; + } + + // Process each machine instruction in the basic block + for (const auto &MI : MBB) { + // Skip debug instructions and other metadata + if (MI.isDebugInstr()) + continue; + MBBVector += computeEmbeddings(MI); + } + + return MBBVector; +} + +Embedding MIREmbedder::computeEmbeddings() const { + Embedding MFuncVector(Dimension, 0); + + // Consider all reachable machine basic blocks in the function + for (const auto *MBB : depth_first(&MF)) + MFuncVector += computeEmbeddings(*MBB); + return MFuncVector; +} + +SymbolicMIREmbedder::SymbolicMIREmbedder(const MachineFunction &MF, + const MIRVocabulary &Vocab) + : MIREmbedder(MF, Vocab) {} + +std::unique_ptr<SymbolicMIREmbedder> +SymbolicMIREmbedder::create(const MachineFunction &MF, + const MIRVocabulary &Vocab) { + return std::make_unique<SymbolicMIREmbedder>(MF, Vocab); +} + +Embedding SymbolicMIREmbedder::computeEmbeddings(const MachineInstr &MI) const { + // Skip debug instructions and other metadata + if (MI.isDebugInstr()) + return Embedding(Dimension, 0); + + // Todo: Add operand/argument contributions + + return Vocab[MI.getOpcode()]; +} + +//===----------------------------------------------------------------------===// +// Printer Passes //===----------------------------------------------------------------------===// char MIR2VecVocabPrinterLegacyPass::ID = 0; @@ -297,3 +396,56 @@ MachineFunctionPass * llvm::createMIR2VecVocabPrinterLegacyPass(raw_ostream &OS) { return new MIR2VecVocabPrinterLegacyPass(OS); } + +char MIR2VecPrinterLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(MIR2VecPrinterLegacyPass, "print-mir2vec", + "MIR2Vec Embedder Printer Pass", false, true) +INITIALIZE_PASS_DEPENDENCY(MIR2VecVocabLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass) +INITIALIZE_PASS_END(MIR2VecPrinterLegacyPass, "print-mir2vec", + "MIR2Vec Embedder Printer Pass", false, true) + +bool MIR2VecPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) { + auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>(); + auto VocabOrErr = + 
Analysis.getMIR2VecVocabulary(*MF.getFunction().getParent()); + assert(VocabOrErr && "Failed to get MIR2Vec vocabulary"); + auto &MIRVocab = *VocabOrErr; + + auto Emb = mir2vec::MIREmbedder::create(MIR2VecEmbeddingKind, MF, MIRVocab); + if (!Emb) { + OS << "Error creating MIR2Vec embeddings for function " << MF.getName() + << "\n"; + return false; + } + + OS << "MIR2Vec embeddings for machine function " << MF.getName() << ":\n"; + OS << "Machine Function vector: "; + Emb->getMFunctionVector().print(OS); + + OS << "Machine basic block vectors:\n"; + for (const MachineBasicBlock &MBB : MF) { + OS << "Machine basic block: " << MBB.getFullName() << ":\n"; + Emb->getMBBVector(MBB).print(OS); + } + + OS << "Machine instruction vectors:\n"; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // Skip debug instructions as they are not + // embedded. + if (MI.isDebugInstr()) + continue; + + OS << "Machine instruction: "; + MI.print(OS); + Emb->getMInstVector(MI).print(OS); + } + } + + return false; +} + +MachineFunctionPass *llvm::createMIR2VecPrinterLegacyPass(raw_ostream &OS) { + return new MIR2VecPrinterLegacyPass(OS); +} diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp index d988a2a..e37f784 100644 --- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp +++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/MIRFSDiscriminatorOptions.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" @@ -35,13 +36,10 @@ using namespace sampleprofutil; // TODO(xur): Remove this option and related code once we make true the // default. 
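The embedding computation in MIR2Vec.cpp above is, at its core, a sum of per-instruction vectors rolled up per basic block and then per function. A toy version of that accumulation (plain C++; the opcode-to-vector table is a made-up stand-in for MIRVocabulary, and reachability and debug-instruction filtering are elided):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

using Embedding = std::vector<double>;

// Element-wise accumulate B into A (both must have the same dimension).
static void addInto(Embedding &A, const Embedding &B) {
  for (size_t I = 0; I < A.size(); ++I)
    A[I] += B[I];
}

int main() {
  const unsigned Dim = 2;
  // Stand-in vocabulary: base opcode name -> embedding.
  std::map<std::string, Embedding> Vocab = {
      {"MOV", {0.1, 0.2}}, {"ADD", {0.3, 0.1}}, {"RET", {0.0, 0.5}}};

  // Two "basic blocks" of opcodes.
  std::vector<std::vector<std::string>> Blocks = {{"MOV", "ADD"}, {"RET"}};

  Embedding FuncVec(Dim, 0.0);
  for (const auto &MBB : Blocks) {
    Embedding BlockVec(Dim, 0.0);
    for (const auto &Opc : MBB)
      addInto(BlockVec, Vocab.at(Opc)); // per-instruction lookup, as in Vocab[opcode]
    addInto(FuncVec, BlockVec);         // function vector = sum of block vectors
  }
  printf("function vector: [%g, %g]\n", FuncVec[0], FuncVec[1]);
  return 0;
}

The printer pass above reports exactly these three granularities: the function vector, one vector per machine basic block, and one vector per (non-debug) machine instruction.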
-namespace llvm { -cl::opt<bool> ImprovedFSDiscriminator( +cl::opt<bool> llvm::ImprovedFSDiscriminator( "improved-fs-discriminator", cl::Hidden, cl::init(false), cl::desc("New FS discriminators encoding (incompatible with the original " "encoding)")); -} // namespace llvm - char MIRAddFSDiscriminators::ID = 0; INITIALIZE_PASS(MIRAddFSDiscriminators, DEBUG_TYPE, diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index 9bba50e8..d44f577 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/CodeGen/MIRFSDiscriminatorOptions.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" @@ -62,9 +63,6 @@ static cl::opt<bool> ViewBFIAfter("fs-viewbfi-after", cl::Hidden, cl::init(false), cl::desc("View BFI after MIR loader")); -namespace llvm { -extern cl::opt<bool> ImprovedFSDiscriminator; -} char MIRProfileLoaderPass::ID = 0; INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE, diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 7acddff..729e73c 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -932,12 +932,11 @@ void MachineLICMImpl::InitRegPressure(MachineBasicBlock *BB) { void MachineLICMImpl::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); - for (const auto &RPIdAndCost : Cost) { - unsigned Class = RPIdAndCost.first; - if (static_cast<int>(RegPressure[Class]) < -RPIdAndCost.second) + for (const auto &[Class, Weight] : Cost) { + if (static_cast<int>(RegPressure[Class]) < -Weight) RegPressure[Class] = 0; else - RegPressure[Class] += RPIdAndCost.second; + RegPressure[Class] += Weight; } } @@ -1215,11 +1214,10 @@ bool MachineLICMImpl::IsCheapInstruction(MachineInstr &MI) const { /// given cost matrix can cause high register pressure. bool MachineLICMImpl::CanCauseHighRegPressure( const SmallDenseMap<unsigned, int> &Cost, bool CheapInstr) { - for (const auto &RPIdAndCost : Cost) { - if (RPIdAndCost.second <= 0) + for (const auto &[Class, Weight] : Cost) { + if (Weight <= 0) continue; - unsigned Class = RPIdAndCost.first; int Limit = RegLimit[Class]; // Don't hoist cheap instructions if they would increase register pressure, @@ -1228,7 +1226,7 @@ bool MachineLICMImpl::CanCauseHighRegPressure( return true; for (const auto &RP : BackTrace) - if (static_cast<int>(RP[Class]) + RPIdAndCost.second >= Limit) + if (static_cast<int>(RP[Class]) + Weight >= Limit) return true; } @@ -1246,8 +1244,8 @@ void MachineLICMImpl::UpdateBackTraceRegPressure(const MachineInstr *MI) { // Update register pressure of blocks from loop header to current block. 
for (auto &RP : BackTrace) - for (const auto &RPIdAndCost : Cost) - RP[RPIdAndCost.first] += RPIdAndCost.second; + for (const auto &[Class, Weight] : Cost) + RP[Class] += Weight; } /// Return true if it is potentially profitable to hoist the given loop diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 804480c..72b364c 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private: unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; - using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>; + using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6bf9008..310d35d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16433,7 +16433,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { case ISD::OR: case ISD::XOR: if (!LegalOperations && N0.hasOneUse() && - (isConstantOrConstantVector(N0.getOperand(0), true) || + (N0.getOperand(0) == N0.getOperand(1) || + isConstantOrConstantVector(N0.getOperand(0), true) || isConstantOrConstantVector(N0.getOperand(1), true))) { // TODO: We already restricted this to pre-legalization, but for vectors // we are extra cautious to not create an unsupported operation. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 437d0f4..bf1abfe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3765,6 +3765,8 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: case ISD::LRINT: case ISD::LLRINT: + case ISD::LROUND: + case ISD::LLROUND: Res = SoftPromoteHalfOp_Op0WithStrict(N); break; case ISD::FP_TO_SINT_SAT: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 88a4a8b..b1776ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -429,7 +429,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { - SDValue Op2 = GetPromotedInteger(N->getOperand(2)); + SDValue Op2 = N->getOperand(2); + switch (TLI.getExtendForAtomicRMWArg(N->getOpcode())) { + case ISD::SIGN_EXTEND: + Op2 = SExtPromotedInteger(Op2); + break; + case ISD::ZERO_EXTEND: + Op2 = ZExtPromotedInteger(Op2); + break; + case ISD::ANY_EXTEND: + Op2 = GetPromotedInteger(Op2); + break; + default: + llvm_unreachable("Invalid atomic op extension"); + } SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 5eb86e7..049efe8 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -51,7 +51,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. 
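Returning to PromoteIntRes_Atomic1 in LegalizeIntegerTypes.cpp above: the extension kind obtained from getExtendForAtomicRMWArg matters because comparison-based RMW operations are only preserved under the matching extension when a narrow operand is widened. A minimal arithmetic demonstration (plain C++, no atomics involved; a signed i8 min computed in 32 bits):

#include <cassert>
#include <cstdint>

int main() {
  int8_t Mem = 5, Arg = -1;

  // Correct promotion: sign-extend both i8 values, take the min in 32 bits,
  // truncate back. min(5, -1) == -1, matching the i8 semantics.
  int32_t SExtMin = (int32_t)Mem < (int32_t)Arg ? Mem : Arg;
  assert((int8_t)SExtMin == -1);

  // Wrong promotion: zero-extend instead; -1 becomes 255, so the 32-bit min
  // picks 5 and the narrow result no longer matches the i8 semantics.
  uint32_t ZMem = (uint8_t)Mem, ZArg = (uint8_t)Arg;
  uint32_t ZExtMin = ZMem < ZArg ? ZMem : ZArg;
  assert((int8_t)ZExtMin == 5);
  return 0;
}

This is why the promotion switches on the opcode rather than always using GetPromotedInteger: a signed min/max needs SIGN_EXTEND, an unsigned one ZERO_EXTEND, and only extension-agnostic operations can take ANY_EXTEND.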
bool TargetOptions::HonorSignDependentRoundingFPMath() const { - return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; + return HonorSignDependentRoundingFPMathOption; } /// NOTE: There are targets that still do not support the debug entry values diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 212a0c0..db5cc37 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -107,6 +107,28 @@ static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) { return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference(); } +static llvm::StringRef +prettyLanguageVersionString(const DWARFAttribute &AttrValue, + const DWARFDie &Die) { + if (AttrValue.Attr != DW_AT_language_version) + return {}; + + auto NameForm = Die.find(DW_AT_language_name); + if (!NameForm) + return {}; + + auto LName = NameForm->getAsUnsignedConstant(); + if (!LName) + return {}; + + auto LVersion = AttrValue.Value.getAsUnsignedConstant(); + if (!LVersion) + return {}; + + return llvm::dwarf::LanguageDescription( + static_cast<SourceLanguageName>(*LName), *LVersion); +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -146,15 +168,28 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, } else if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) Name = AttributeValueString(Attr, *Val); - if (!Name.empty()) - WithColor(OS, Color) << Name; - else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || - Attr == DW_AT_call_line || Attr == DW_AT_call_column || - Attr == DW_AT_language_version) { + auto DumpUnsignedConstant = [&OS, + &DumpOpts](const DWARFFormValue &FormValue) { if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) OS << *Val; else FormValue.dump(OS, DumpOpts); + }; + + llvm::StringRef PrettyVersionName = + prettyLanguageVersionString(AttrValue, Die); + bool ShouldDumpRawLanguageVersion = + Attr == DW_AT_language_version && + (DumpOpts.Verbose || PrettyVersionName.empty()); + + if (!Name.empty()) + WithColor(OS, Color) << Name; + else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || + Attr == DW_AT_call_line || Attr == DW_AT_call_column) { + DumpUnsignedConstant(FormValue); + } else if (Attr == DW_AT_language_version) { + if (ShouldDumpRawLanguageVersion) + DumpUnsignedConstant(FormValue); } else if (Attr == DW_AT_low_pc && (FormValue.getAsAddress() == dwarf::computeTombstoneAddress(U->getAddressByteSize()))) { @@ -226,6 +261,10 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DumpOpts.RecoverableErrorHandler(createStringError( errc::invalid_argument, "decoding address ranges: %s", toString(RangesOrError.takeError()).c_str())); + } else if (Attr == DW_AT_language_version) { + if (!PrettyVersionName.empty()) + WithColor(OS, Color) << (ShouldDumpRawLanguageVersion ? 
" " : "") + << PrettyVersionName; } OS << ")\n"; diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index f47b7ec..8d413a3 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1173,39 +1173,7 @@ void JITDylib::dump(raw_ostream &OS) { << " pending queries: { "; for (const auto &Q : KV.second.pendingQueries()) OS << Q.get() << " (" << Q->getRequiredState() << ") "; - OS << "}\n Defining EDU: "; - if (KV.second.DefiningEDU) { - OS << KV.second.DefiningEDU.get() << " { "; - for (auto &[Name, Flags] : KV.second.DefiningEDU->Symbols) - OS << Name << " "; - OS << "}\n"; - OS << " Dependencies:\n"; - if (!KV.second.DefiningEDU->Dependencies.empty()) { - for (auto &[DepJD, Deps] : KV.second.DefiningEDU->Dependencies) { - OS << " " << DepJD->getName() << ": [ "; - for (auto &Dep : Deps) - OS << Dep << " "; - OS << "]\n"; - } - } else - OS << " none\n"; - } else - OS << "none\n"; - OS << " Dependant EDUs:\n"; - if (!KV.second.DependantEDUs.empty()) { - for (auto &DependantEDU : KV.second.DependantEDUs) { - OS << " " << DependantEDU << ": " - << DependantEDU->JD->getName() << " { "; - for (auto &[Name, Flags] : DependantEDU->Symbols) - OS << Name << " "; - OS << "}\n"; - } - } else - OS << " none\n"; - assert((Symbols[KV.first].getState() != SymbolState::Ready || - (KV.second.pendingQueries().empty() && !KV.second.DefiningEDU && - !KV.second.DependantEDUs.empty())) && - "Stale materializing info entry"); + OS << "}\n"; } }); } @@ -1967,9 +1935,6 @@ bool ExecutionSession::verifySessionState(Twine Phase) { return runSessionLocked([&]() { bool AllOk = true; - // We'll collect these and verify them later to avoid redundant checks. - DenseSet<JITDylib::EmissionDepUnit *> EDUsToCheck; - for (auto &JD : JDs) { auto LogFailure = [&]() -> raw_fd_ostream & { @@ -2063,86 +2028,6 @@ bool ExecutionSession::verifySessionState(Twine Phase) { << " has stale or misordered queries.\n"; } } - - // If there's a DefiningEDU then check that... - // 1. The JD matches. - // 2. The symbol is in the EDU's Symbols map. - // 3. The symbol table entry is in the Emitted state. - if (MII.DefiningEDU) { - - EDUsToCheck.insert(MII.DefiningEDU.get()); - - if (MII.DefiningEDU->JD != JD.get()) { - LogFailure() << "symbol " << Sym - << " has DefiningEDU with incorrect JD" - << (llvm::is_contained(JDs, MII.DefiningEDU->JD) - ? " (JD not currently in ExecutionSession" - : "") - << "\n"; - } - - if (SymItr->second.getState() != SymbolState::Emitted) { - LogFailure() - << "symbol " << Sym - << " has DefiningEDU, but is not in Emitted state.\n"; - } - } - - // Check that JDs for any DependantEDUs are also in the session -- - // that guarantees that we'll also visit them during this loop. - for (auto &DepEDU : MII.DependantEDUs) { - if (!llvm::is_contained(JDs, DepEDU->JD)) { - LogFailure() << "symbol " << Sym << " has DependantEDU " - << (void *)DepEDU << " with JD (" << DepEDU->JD - << ") that isn't in ExecutionSession.\n"; - } - } - } - } - } - - // Check EDUs. 
- for (auto *EDU : EDUsToCheck) { - assert(EDU->JD->State == JITDylib::Open && "EDU->JD is not Open"); - - auto LogFailure = [&]() -> raw_fd_ostream & { - AllOk = false; - auto &Stream = errs(); - Stream << "In EDU defining " << EDU->JD->getName() << ": { "; - for (auto &[Sym, Flags] : EDU->Symbols) - Stream << Sym << " "; - Stream << "}, "; - return Stream; - }; - - if (EDU->Symbols.empty()) - LogFailure() << "no symbols defined.\n"; - else { - for (auto &[Sym, Flags] : EDU->Symbols) { - if (!Sym) - LogFailure() << "null symbol defined.\n"; - else { - if (!EDU->JD->Symbols.count(SymbolStringPtr(Sym))) { - LogFailure() << "symbol " << Sym - << " isn't present in JD's symbol table.\n"; - } - } - } - } - - for (auto &[DepJD, Symbols] : EDU->Dependencies) { - if (!llvm::is_contained(JDs, DepJD)) { - LogFailure() << "dependant symbols listed for JD that isn't in " - "ExecutionSession.\n"; - } else { - for (auto &DepSym : Symbols) { - if (!DepJD->Symbols.count(SymbolStringPtr(DepSym))) { - LogFailure() - << "dependant symbol " << DepSym - << " does not appear in symbol table for dependant JD " - << DepJD->getName() << ".\n"; - } - } } } } @@ -2917,359 +2802,64 @@ Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, return MR.JD.resolve(MR, Symbols); } -template <typename HandleNewDepFn> -void ExecutionSession::propagateExtraEmitDeps( - std::deque<JITDylib::EmissionDepUnit *> Worklist, EDUInfosMap &EDUInfos, - HandleNewDepFn HandleNewDep) { - - // Iterate to a fixed-point to propagate extra-emit dependencies through the - // EDU graph. - while (!Worklist.empty()) { - auto &EDU = *Worklist.front(); - Worklist.pop_front(); - - assert(EDUInfos.count(&EDU) && "No info entry for EDU"); - auto &EDUInfo = EDUInfos[&EDU]; - - // Propagate new dependencies to users. - for (auto *UserEDU : EDUInfo.IntraEmitUsers) { - - // UserEDUInfo only present if UserEDU has its own users. - JITDylib::EmissionDepUnitInfo *UserEDUInfo = nullptr; - { - auto UserEDUInfoItr = EDUInfos.find(UserEDU); - if (UserEDUInfoItr != EDUInfos.end()) - UserEDUInfo = &UserEDUInfoItr->second; - } - - for (auto &[DepJD, Deps] : EDUInfo.NewDeps) { - auto &UserEDUDepsForJD = UserEDU->Dependencies[DepJD]; - DenseSet<NonOwningSymbolStringPtr> *UserEDUNewDepsForJD = nullptr; - for (auto Dep : Deps) { - if (UserEDUDepsForJD.insert(Dep).second) { - HandleNewDep(*UserEDU, *DepJD, Dep); - if (UserEDUInfo) { - if (!UserEDUNewDepsForJD) { - // If UserEDU has no new deps then it's not in the worklist - // yet, so add it. - if (UserEDUInfo->NewDeps.empty()) - Worklist.push_back(UserEDU); - UserEDUNewDepsForJD = &UserEDUInfo->NewDeps[DepJD]; - } - // Add (DepJD, Dep) to NewDeps. - UserEDUNewDepsForJD->insert(Dep); - } - } +WaitingOnGraph::ExternalState +ExecutionSession::IL_getSymbolState(JITDylib *JD, + NonOwningSymbolStringPtr Name) { + if (JD->State != JITDylib::Open) + return WaitingOnGraph::ExternalState::Failed; + + auto I = JD->Symbols.find_as(Name); + + // FIXME: Can we eliminate this possibility if we support query binding? 
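+  // Map the symbol table entry onto the graph's external view: a missing or
+  // errored symbol reports Failed, a Ready symbol reports Ready, and anything
+  // still materializing reports None.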
+ if (I == JD->Symbols.end()) + return WaitingOnGraph::ExternalState::Failed; + + if (I->second.getFlags().hasError()) + return WaitingOnGraph::ExternalState::Failed; + + if (I->second.getState() == SymbolState::Ready) + return WaitingOnGraph::ExternalState::Ready; + + return WaitingOnGraph::ExternalState::None; +} + +template <typename UpdateSymbolFn, typename UpdateQueryFn> +void ExecutionSession::IL_collectQueries( + JITDylib::AsynchronousSymbolQuerySet &Qs, + WaitingOnGraph::ContainerElementsMap &QualifiedSymbols, + UpdateSymbolFn &&UpdateSymbol, UpdateQueryFn &&UpdateQuery) { + + for (auto &[JD, Symbols] : QualifiedSymbols) { + // IL_emit and JITDylib removal are synchronized by the session lock. + // Since JITDylib removal removes any contained nodes from the + // WaitingOnGraph, we should be able to assert that all nodes in the + // WaitingOnGraph have not been removed. + assert(JD->State == JITDylib::Open && + "WaitingOnGraph includes definition in defunct JITDylib"); + for (auto &Symbol : Symbols) { + // Update symbol table. + auto I = JD->Symbols.find_as(Symbol); + assert(I != JD->Symbols.end() && + "Failed Symbol missing from JD symbol table"); + auto &Entry = I->second; + UpdateSymbol(Entry); + + // Collect queries. + auto J = JD->MaterializingInfos.find_as(Symbol); + if (J != JD->MaterializingInfos.end()) { + for (auto &Q : J->second.takeAllPendingQueries()) { + UpdateQuery(*Q, *JD, Symbol, Entry); + Qs.insert(std::move(Q)); } + JD->MaterializingInfos.erase(J); } } - - EDUInfo.NewDeps.clear(); - } -} - -// Note: This method modifies the emitted set. -ExecutionSession::EDUInfosMap ExecutionSession::simplifyDepGroups( - MaterializationResponsibility &MR, - ArrayRef<SymbolDependenceGroup> EmittedDeps) { - - auto &TargetJD = MR.getTargetJITDylib(); - - // 1. Build initial EmissionDepUnit -> EmissionDepUnitInfo and - // Symbol -> EmissionDepUnit mappings. - DenseMap<JITDylib::EmissionDepUnit *, JITDylib::EmissionDepUnitInfo> EDUInfos; - EDUInfos.reserve(EmittedDeps.size()); - DenseMap<NonOwningSymbolStringPtr, JITDylib::EmissionDepUnit *> EDUForSymbol; - for (auto &DG : EmittedDeps) { - assert(!DG.Symbols.empty() && "DepGroup does not cover any symbols"); - - // Skip empty EDUs. - if (DG.Dependencies.empty()) - continue; - - auto TmpEDU = std::make_shared<JITDylib::EmissionDepUnit>(TargetJD); - auto &EDUInfo = EDUInfos[TmpEDU.get()]; - EDUInfo.EDU = std::move(TmpEDU); - for (const auto &Symbol : DG.Symbols) { - NonOwningSymbolStringPtr NonOwningSymbol(Symbol); - assert(!EDUForSymbol.count(NonOwningSymbol) && - "Symbol should not appear in more than one SymbolDependenceGroup"); - assert(MR.getSymbols().count(Symbol) && - "Symbol in DepGroups not in the emitted set"); - auto NewlyEmittedItr = MR.getSymbols().find(Symbol); - EDUInfo.EDU->Symbols[NonOwningSymbol] = NewlyEmittedItr->second; - EDUForSymbol[NonOwningSymbol] = EDUInfo.EDU.get(); - } - } - - // 2. Build a "residual" EDU to cover all symbols that have no dependencies. 
- { - DenseMap<NonOwningSymbolStringPtr, JITSymbolFlags> ResidualSymbolFlags; - for (auto &[Sym, Flags] : MR.getSymbols()) { - if (!EDUForSymbol.count(NonOwningSymbolStringPtr(Sym))) - ResidualSymbolFlags[NonOwningSymbolStringPtr(Sym)] = Flags; - } - if (!ResidualSymbolFlags.empty()) { - auto ResidualEDU = std::make_shared<JITDylib::EmissionDepUnit>(TargetJD); - ResidualEDU->Symbols = std::move(ResidualSymbolFlags); - auto &ResidualEDUInfo = EDUInfos[ResidualEDU.get()]; - ResidualEDUInfo.EDU = std::move(ResidualEDU); - - // If the residual EDU is the only one then bail out early. - if (EDUInfos.size() == 1) - return EDUInfos; - - // Otherwise add the residual EDU to the EDUForSymbol map. - for (auto &[Sym, Flags] : ResidualEDUInfo.EDU->Symbols) - EDUForSymbol[Sym] = ResidualEDUInfo.EDU.get(); - } - } - -#ifndef NDEBUG - assert(EDUForSymbol.size() == MR.getSymbols().size() && - "MR symbols not fully covered by EDUs?"); - for (auto &[Sym, Flags] : MR.getSymbols()) { - assert(EDUForSymbol.count(NonOwningSymbolStringPtr(Sym)) && - "Sym in MR not covered by EDU"); - } -#endif // NDEBUG - - // 3. Use the DepGroups array to build a graph of dependencies between - // EmissionDepUnits in this finalization. We want to remove these - // intra-finalization uses, propagating dependencies on symbols outside - // this finalization. Add EDUs to the worklist. - for (auto &DG : EmittedDeps) { - - // Skip SymbolDependenceGroups with no dependencies. - if (DG.Dependencies.empty()) - continue; - - assert(EDUForSymbol.count(NonOwningSymbolStringPtr(*DG.Symbols.begin())) && - "No EDU for DG"); - auto &EDU = - *EDUForSymbol.find(NonOwningSymbolStringPtr(*DG.Symbols.begin())) - ->second; - - for (auto &[DepJD, Deps] : DG.Dependencies) { - DenseSet<NonOwningSymbolStringPtr> NewDepsForJD; - - assert(!Deps.empty() && "Dependence set for DepJD is empty"); - - if (DepJD != &TargetJD) { - // DepJD is some other JITDylib.There can't be any intra-finalization - // edges here, so just skip. - for (auto &Dep : Deps) - NewDepsForJD.insert(NonOwningSymbolStringPtr(Dep)); - } else { - // DepJD is the Target JITDylib. Check for intra-finaliztaion edges, - // skipping any and recording the intra-finalization use instead. - for (auto &Dep : Deps) { - NonOwningSymbolStringPtr NonOwningDep(Dep); - auto I = EDUForSymbol.find(NonOwningDep); - if (I == EDUForSymbol.end()) { - if (!MR.getSymbols().count(Dep)) - NewDepsForJD.insert(NonOwningDep); - continue; - } - - if (I->second != &EDU) - EDUInfos[I->second].IntraEmitUsers.insert(&EDU); - } - } - - if (!NewDepsForJD.empty()) - EDU.Dependencies[DepJD] = std::move(NewDepsForJD); - } - } - - // 4. Build the worklist. - std::deque<JITDylib::EmissionDepUnit *> Worklist; - for (auto &[EDU, EDUInfo] : EDUInfos) { - // If this EDU has extra-finalization dependencies and intra-finalization - // users then add it to the worklist. - if (!EDU->Dependencies.empty()) { - auto I = EDUInfos.find(EDU); - if (I != EDUInfos.end()) { - auto &EDUInfo = I->second; - if (!EDUInfo.IntraEmitUsers.empty()) { - EDUInfo.NewDeps = EDU->Dependencies; - Worklist.push_back(EDU); - } - } - } - } - - // 4. Propagate dependencies through the EDU graph. - propagateExtraEmitDeps( - Worklist, EDUInfos, - [](JITDylib::EmissionDepUnit &, JITDylib &, NonOwningSymbolStringPtr) {}); - - return EDUInfos; -} - -void ExecutionSession::IL_makeEDUReady( - std::shared_ptr<JITDylib::EmissionDepUnit> EDU, - JITDylib::AsynchronousSymbolQuerySet &Queries) { - - // The symbols for this EDU are ready. 
- auto &JD = *EDU->JD; - - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - - assert(((Entry.getFlags().hasMaterializationSideEffectsOnly() && - Entry.getState() == SymbolState::Materializing) || - Entry.getState() == SymbolState::Resolved || - Entry.getState() == SymbolState::Emitted) && - "Emitting from state other than Resolved"); - - Entry.setState(SymbolState::Ready); - - auto MII = JD.MaterializingInfos.find(SymbolStringPtr(Sym)); - - // Check for pending queries. - if (MII == JD.MaterializingInfos.end()) - continue; - auto &MI = MII->second; - - for (auto &Q : MI.takeQueriesMeeting(SymbolState::Ready)) { - Q->notifySymbolMetRequiredState(SymbolStringPtr(Sym), Entry.getSymbol()); - if (Q->isComplete()) - Queries.insert(Q); - Q->removeQueryDependence(JD, SymbolStringPtr(Sym)); - } - - JD.MaterializingInfos.erase(MII); - } - - JD.shrinkMaterializationInfoMemory(); -} - -void ExecutionSession::IL_makeEDUEmitted( - std::shared_ptr<JITDylib::EmissionDepUnit> EDU, - JITDylib::AsynchronousSymbolQuerySet &Queries) { - - // The symbols for this EDU are emitted, but not ready. - auto &JD = *EDU->JD; - - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - - assert(((Entry.getFlags().hasMaterializationSideEffectsOnly() && - Entry.getState() == SymbolState::Materializing) || - Entry.getState() == SymbolState::Resolved || - Entry.getState() == SymbolState::Emitted) && - "Emitting from state other than Resolved"); - - if (Entry.getState() == SymbolState::Emitted) { - // This was already emitted, so we can skip the rest of this loop. -#ifndef NDEBUG - for (auto &[Sym, Flags] : EDU->Symbols) { - assert(JD.Symbols.count(SymbolStringPtr(Sym)) && - "JD does not have an entry for Sym"); - auto &Entry = JD.Symbols[SymbolStringPtr(Sym)]; - assert(Entry.getState() == SymbolState::Emitted && - "Symbols for EDU in inconsistent state"); - assert(JD.MaterializingInfos.count(SymbolStringPtr(Sym)) && - "Emitted symbol has no MI"); - auto MI = JD.MaterializingInfos[SymbolStringPtr(Sym)]; - assert(MI.takeQueriesMeeting(SymbolState::Emitted).empty() && - "Already-emitted symbol has waiting-on-emitted queries"); - } -#endif // NDEBUG - break; - } - - Entry.setState(SymbolState::Emitted); - auto &MI = JD.MaterializingInfos[SymbolStringPtr(Sym)]; - MI.DefiningEDU = EDU; - - for (auto &Q : MI.takeQueriesMeeting(SymbolState::Emitted)) { - Q->notifySymbolMetRequiredState(SymbolStringPtr(Sym), Entry.getSymbol()); - if (Q->isComplete()) - Queries.insert(Q); - } } - - for (auto &[DepJD, Deps] : EDU->Dependencies) { - for (auto &Dep : Deps) - DepJD->MaterializingInfos[SymbolStringPtr(Dep)].DependantEDUs.insert( - EDU.get()); - } -} - -/// Removes the given dependence from EDU. If EDU's dependence set becomes -/// empty then this function adds an entry for it to the EDUInfos map. -/// Returns true if a new EDUInfosMap entry is added. 
-bool ExecutionSession::IL_removeEDUDependence(JITDylib::EmissionDepUnit &EDU, - JITDylib &DepJD, - NonOwningSymbolStringPtr DepSym, - EDUInfosMap &EDUInfos) { - assert(EDU.Dependencies.count(&DepJD) && - "JD does not appear in Dependencies of DependantEDU"); - assert(EDU.Dependencies[&DepJD].count(DepSym) && - "Symbol does not appear in Dependencies of DependantEDU"); - auto &JDDeps = EDU.Dependencies[&DepJD]; - JDDeps.erase(DepSym); - if (JDDeps.empty()) { - EDU.Dependencies.erase(&DepJD); - if (EDU.Dependencies.empty()) { - // If the dependencies set has become empty then EDU _may_ be ready - // (we won't know for sure until we've propagated the extra-emit deps). - // Create an EDUInfo for it (if it doesn't have one already) so that - // it'll be visited after propagation. - auto &DepEDUInfo = EDUInfos[&EDU]; - if (!DepEDUInfo.EDU) { - assert(EDU.JD->Symbols.count( - SymbolStringPtr(EDU.Symbols.begin()->first)) && - "Missing symbol entry for first symbol in EDU"); - auto DepEDUFirstMI = EDU.JD->MaterializingInfos.find( - SymbolStringPtr(EDU.Symbols.begin()->first)); - assert(DepEDUFirstMI != EDU.JD->MaterializingInfos.end() && - "Missing MI for first symbol in DependantEDU"); - DepEDUInfo.EDU = DepEDUFirstMI->second.DefiningEDU; - return true; - } - } - } - return false; } -Error ExecutionSession::makeJDClosedError(JITDylib::EmissionDepUnit &EDU, - JITDylib &ClosedJD) { - SymbolNameSet FailedSymbols; - for (auto &[Sym, Flags] : EDU.Symbols) - FailedSymbols.insert(SymbolStringPtr(Sym)); - SymbolDependenceMap BadDeps; - for (auto &Dep : EDU.Dependencies[&ClosedJD]) - BadDeps[&ClosedJD].insert(SymbolStringPtr(Dep)); - return make_error<UnsatisfiedSymbolDependencies>( - ClosedJD.getExecutionSession().getSymbolStringPool(), EDU.JD, - std::move(FailedSymbols), std::move(BadDeps), - ClosedJD.getName() + " is closed"); -} - -Error ExecutionSession::makeUnsatisfiedDepsError(JITDylib::EmissionDepUnit &EDU, - JITDylib &BadJD, - SymbolNameSet BadDeps) { - SymbolNameSet FailedSymbols; - for (auto &[Sym, Flags] : EDU.Symbols) - FailedSymbols.insert(SymbolStringPtr(Sym)); - SymbolDependenceMap BadDepsMap; - BadDepsMap[&BadJD] = std::move(BadDeps); - return make_error<UnsatisfiedSymbolDependencies>( - BadJD.getExecutionSession().getSymbolStringPool(), &BadJD, - std::move(FailedSymbols), std::move(BadDepsMap), - "dependencies removed or in error state"); -} - -Expected<JITDylib::AsynchronousSymbolQuerySet> +Expected<ExecutionSession::EmitQueries> ExecutionSession::IL_emit(MaterializationResponsibility &MR, - EDUInfosMap EDUInfos) { + WaitingOnGraph::SimplifyResult SR) { if (MR.RT->isDefunct()) return make_error<ResourceTrackerDefunct>(MR.RT); @@ -3279,169 +2869,50 @@ ExecutionSession::IL_emit(MaterializationResponsibility &MR, return make_error<StringError>("JITDylib " + TargetJD.getName() + " is defunct", inconvertibleErrorCode()); + #ifdef EXPENSIVE_CHECKS verifySessionState("entering ExecutionSession::IL_emit"); #endif - // Walk all EDUs: - // 1. Verifying that dependencies are available (not removed or in the error - // state. - // 2. Removing any dependencies that are already Ready. - // 3. Lifting any EDUs for Emitted symbols into the EDUInfos map. - // 4. Finding any dependant EDUs and lifting them into the EDUInfos map. 
- std::deque<JITDylib::EmissionDepUnit *> Worklist; - for (auto &[EDU, _] : EDUInfos) - Worklist.push_back(EDU); - - for (auto *EDU : Worklist) { - auto *EDUInfo = &EDUInfos[EDU]; - - SmallVector<JITDylib *> DepJDsToRemove; - for (auto &[DepJD, Deps] : EDU->Dependencies) { - if (DepJD->State != JITDylib::Open) - return makeJDClosedError(*EDU, *DepJD); - - SymbolNameSet BadDeps; - SmallVector<NonOwningSymbolStringPtr> DepsToRemove; - for (auto &Dep : Deps) { - auto DepEntryItr = DepJD->Symbols.find(SymbolStringPtr(Dep)); - - // If this dep has been removed or moved to the error state then add it - // to the bad deps set. We aggregate these bad deps for more - // comprehensive error messages. - if (DepEntryItr == DepJD->Symbols.end() || - DepEntryItr->second.getFlags().hasError()) { - BadDeps.insert(SymbolStringPtr(Dep)); - continue; - } - - // If this dep isn't emitted yet then just add it to the NewDeps set to - // be propagated. - auto &DepEntry = DepEntryItr->second; - if (DepEntry.getState() < SymbolState::Emitted) { - EDUInfo->NewDeps[DepJD].insert(Dep); - continue; - } - - // This dep has been emitted, so add it to the list to be removed from - // EDU. - DepsToRemove.push_back(Dep); - - // If Dep is Ready then there's nothing further to do. - if (DepEntry.getState() == SymbolState::Ready) { - assert(!DepJD->MaterializingInfos.count(SymbolStringPtr(Dep)) && - "Unexpected MaterializationInfo attached to ready symbol"); - continue; - } + auto ER = G.emit(std::move(SR), + [this](JITDylib *JD, NonOwningSymbolStringPtr Name) { + return IL_getSymbolState(JD, Name); + }); - // If we get here then Dep is Emitted. We need to look up its defining - // EDU and add this EDU to the defining EDU's list of users (this means - // creating an EDUInfos entry if the defining EDU doesn't have one - // already). - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(Dep)) && - "Expected MaterializationInfo for emitted dependency"); - auto &DepMI = DepJD->MaterializingInfos[SymbolStringPtr(Dep)]; - assert(DepMI.DefiningEDU && - "Emitted symbol does not have a defining EDU"); - assert(DepMI.DependantEDUs.empty() && - "Already-emitted symbol has dependant EDUs?"); - auto &DepEDUInfo = EDUInfos[DepMI.DefiningEDU.get()]; - if (!DepEDUInfo.EDU) { - // No EDUInfo yet -- build initial entry, and reset the EDUInfo - // pointer, which we will have invalidated. - EDUInfo = &EDUInfos[EDU]; - DepEDUInfo.EDU = DepMI.DefiningEDU; - for (auto &[DepDepJD, DepDeps] : DepEDUInfo.EDU->Dependencies) { - if (DepDepJD == &TargetJD) { - for (auto &DepDep : DepDeps) - if (!MR.getSymbols().count(SymbolStringPtr(DepDep))) - DepEDUInfo.NewDeps[DepDepJD].insert(DepDep); - } else - DepEDUInfo.NewDeps[DepDepJD] = DepDeps; - } - } - DepEDUInfo.IntraEmitUsers.insert(EDU); - } - - // Some dependencies were removed or in an error state -- error out. - if (!BadDeps.empty()) - return makeUnsatisfiedDepsError(*EDU, *DepJD, std::move(BadDeps)); - - // Remove the emitted / ready deps from DepJD. - for (auto &Dep : DepsToRemove) - Deps.erase(Dep); - - // If there are no further deps in DepJD then flag it for removal too. - if (Deps.empty()) - DepJDsToRemove.push_back(DepJD); - } + EmitQueries EQ; - // Remove any JDs whose dependence sets have become empty. - for (auto &DepJD : DepJDsToRemove) { - assert(EDU->Dependencies.count(DepJD) && - "Trying to remove non-existent dep entries"); - EDU->Dependencies.erase(DepJD); - } - - // Now look for users of this EDU. 
-    for (auto &[Sym, Flags] : EDU->Symbols) {
-      assert(TargetJD.Symbols.count(SymbolStringPtr(Sym)) &&
-             "Sym not present in symbol table");
-      assert((TargetJD.Symbols[SymbolStringPtr(Sym)].getState() ==
-                  SymbolState::Resolved ||
-              TargetJD.Symbols[SymbolStringPtr(Sym)]
-                  .getFlags()
-                  .hasMaterializationSideEffectsOnly()) &&
-             "Emitting symbol not in the resolved state");
-      assert(!TargetJD.Symbols[SymbolStringPtr(Sym)].getFlags().hasError() &&
-             "Symbol is already in an error state");
-
-      auto MII = TargetJD.MaterializingInfos.find(SymbolStringPtr(Sym));
-      if (MII == TargetJD.MaterializingInfos.end() ||
-          MII->second.DependantEDUs.empty())
-        continue;
-
-      for (auto &DependantEDU : MII->second.DependantEDUs) {
-        if (IL_removeEDUDependence(*DependantEDU, TargetJD, Sym, EDUInfos))
-          EDUInfo = &EDUInfos[EDU];
-        EDUInfo->IntraEmitUsers.insert(DependantEDU);
-      }
-      MII->second.DependantEDUs.clear();
-    }
-  }
-
-  Worklist.clear();
-  for (auto &[EDU, EDUInfo] : EDUInfos) {
-    if (!EDUInfo.IntraEmitUsers.empty() && !EDU->Dependencies.empty()) {
-      if (EDUInfo.NewDeps.empty())
-        EDUInfo.NewDeps = EDU->Dependencies;
-      Worklist.push_back(EDU);
-    }
-  }
-
-  propagateExtraEmitDeps(
-      Worklist, EDUInfos,
-      [](JITDylib::EmissionDepUnit &EDU, JITDylib &JD,
-         NonOwningSymbolStringPtr Sym) {
-        JD.MaterializingInfos[SymbolStringPtr(Sym)].DependantEDUs.insert(&EDU);
-      });
+  // Handle failed queries.
+  for (auto &SN : ER.Failed)
+    IL_collectQueries(
+        EQ.Failed, SN->defs(),
+        [](JITDylib::SymbolTableEntry &E) {
+          E.setFlags(E.getFlags() | JITSymbolFlags::HasError);
+        },
+        [&](AsynchronousSymbolQuery &Q, JITDylib &JD,
+            NonOwningSymbolStringPtr Name, JITDylib::SymbolTableEntry &E) {
+          auto &FS = EQ.FailedSymsForQuery[&Q];
+          if (!FS)
+            FS = std::make_shared<SymbolDependenceMap>();
+          (*FS)[&JD].insert(SymbolStringPtr(Name));
+        });
-  JITDylib::AsynchronousSymbolQuerySet CompletedQueries;
+  for (auto &FQ : EQ.Failed)
+    FQ->detach();
-  // Extract completed queries and lodge not-yet-ready EDUs in the
-  // session.
- for (auto &[EDU, EDUInfo] : EDUInfos) { - if (EDU->Dependencies.empty()) - IL_makeEDUReady(std::move(EDUInfo.EDU), CompletedQueries); - else - IL_makeEDUEmitted(std::move(EDUInfo.EDU), CompletedQueries); - } + for (auto &SN : ER.Ready) + IL_collectQueries( + EQ.Updated, SN->defs(), + [](JITDylib::SymbolTableEntry &E) { E.setState(SymbolState::Ready); }, + [](AsynchronousSymbolQuery &Q, JITDylib &JD, + NonOwningSymbolStringPtr Name, JITDylib::SymbolTableEntry &E) { + Q.notifySymbolMetRequiredState(SymbolStringPtr(Name), E.getSymbol()); + }); #ifdef EXPENSIVE_CHECKS verifySessionState("exiting ExecutionSession::IL_emit"); #endif - return std::move(CompletedQueries); + return std::move(EQ); } Error ExecutionSession::OL_notifyEmitted( @@ -3471,40 +2942,127 @@ Error ExecutionSession::OL_notifyEmitted( } #endif // NDEBUG - auto EDUInfos = simplifyDepGroups(MR, DepGroups); + std::vector<std::unique_ptr<WaitingOnGraph::SuperNode>> SNs; + WaitingOnGraph::ContainerElementsMap Residual; + { + auto &JDResidual = Residual[&MR.getTargetJITDylib()]; + for (auto &[Name, Flags] : MR.getSymbols()) + JDResidual.insert(NonOwningSymbolStringPtr(Name)); + + for (auto &SDG : DepGroups) { + WaitingOnGraph::ContainerElementsMap Defs; + assert(!SDG.Symbols.empty()); + auto &JDDefs = Defs[&MR.getTargetJITDylib()]; + for (auto &Def : SDG.Symbols) { + JDDefs.insert(NonOwningSymbolStringPtr(Def)); + JDResidual.erase(NonOwningSymbolStringPtr(Def)); + } + WaitingOnGraph::ContainerElementsMap Deps; + if (!SDG.Dependencies.empty()) { + for (auto &[JD, Syms] : SDG.Dependencies) { + auto &JDDeps = Deps[JD]; + for (auto &Dep : Syms) + JDDeps.insert(NonOwningSymbolStringPtr(Dep)); + } + } + SNs.push_back(std::make_unique<WaitingOnGraph::SuperNode>( + std::move(Defs), std::move(Deps))); + } + if (!JDResidual.empty()) + SNs.push_back(std::make_unique<WaitingOnGraph::SuperNode>( + std::move(Residual), WaitingOnGraph::ContainerElementsMap())); + } + + auto SR = WaitingOnGraph::simplify(std::move(SNs)); LLVM_DEBUG({ dbgs() << " Simplified dependencies:\n"; - for (auto &[EDU, EDUInfo] : EDUInfos) { - dbgs() << " Symbols: { "; - for (auto &[Sym, Flags] : EDU->Symbols) - dbgs() << Sym << " "; - dbgs() << "}, Dependencies: { "; - for (auto &[DepJD, Deps] : EDU->Dependencies) { - dbgs() << "(" << DepJD->getName() << ", { "; - for (auto &Dep : Deps) - dbgs() << Dep << " "; - dbgs() << "}) "; + for (auto &SN : SR.superNodes()) { + + auto SortedLibs = [](WaitingOnGraph::ContainerElementsMap &C) { + std::vector<JITDylib *> JDs; + for (auto &[JD, _] : C) + JDs.push_back(JD); + llvm::sort(JDs, [](const JITDylib *LHS, const JITDylib *RHS) { + return LHS->getName() < RHS->getName(); + }); + return JDs; + }; + + auto SortedNames = [](WaitingOnGraph::ElementSet &Elems) { + std::vector<NonOwningSymbolStringPtr> Names(Elems.begin(), Elems.end()); + llvm::sort(Names, [](const NonOwningSymbolStringPtr &LHS, + const NonOwningSymbolStringPtr &RHS) { + return *LHS < *RHS; + }); + return Names; + }; + + dbgs() << " Defs: {"; + for (auto *JD : SortedLibs(SN->defs())) { + dbgs() << " (" << JD->getName() << ", ["; + for (auto &Sym : SortedNames(SN->defs()[JD])) + dbgs() << " " << Sym; + dbgs() << " ])"; + } + dbgs() << " }, Deps: {"; + for (auto *JD : SortedLibs(SN->deps())) { + dbgs() << " (" << JD->getName() << ", ["; + for (auto &Sym : SortedNames(SN->deps()[JD])) + dbgs() << " " << Sym; + dbgs() << " ])"; } - dbgs() << "}\n"; + dbgs() << " }\n"; } }); - - auto CompletedQueries = - runSessionLocked([&]() { return IL_emit(MR, EDUInfos); }); + auto 
EmitQueries = + runSessionLocked([&]() { return IL_emit(MR, std::move(SR)); }); // On error bail out. - if (!CompletedQueries) - return CompletedQueries.takeError(); + if (!EmitQueries) + return EmitQueries.takeError(); - MR.SymbolFlags.clear(); + // Otherwise notify failed queries, and any updated queries that have been + // completed. - // Otherwise notify all the completed queries. - for (auto &Q : *CompletedQueries) { - assert(Q->isComplete() && "Q is not complete"); - Q->handleComplete(*this); + // FIXME: Get rid of error return from notifyEmitted. + SymbolDependenceMap BadDeps; + { + for (auto &FQ : EmitQueries->Failed) { + FQ->detach(); + assert(EmitQueries->FailedSymsForQuery.count(FQ.get()) && + "Missing failed symbols for query"); + auto FailedSyms = std::move(EmitQueries->FailedSymsForQuery[FQ.get()]); + for (auto &[JD, Syms] : *FailedSyms) { + auto &BadDepsForJD = BadDeps[JD]; + for (auto &Sym : Syms) + BadDepsForJD.insert(Sym); + } + FQ->handleFailed(make_error<FailedToMaterialize>(getSymbolStringPool(), + std::move(FailedSyms))); + } + } + + for (auto &UQ : EmitQueries->Updated) + if (UQ->isComplete()) + UQ->handleComplete(*this); + + // If there are any bad dependencies then return an error. + if (!BadDeps.empty()) { + SymbolNameSet BadNames; + // Note: The name set calculated here is bogus: it includes all symbols in + // the MR, not just the ones that failed. We want to remove the error + // return path from notifyEmitted anyway, so this is just a brief + // placeholder to maintain (roughly) the current error behavior. + for (auto &[Name, Flags] : MR.getSymbols()) + BadNames.insert(Name); + MR.SymbolFlags.clear(); + return make_error<UnsatisfiedSymbolDependencies>( + getSymbolStringPool(), &MR.getTargetJITDylib(), std::move(BadNames), + std::move(BadDeps), "dependencies removed or in error state"); } + MR.SymbolFlags.clear(); return Error::success(); } @@ -3535,158 +3093,48 @@ ExecutionSession::IL_failSymbols(JITDylib &JD, #endif JITDylib::AsynchronousSymbolQuerySet FailedQueries; - auto FailedSymbolsMap = std::make_shared<SymbolDependenceMap>(); - auto ExtractFailedQueries = [&](JITDylib::MaterializingInfo &MI) { - JITDylib::AsynchronousSymbolQueryList ToDetach; - for (auto &Q : MI.pendingQueries()) { - // Add the query to the list to be failed and detach it. - FailedQueries.insert(Q); - ToDetach.push_back(Q); + auto Fail = [&](JITDylib *FailJD, NonOwningSymbolStringPtr FailSym) { + auto I = FailJD->Symbols.find_as(FailSym); + assert(I != FailJD->Symbols.end()); + I->second.setFlags(I->second.getFlags() | JITSymbolFlags::HasError); + auto J = FailJD->MaterializingInfos.find_as(FailSym); + if (J != FailJD->MaterializingInfos.end()) { + for (auto &Q : J->second.takeAllPendingQueries()) + FailedQueries.insert(std::move(Q)); + FailJD->MaterializingInfos.erase(J); } - for (auto &Q : ToDetach) - Q->detach(); - assert(!MI.hasQueriesPending() && "Queries still pending after detach"); }; - for (auto &Name : SymbolsToFail) { - (*FailedSymbolsMap)[&JD].insert(Name); - - // Look up the symbol to fail. - auto SymI = JD.Symbols.find(Name); - - // FIXME: Revisit this. We should be able to assert sequencing between - // ResourceTracker removal and symbol failure. - // - // It's possible that this symbol has already been removed, e.g. if a - // materialization failure happens concurrently with a ResourceTracker or - // JITDylib removal. In that case we can safely skip this symbol and - // continue. 
- if (SymI == JD.Symbols.end()) - continue; - auto &Sym = SymI->second; - - // If the symbol is already in the error state then we must have visited - // it earlier. - if (Sym.getFlags().hasError()) { - assert(!JD.MaterializingInfos.count(Name) && - "Symbol in error state still has MaterializingInfo"); - continue; - } + auto FailedSymbolsMap = std::make_shared<SymbolDependenceMap>(); - // Move the symbol into the error state. - Sym.setFlags(Sym.getFlags() | JITSymbolFlags::HasError); - - // FIXME: Come up with a sane mapping of state to - // presence-of-MaterializingInfo so that we can assert presence / absence - // here, rather than testing it. - auto MII = JD.MaterializingInfos.find(Name); - if (MII == JD.MaterializingInfos.end()) - continue; - - auto &MI = MII->second; - - // Collect queries to be failed for this MII. - ExtractFailedQueries(MI); - - if (MI.DefiningEDU) { - // If there is a DefiningEDU for this symbol then remove this - // symbol from it. - assert(MI.DependantEDUs.empty() && - "Symbol with DefiningEDU should not have DependantEDUs"); - assert(Sym.getState() >= SymbolState::Emitted && - "Symbol has EDU, should have been emitted"); - assert(MI.DefiningEDU->Symbols.count(NonOwningSymbolStringPtr(Name)) && - "Symbol does not appear in its DefiningEDU"); - MI.DefiningEDU->Symbols.erase(NonOwningSymbolStringPtr(Name)); - - // Remove this EDU from the dependants lists of its dependencies. - for (auto &[DepJD, DepSyms] : MI.DefiningEDU->Dependencies) { - for (auto DepSym : DepSyms) { - assert(DepJD->Symbols.count(SymbolStringPtr(DepSym)) && - "DepSym not in DepJD"); - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(DepSym)) && - "DepSym has not MaterializingInfo"); - auto &SymMI = DepJD->MaterializingInfos[SymbolStringPtr(DepSym)]; - assert(SymMI.DependantEDUs.count(MI.DefiningEDU.get()) && - "DefiningEDU missing from DependantEDUs list of dependency"); - SymMI.DependantEDUs.erase(MI.DefiningEDU.get()); - } - } + { + auto &FailedSymsForJD = (*FailedSymbolsMap)[&JD]; + for (auto &Sym : SymbolsToFail) { + FailedSymsForJD.insert(Sym); + Fail(&JD, NonOwningSymbolStringPtr(Sym)); + } + } - MI.DefiningEDU = nullptr; - } else { - // Otherwise if there are any EDUs waiting on this symbol then move - // those symbols to the error state too, and deregister them from the - // symbols that they depend on. - // Note: We use a copy of DependantEDUs here since we'll be removing - // from the original set as we go. - for (auto &DependantEDU : MI.DependantEDUs) { - - // Remove DependantEDU from all of its users DependantEDUs lists. - for (auto &[DepJD, DepSyms] : DependantEDU->Dependencies) { - for (auto DepSym : DepSyms) { - // Skip self-reference to avoid invalidating the MI.DependantEDUs - // map. We'll clear this later. - if (DepJD == &JD && DepSym == Name) - continue; - assert(DepJD->Symbols.count(SymbolStringPtr(DepSym)) && - "DepSym not in DepJD?"); - assert(DepJD->MaterializingInfos.count(SymbolStringPtr(DepSym)) && - "DependantEDU not registered with symbol it depends on"); - auto &SymMI = DepJD->MaterializingInfos[SymbolStringPtr(DepSym)]; - assert(SymMI.DependantEDUs.count(DependantEDU) && - "DependantEDU missing from DependantEDUs list"); - SymMI.DependantEDUs.erase(DependantEDU); - } - } + WaitingOnGraph::ContainerElementsMap ToFail; + auto &JDToFail = ToFail[&JD]; + for (auto &Sym : SymbolsToFail) + JDToFail.insert(NonOwningSymbolStringPtr(Sym)); - // Move any symbols defined by DependantEDU into the error state and - // fail any queries waiting on them. 
- auto &DepJD = *DependantEDU->JD; - auto DepEDUSymbols = std::move(DependantEDU->Symbols); - for (auto &[DepName, Flags] : DepEDUSymbols) { - auto DepSymItr = DepJD.Symbols.find(SymbolStringPtr(DepName)); - assert(DepSymItr != DepJD.Symbols.end() && - "Symbol not present in table"); - auto &DepSym = DepSymItr->second; - - assert(DepSym.getState() >= SymbolState::Emitted && - "Symbol has EDU, should have been emitted"); - assert(!DepSym.getFlags().hasError() && - "Symbol is already in the error state?"); - DepSym.setFlags(DepSym.getFlags() | JITSymbolFlags::HasError); - (*FailedSymbolsMap)[&DepJD].insert(SymbolStringPtr(DepName)); - - // This symbol has a defining EDU so its MaterializingInfo object must - // exist. - auto DepMIItr = - DepJD.MaterializingInfos.find(SymbolStringPtr(DepName)); - assert(DepMIItr != DepJD.MaterializingInfos.end() && - "Symbol has defining EDU but not MaterializingInfo"); - auto &DepMI = DepMIItr->second; - assert(DepMI.DefiningEDU.get() == DependantEDU && - "Bad EDU dependence edge"); - assert(DepMI.DependantEDUs.empty() && - "Symbol was emitted, should not have any DependantEDUs"); - ExtractFailedQueries(DepMI); - DepJD.MaterializingInfos.erase(SymbolStringPtr(DepName)); - } + auto FailedSNs = G.fail(ToFail); - DepJD.shrinkMaterializationInfoMemory(); + for (auto &SN : FailedSNs) { + for (auto &[FailJD, Defs] : SN->defs()) { + auto &FailedSymsForFailJD = (*FailedSymbolsMap)[FailJD]; + for (auto &Def : Defs) { + FailedSymsForFailJD.insert(SymbolStringPtr(Def)); + Fail(FailJD, Def); } - - MI.DependantEDUs.clear(); } - - assert(!MI.DefiningEDU && "DefiningEDU should have been reset"); - assert(MI.DependantEDUs.empty() && - "DependantEDUs should have been removed above"); - assert(!MI.hasQueriesPending() && - "Can not delete MaterializingInfo with queries pending"); - JD.MaterializingInfos.erase(Name); } - JD.shrinkMaterializationInfoMemory(); + // Detach all failed queries. + for (auto &Q : FailedQueries) + Q->detach(); #ifdef EXPENSIVE_CHECKS verifySessionState("exiting ExecutionSession::IL_failSymbols"); @@ -3721,9 +3169,11 @@ void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { return IL_failSymbols(MR.getTargetJITDylib(), SymbolsToFail); }); - for (auto &Q : FailedQueries) + for (auto &Q : FailedQueries) { + Q->detach(); Q->handleFailed( make_error<FailedToMaterialize>(getSymbolStringPool(), FailedSymbols)); + } } Error ExecutionSession::OL_replace(MaterializationResponsibility &MR, diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp index dec1df7..893523c 100644 --- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp @@ -448,7 +448,7 @@ Error SimpleRemoteEPC::handleHangup(SimpleRemoteEPCArgBytesVector ArgBytes) { if (const char *ErrMsg = WFR.getOutOfBandError()) return make_error<StringError>(ErrMsg, inconvertibleErrorCode()); - detail::SPSSerializableError Info; + orc::shared::detail::SPSSerializableError Info; SPSInputBuffer IB(WFR.data(), WFR.size()); if (!SPSArgList<SPSError>::deserialize(IB, Info)) return make_error<StringError>("Could not deserialize hangup info", diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 3908a78..488b078 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3196,7 +3196,7 @@ void AssemblyWriter::printModuleSummaryIndex() { // for aliasee (then update BitcodeWriter.cpp and remove get/setAliaseeGUID). 
   for (auto &GlobalList : *TheIndex) {
     auto GUID = GlobalList.first;
-    for (auto &Summary : GlobalList.second.SummaryList)
+    for (auto &Summary : GlobalList.second.getSummaryList())
       SummaryToGUIDMap[Summary.get()] = GUID;
   }
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 10f915d..7e5e7b5 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6045,6 +6045,120 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
   }
 }
 
+// Set the given function attribute, unless it is already present.
+static void setFunctionAttrIfNotSet(Function &F, StringRef FnAttrName,
+                                    StringRef Value) {
+  if (!F.hasFnAttribute(FnAttrName))
+    F.addFnAttr(FnAttrName, Value);
+}
+
+// If the attribute is absent, add it (valueless) when Set is true.
+// If the attribute's value is "false", remove it.
+// If its value is "true", reset it to a valueless attribute.
+static void ConvertFunctionAttr(Function &F, bool Set, StringRef FnAttrName) {
+  if (!F.hasFnAttribute(FnAttrName)) {
+    if (Set)
+      F.addFnAttr(FnAttrName);
+  } else {
+    auto A = F.getFnAttribute(FnAttrName);
+    if ("false" == A.getValueAsString())
+      F.removeFnAttr(FnAttrName);
+    else if ("true" == A.getValueAsString()) {
+      F.removeFnAttr(FnAttrName);
+      F.addFnAttr(FnAttrName);
+    }
+  }
+}
+
+void llvm::copyModuleAttrToFunctions(Module &M) {
+  Triple T(M.getTargetTriple());
+  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
+    return;
+
+  uint64_t BTEValue = 0;
+  uint64_t BPPLRValue = 0;
+  uint64_t GCSValue = 0;
+  uint64_t SRAValue = 0;
+  uint64_t SRAALLValue = 0;
+  uint64_t SRABKeyValue = 0;
+
+  NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
+  if (ModFlags) {
+    for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
+      MDNode *Op = ModFlags->getOperand(I);
+      if (Op->getNumOperands() != 3)
+        continue;
+
+      MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
+      auto *CI = mdconst::dyn_extract<ConstantInt>(Op->getOperand(2));
+      if (!ID || !CI)
+        continue;
+
+      StringRef IDStr = ID->getString();
+      uint64_t *ValPtr = IDStr == "branch-target-enforcement" ? &BTEValue
+                         : IDStr == "branch-protection-pauth-lr" ? &BPPLRValue
+                         : IDStr == "guarded-control-stack" ? &GCSValue
+                         : IDStr == "sign-return-address" ? &SRAValue
+                         : IDStr == "sign-return-address-all" ? &SRAALLValue
+                         : IDStr == "sign-return-address-with-bkey"
+                             ? 
&SRABKeyValue + : nullptr; + if (!ValPtr) + continue; + + *ValPtr = CI->getZExtValue(); + if (*ValPtr == 2) + return; + } + } + + bool BTE = BTEValue == 1; + bool BPPLR = BPPLRValue == 1; + bool GCS = GCSValue == 1; + bool SRA = SRAValue == 1; + + StringRef SignTypeValue = "non-leaf"; + if (SRA && SRAALLValue == 1) + SignTypeValue = "all"; + + StringRef SignKeyValue = "a_key"; + if (SRA && SRABKeyValue == 1) + SignKeyValue = "b_key"; + + for (Function &F : M.getFunctionList()) { + if (F.isDeclaration()) + continue; + + if (SRA) { + setFunctionAttrIfNotSet(F, "sign-return-address", SignTypeValue); + setFunctionAttrIfNotSet(F, "sign-return-address-key", SignKeyValue); + } else { + if (auto A = F.getFnAttribute("sign-return-address"); + A.isValid() && "none" == A.getValueAsString()) { + F.removeFnAttr("sign-return-address"); + F.removeFnAttr("sign-return-address-key"); + } + } + ConvertFunctionAttr(F, BTE, "branch-target-enforcement"); + ConvertFunctionAttr(F, BPPLR, "branch-protection-pauth-lr"); + ConvertFunctionAttr(F, GCS, "guarded-control-stack"); + } + + if (BTE) + M.setModuleFlag(llvm::Module::Min, "branch-target-enforcement", 2); + if (BPPLR) + M.setModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", 2); + if (GCS) + M.setModuleFlag(llvm::Module::Min, "guarded-control-stack", 2); + if (SRA) { + M.setModuleFlag(llvm::Module::Min, "sign-return-address", 2); + if (SRAALLValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-all", 2); + if (SRABKeyValue == 1) + M.setModuleFlag(llvm::Module::Min, "sign-return-address-with-bkey", 2); + } +} + static bool isOldLoopArgument(Metadata *MD) { auto *T = dyn_cast_or_null<MDTuple>(MD); if (!T) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 9060a89..3b8fde8 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2878,7 +2878,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp, { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | - { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr | { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index dc55b63..a6353664 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -162,7 +162,7 @@ void ModuleSummaryIndex::collectDefinedFunctionsForModule( StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const { for (auto &GlobalList : *this) { auto GUID = GlobalList.first; - for (auto &GlobSummary : GlobalList.second.SummaryList) { + for (auto &GlobSummary : GlobalList.second.getSummaryList()) { auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get()); if (!Summary) // Ignore global variable, focus on functions @@ -263,7 +263,7 @@ void ModuleSummaryIndex::propagateAttributes( DenseSet<ValueInfo> MarkedNonReadWriteOnly; for (auto &P : *this) { bool IsDSOLocal = true; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!isGlobalValueLive(S.get())) { // computeDeadSymbolsAndUpdateIndirectCalls should have marked all // copies live. 
Note that it is possible that there is a GUID collision @@ -273,7 +273,7 @@ void ModuleSummaryIndex::propagateAttributes( // all copies live we can assert here that all are dead if any copy is // dead. assert(llvm::none_of( - P.second.SummaryList, + P.second.getSummaryList(), [&](const std::unique_ptr<GlobalValueSummary> &Summary) { return isGlobalValueLive(Summary.get()); })); @@ -308,16 +308,16 @@ void ModuleSummaryIndex::propagateAttributes( // Mark the flag in all summaries false so that we can do quick check // without going through the whole list. for (const std::unique_ptr<GlobalValueSummary> &Summary : - P.second.SummaryList) + P.second.getSummaryList()) Summary->setDSOLocal(false); } setWithAttributePropagation(); setWithDSOLocalPropagation(); if (llvm::AreStatisticsEnabled()) for (auto &P : *this) - if (P.second.SummaryList.size()) + if (P.second.getSummaryList().size()) if (auto *GVS = dyn_cast<GlobalVarSummary>( - P.second.SummaryList[0]->getBaseObject())) + P.second.getSummaryList()[0]->getBaseObject())) if (isGlobalValueLive(GVS)) { if (GVS->maybeReadOnly()) ReadOnlyLiveGVars++; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 7ea2e46..77af29b 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -21,9 +21,6 @@ using namespace RTLIB; #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" -#undef GET_INIT_RUNTIME_LIBCALL_NAMES -#undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS -#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index aec8891..cbc0b1d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -457,7 +457,7 @@ void llvm::thinLTOResolvePrevailingInIndex( // when needed. DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias; for (auto &I : Index) - for (auto &S : I.second.SummaryList) + for (auto &S : I.second.getSummaryList()) if (auto AS = dyn_cast<AliasSummary>(S.get())) GlobalInvolvedWithAlias.insert(&AS->getAliasee()); @@ -1182,7 +1182,7 @@ Error LTO::checkPartiallySplit() { // Otherwise check if there are any recorded in the combined summary from the // ThinLTO modules. for (auto &P : ThinLTO.CombinedIndex) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 280c3d1..93118be 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -770,11 +770,11 @@ bool lto::initImportList(const Module &M, // via a WriteIndexesThinBackend. for (const auto &GlobalList : CombinedIndex) { // Ignore entries for undefined references. - if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - for (const auto &Summary : GlobalList.second.SummaryList) { + for (const auto &Summary : GlobalList.second.getSummaryList()) { // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. 
if (Summary->modulePath() == M.getModuleIdentifier()) diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b333cd..ff94c54 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -101,8 +101,8 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir, WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); } -static const GlobalValueSummary * -getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { +static const GlobalValueSummary *getFirstDefinitionForLinker( + ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { // If there is any strong definition anywhere, get it. auto StrongDefForLinker = llvm::find_if( GVSummaryList, [](const std::unique_ptr<GlobalValueSummary> &Summary) { @@ -131,14 +131,15 @@ getFirstDefinitionForLinker(const GlobalValueSummaryList &GVSummaryList) { static void computePrevailingCopies( const ModuleSummaryIndex &Index, DenseMap<GlobalValue::GUID, const GlobalValueSummary *> &PrevailingCopy) { - auto HasMultipleCopies = [&](const GlobalValueSummaryList &GVSummaryList) { - return GVSummaryList.size() > 1; - }; + auto HasMultipleCopies = + [&](ArrayRef<std::unique_ptr<GlobalValueSummary>> GVSummaryList) { + return GVSummaryList.size() > 1; + }; for (auto &I : Index) { - if (HasMultipleCopies(I.second.SummaryList)) + if (HasMultipleCopies(I.second.getSummaryList())) PrevailingCopy[I.first] = - getFirstDefinitionForLinker(I.second.SummaryList); + getFirstDefinitionForLinker(I.second.getSummaryList()); } } diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 1bff6cd..f78d9b0 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1512,6 +1512,11 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. + copyModuleAttrToFunctions(*SrcM); + std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); @@ -1677,6 +1682,11 @@ IRMover::IRMover(Module &M) : Composite(M) { for (const auto *MD : StructTypes.getVisitedMetadata()) { SharedMDs[MD].reset(const_cast<MDNode *>(MD)); } + + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. + copyModuleAttrToFunctions(M); } Error IRMover::move(std::unique_ptr<Module> Src, diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp new file mode 100644 index 0000000..95ecda2 --- /dev/null +++ b/llvm/lib/Support/AllocToken.cpp @@ -0,0 +1,50 @@ +//===- AllocToken.cpp - Allocation Token Calculation ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of AllocToken modes and shared calculation of stateless token IDs. 
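+// Stateless token IDs are derived from a stable SipHash of the type name,
+// reduced modulo the maximum token count; stateful modes (Increment, Random)
+// have no pure-function equivalent and yield no token here.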
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/AllocToken.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SipHash.h" + +using namespace llvm; + +static uint64_t getStableHash(const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + return getStableSipHash(Metadata.TypeName) % MaxTokens; +} + +std::optional<uint64_t> llvm::getAllocToken(AllocTokenMode Mode, + const AllocTokenMetadata &Metadata, + uint64_t MaxTokens) { + assert(MaxTokens && "Must provide non-zero max tokens"); + + switch (Mode) { + case AllocTokenMode::Increment: + case AllocTokenMode::Random: + // Stateful modes cannot be implemented as a pure function. + return std::nullopt; + + case AllocTokenMode::TypeHash: + return getStableHash(Metadata, MaxTokens); + + case AllocTokenMode::TypeHashPointerSplit: { + if (MaxTokens == 1) + return 0; + const uint64_t HalfTokens = MaxTokens / 2; + uint64_t Hash = getStableHash(Metadata, HalfTokens); + if (Metadata.ContainsPointer) + Hash += HalfTokens; + return Hash; + } + } + + llvm_unreachable(""); +} diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 42b21b5..671a5fe 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -149,6 +149,7 @@ add_llvm_component_library(LLVMSupport AArch64BuildAttributes.cpp ARMAttributeParser.cpp ARMWinEH.cpp + AllocToken.cpp Allocator.cpp AutoConvert.cpp Base64.cpp diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 549c418..f74e52a 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -111,7 +111,7 @@ Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) { +void SpecialCaseList::Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b..a81de5c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N, static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc DL(N); + + // If a DUP(Op0) already exists, reuse it for the scalar_to_vector. + if (DCI.isAfterLegalizeDAG()) { + if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(), + N->getOperand(0))) + return SDValue(LN, 0); + } + // Let's do below transform. // // t34: v4i32 = AArch64ISD::UADDLV t2 @@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Let's generate new sequence with AArch64ISD::NVCAST. - SDLoc DL(N); SDValue EXTRACT_SUBVEC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV, DAG.getConstant(0, DL, MVT::i64)); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 479e345..e3370d3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5722,7 +5722,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( } // Add additional cost for the extends that would need to be inserted. 
-  return Cost + 4;
+  return Cost + 2;
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+  BarrierLatency() = default;
+  void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+  constexpr unsigned SyntheticLatency = 2000;
+  for (SUnit &SU : DAG->SUnits) {
+    const MachineInstr *MI = SU.getInstr();
+    if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+      continue;
+
+    // Update latency on barrier edges of ATOMIC_FENCE.
+    // We don't consider the scope of the fence or type of instruction
+    // involved in the barrier edge.
+    for (SDep &PredDep : SU.Preds) {
+      if (!PredDep.isBarrier())
+        continue;
+      SUnit *PredSU = PredDep.getSUnit();
+      MachineInstr *PredMI = PredSU->getInstr();
+      // Only consider memory loads.
+      if (!PredMI->mayLoad() || PredMI->mayStore())
+        continue;
+      SDep ForwardD = PredDep;
+      ForwardD.setSUnit(&SU);
+      for (SDep &SuccDep : PredSU->Succs) {
+        if (SuccDep == ForwardD) {
+          SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+          break;
+        }
+      }
+      PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+      PredSU->setDepthDirty();
+      SU.setDepthDirty();
+    }
+  }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+  return std::make_unique<BarrierLatency>();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..996b55f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" @@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..a1e0e52 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 50447f4..2ff2d2f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4032,28 +4032,31 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { } } +/// Helper struct for the implementation of 3-address conversion to communicate +/// updates made to instruction operands. +struct SIInstrInfo::ThreeAddressUpdates { + /// Other instruction whose def is no longer used by the converted + /// instruction. + MachineInstr *RemoveMIUse = nullptr; +}; + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); - unsigned Opc = MI.getOpcode(); + ThreeAddressUpdates U; + MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); - // Handle MFMA. 
- int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc != -1) { - MachineInstrBuilder MIB = - BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); + if (NewMI) { + updateLiveVariables(LV, MI, *NewMI); if (LIS) { - LIS->ReplaceMachineInstrInMaps(MI, *MIB); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); // SlotIndex of defs needs to be updated when converting to early-clobber - MachineOperand &Def = MIB->getOperand(0); + MachineOperand &Def = NewMI->getOperand(0); if (Def.isEarlyClobber() && Def.isReg() && LIS->hasInterval(Def.getReg())) { - SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false); - SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true); + SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false); + SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true); auto &LI = LIS->getInterval(Def.getReg()); auto UpdateDefIndex = [&](LiveRange &LR) { auto *S = LR.find(OldIndex); @@ -4068,6 +4071,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, UpdateDefIndex(SR); } } + } + + if (U.RemoveMIUse) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + Register DefReg = U.RemoveMIUse->getOperand(0).getReg(); + + if (MRI.hasOneNonDBGUse(DefReg)) { + // We cannot just remove the DefMI here, calling pass will crash. + U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF)); + U.RemoveMIUse->getOperand(0).setIsDead(true); + for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I) + U.RemoveMIUse->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); + } + + if (LIS) { + LiveInterval &DefLI = LIS->getInterval(DefReg); + + // We cannot delete the original instruction here, so hack out the use + // in the original instruction with a dummy register so we can use + // shrinkToUses to deal with any multi-use edge cases. Other targets do + // not have the complexity of deleting a use to consider here. + Register DummyReg = MRI.cloneVirtualRegister(DefReg); + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + + LIS->shrinkToUses(&DefLI); + } + } + + return NewMI; +} + +MachineInstr * +SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &U) const { + MachineBasicBlock &MBB = *MI.getParent(); + unsigned Opc = MI.getOpcode(); + + // Handle MFMA. 
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); return MIB; } @@ -4077,11 +4132,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) MIB->addOperand(MI.getOperand(I)); - - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; } @@ -4152,39 +4202,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&]() -> void { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - // The only user is the instruction which will be killed. - Register DefReg = DefMI->getOperand(0).getReg(); - - if (MRI.hasOneNonDBGUse(DefReg)) { - // We cannot just remove the DefMI here, calling pass will crash. - DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); - DefMI->getOperand(0).setIsDead(true); - for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->removeOperand(I); - if (LV) - LV->getVarInfo(DefReg).AliveBlocks.clear(); - } - - if (LIS) { - LiveInterval &DefLI = LIS->getInterval(DefReg); - - // We cannot delete the original instruction here, so hack out the use - // in the original instruction with a dummy register so we can use - // shrinkToUses to deal with any multi-use edge cases. Other targets do - // not have the complexity of deleting a use to consider here. - Register DummyReg = MRI.cloneVirtualRegister(DefReg); - for (MachineOperand &MIOp : MI.uses()) { - if (MIOp.isReg() && MIOp.getReg() == DefReg) { - MIOp.setIsUndef(true); - MIOp.setReg(DummyReg); - } - } - - LIS->shrinkToUses(&DefLI); - } - }; int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { @@ -4196,10 +4213,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src1) .addImm(Imm) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4212,11 +4226,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4235,12 +4245,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - if (DefMI) - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4269,9 +4274,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? 
OpSel->getImm() : 0); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); return MIB; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index df27ec1..e1d7a07 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -88,6 +88,8 @@ private: }; class SIInstrInfo final : public AMDGPUGenInstrInfo { + struct ThreeAddressUpdates; + private: const SIRegisterInfo RI; const GCNSubtarget &ST; @@ -190,6 +192,9 @@ private: bool resultDependsOnExec(const MachineInstr &MI) const; + MachineInstr *convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &Updates) const; + protected: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 74d4153..6f1feb1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2223,8 +2223,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index bdbc000..07264d9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -397,12 +397,6 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; - /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -583,12 +577,8 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order, bool AtomicsOnly) const override; - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -2069,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. 
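// Illustrative sketch (not part of this patch): the transitivity case the
// comment above describes, phrased with C++11 atomics. Wave A and wave B
// are in the same workgroup (the same CU in CU mode); wave C is elsewhere
// on the device. All variable names here are hypothetical.
//
//   int Data = 0;
//   std::atomic<int> Flag1{0}, Flag2{0};
//
//   // Wave A:  Data = 1;
//   //          Flag1.store(1, std::memory_order_release);   // workgroup scope
//   // Wave B:  if (Flag1.load(std::memory_order_acquire))   // workgroup scope
//   //            Flag2.store(1, std::memory_order_release); // device scope
//   // Wave C:  if (Flag2.load(std::memory_order_acquire))
//   //            assert(Data == 1);                         // by transitivity
//
// If A's workgroup-scope release skipped the wait in CU mode, its store to
// Data could still be in flight when B performs the device-scope release,
// and wave C's assert could fire.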
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2225,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2419,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. 
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -3017,11 +2999,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; - } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba..7cce033 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 1f773e2..3368a50 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() { auto *BTIValue = mdconst::extract_or_null<ConstantInt>( SourceModule->getModuleFlag("branch-target-enforcement")); - if (BTIValue && BTIValue->isOne()) { + if (BTIValue && !BTIValue->isZero()) { // If "+pacbti" is used as an architecture extension, // Tag_BTI_extension is emitted in // ARMTargetStreamer::emitTargetAttributes(). 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 35e1127..b1a668e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1089,7 +1089,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, // Register based DivRem for AEABI (RTABI 4.2) if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isOSWindows()) { + TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; @@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); + setOperationAction(ISD::LROUND, MVT::f16, Expand); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); @@ -20574,7 +20575,7 @@ static TargetLowering::ArgListTy getDivRemArgList( SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || - Subtarget->isTargetWindows()) && + Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 96ee69c..597d311 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI, continue; // Skip the lr predicate reg int PIdx = llvm::findFirstVPTPredOperandIdx(MI); - if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) + if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg) continue; // Check that this instruction will produce zeros in its false lanes: @@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { while (!Worklist.empty()) { MachineInstr *MI = Worklist.pop_back_val(); if (MI->getOpcode() == ARM::MQPRCopy) { + LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI); VMOVCopies.insert(MI); MachineInstr *CopySrc = RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); @@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); VMOVCopies.clear(); return false; + } else if (isVectorPredicated(MI)) { + // If this is a predicated instruction with merging semantics, + // check where it gets its false lanes from, if any. 
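// Illustrative sketch (not part of this patch): "merging semantics" here
// means the instruction carries a vpred_r predicate bundle with an
// $inactive operand, so lanes where the VPT predicate is false keep the
// $inactive operand's lane values instead of being zeroed. Schematically
// (instruction and operand names hypothetical):
//
//   %dst = VADDt1 %a, %b, vpred_r(%mask, $inactive = %c)
//   // lane i of %dst == %mask[i] ? (%a[i] + %b[i]) : %c[i]
//
// %dst therefore only has zeroed false lanes if %c does, which is why the
// reaching definition of the $inactive operand is pushed onto the worklist
// in the code that follows.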
+ int InactiveIdx = findVPTInactiveOperandIdx(*MI); + if (InactiveIdx != -1) { + SmallPtrSet<MachineInstr *, 2> Defs; + MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( + MI, MI->getOperand(InactiveIdx).getReg()); + if (FalseSrc) { + LLVM_DEBUG(dbgs() + << " Must check source of false lanes for: " << *MI); + Worklist.push_back(FalseSrc); + } + } } } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index b2d368e..4a0883c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -343,6 +343,7 @@ public: bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 5eeb4fe..413e844 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, Register LR = LoopPhi->getOperand(0).getReg(); for (MachineInstr *MI : MVEInstrs) { int Idx = findFirstVPTPredOperandIdx(*MI); - MI->getOperand(Idx + 2).setReg(LR); + MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR); } } diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 431ce38..f5653d4 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { return -1; } +int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + return i + ARM::SUBOP_vpred_r_inactive; + + return -1; +} + ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3ec3a621..1b0bf2d 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { Register PredReg; return getVPTInstrPredicate(MI, PredReg); } +// Identify the input operand in an MVE predicated instruction which +// contributes the values of any inactive vector lanes. +int findVPTInactiveOperandIdx(const MachineInstr &MI); // Recomputes the Block Mask of Instr, a VPT or VPST instruction. 
// This rebuilds the block mask of the instruction depending on the predicates diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index c8866bf..42e90f0 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -294,6 +294,14 @@ public: if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) RootSignature->eraseFromParent(); + // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and + // causes all tests using the DXIL Validator to fail. + // + // This is a temporary fix and should be replaced with a whitelist once + // we have determined all metadata that the DXIL Validator allows + if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) + ErrNo->eraseFromParent(); + return true; } diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index d758260..1a5f096 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp + HexagonQFPOptimizer.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h index 109aba5..422ab20 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.h +++ b/llvm/lib/Target/Hexagon/Hexagon.h @@ -67,6 +67,8 @@ void initializeHexagonPeepholePass(PassRegistry &); void initializeHexagonSplitConst32AndConst64Pass(PassRegistry &); void initializeHexagonVectorPrintPass(PassRegistry &); +void initializeHexagonQFPOptimizerPass(PassRegistry &); + Pass *createHexagonLoopIdiomPass(); Pass *createHexagonVectorLoopCarriedReuseLegacyPass(); @@ -112,6 +114,7 @@ FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); FunctionPass *createHexagonExpandCondsets(); +FunctionPass *createHexagonQFPOptimizer(); } // end namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 4ddbe7a..ff876f6 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -920,6 +920,10 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, // successors have been processed. RegisterSet BlockDefs, InsDefs; for (MachineInstr &MI : *B) { + // Stop if the map size is too large. + if (IFMap.size() >= MaxIFMSize) + break; + InsDefs.clear(); getInstrDefs(&MI, InsDefs); // Leave those alone. They are more transparent than "insert". @@ -942,8 +946,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, findRecordInsertForms(VR, AVs); // Stop if the map size is too large. 
-    if (IFMap.size() > MaxIFMSize)
-      return;
+    if (IFMap.size() >= MaxIFMSize)
+      break;
   }
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index a94e131..54c8972 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -117,8 +117,10 @@ HexagonTargetLowering::initializeHVXLowering() {
     setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
-  if (Subtarget.useHVX128BOps())
+  if (Subtarget.useHVX128BOps()) {
     setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v64i1, Custom);
+  }
 
   if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() &&
       Subtarget.useHVXFloatingPoint()) {
@@ -2024,13 +2026,9 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcast from i32, v2i16, and v4i8 to v32i1.
   // Splat the input into a 32-element i32 vector, then AND each element
   // with a unique bitmask to isolate individual bits.
-  if (ResTy == MVT::v32i1 &&
-      (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
-      Subtarget.useHVX128BOps()) {
-    SDValue Val32 = Val;
-    if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
-      Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
-
+  auto bitcastI32ToV32I1 = [&](SDValue Val32) {
+    assert(Val32.getValueType().getSizeInBits() == 32 &&
+           "Input must be 32 bits");
     MVT VecTy = MVT::getVectorVT(MVT::i32, 32);
     SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Val32);
     SmallVector<SDValue, 32> Mask;
@@ -2039,7 +2037,31 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
     SDValue MaskVec = DAG.getBuildVector(VecTy, dl, Mask);
     SDValue Anded = DAG.getNode(ISD::AND, dl, VecTy, Splat, MaskVec);
-    return DAG.getNode(HexagonISD::V2Q, dl, ResTy, Anded);
+    return DAG.getNode(HexagonISD::V2Q, dl, MVT::v32i1, Anded);
+  };
+  // === Case: v32i1 ===
+  if (ResTy == MVT::v32i1 &&
+      (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
+      Subtarget.useHVX128BOps()) {
+    SDValue Val32 = Val;
+    if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
+      Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
+    return bitcastI32ToV32I1(Val32);
+  }
+  // === Case: v64i1 ===
+  if (ResTy == MVT::v64i1 && ValTy == MVT::i64 && Subtarget.useHVX128BOps()) {
+    // Split i64 into lo/hi 32-bit halves.
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Val);
+    SDValue HiShifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Val,
+                                    DAG.getConstant(32, dl, MVT::i64));
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, HiShifted);
+
+    // Reuse the same 32-bit logic twice.
+    SDValue LoRes = bitcastI32ToV32I1(Lo);
+    SDValue HiRes = bitcastI32ToV32I1(Hi);
+
+    // Concatenate into a v64i1 predicate.
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoRes, HiRes);
   }
 
   if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) {
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
new file mode 100644
index 0000000..479ac90
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -0,0 +1,334 @@
+//===- HexagonQFPOptimizer.cpp - QFP to IEEE-FP conversion optimizer -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic infrastructure for optimizing intermediate conversion instructions
+// generated while performing vector floating-point operations.
+// It runs at the start of code generation for Hexagon, cleans up redundant
+// conversion instructions, and replaces their uses with the appropriate
+// machine operand. Liveness is preserved after this pass.
+//
+// @note: The redundant conversion instructions are not eliminated in this
+// pass. We only replace the uses of conversion instructions with their
+// appropriate QFP instructions, leaving it to the Dead Instruction
+// Elimination pass to remove the now-redundant conversion instructions.
+//
+// Brief overview of how this QFP optimizer works:
+// It iterates over each instruction and checks whether it belongs to the
+// Hexagon HVX floating-point arithmetic instruction category (Add, Sub,
+// Mul). It then finds the unique definition of each machine operand of the
+// instruction.
+//
+// Example:
+// Let MachineInstr *MI be an HVX vadd instruction:
+// MI -> $v0 = V6_vadd_sf $v1, $v2
+// MachineInstr *DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg());
+// MachineInstr *DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg());
+//
+// In the above example, DefMI1 and DefMI2 give the unique definitions of
+// the operands of MI ($v1 and $v2, respectively).
+//
+// If neither definition is a conversion instruction (V6_vconv_sf_qf32,
+// V6_vconv_hf_qf16), the pass skips the current instruction and moves on
+// to the next one.
+//
+// If one of the definitions is a conversion instruction, the pass replaces
+// the arithmetic instruction with its corresponding "mix" variant.
+// In the above example, if $v1 is defined by a conversion
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// After transformation:
+// MI -> $v0 = V6_vadd_qf32_mix $v3, $v2 ($v1 is replaced with $v3)
+//
+// If both definitions are conversion instructions, the instruction is
+// replaced with its "qf" variant.
+// In the above example, if $v1 and $v2 are defined by conversions
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// DefMI2 -> $v2 = V6_vconv_sf_qf32 $v4
+// After transformation:
+// MI -> $v0 = V6_vadd_qf32 $v3, $v4 ($v1 is replaced with $v3, $v2 is
+// replaced with $v4)
+//
+// This pass does not currently handle the case where the definitions are
+// PHI instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <vector>
+
+#define DEBUG_TYPE "hexagon-qfp-optimizer"
+#define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableQFOptimizer("disable-qfp-opt", cl::init(false),
+                       cl::desc("Disable optimization of Qfloat operations."));
+
+namespace {
+const std::map<unsigned short, unsigned short> QFPInstMap{
+    {Hexagon::V6_vadd_hf, Hexagon::V6_vadd_qf16_mix},
+    {Hexagon::V6_vadd_qf16_mix, Hexagon::V6_vadd_qf16},
+    {Hexagon::V6_vadd_sf, Hexagon::V6_vadd_qf32_mix},
+    {Hexagon::V6_vadd_qf32_mix, Hexagon::V6_vadd_qf32},
+    {Hexagon::V6_vsub_hf, Hexagon::V6_vsub_qf16_mix},
+    {Hexagon::V6_vsub_qf16_mix, Hexagon::V6_vsub_qf16},
+    {Hexagon::V6_vsub_sf, Hexagon::V6_vsub_qf32_mix},
+    {Hexagon::V6_vsub_qf32_mix, Hexagon::V6_vsub_qf32},
+    {Hexagon::V6_vmpy_qf16_hf, Hexagon::V6_vmpy_qf16_mix_hf},
+    {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16},
+    {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf},
+    {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16},
+    {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
+} // namespace
+
+namespace llvm {
+
+FunctionPass *createHexagonQFPOptimizer();
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+
+} // namespace llvm
+
+namespace {
+
+struct HexagonQFPOptimizer : public MachineFunctionPass {
+public:
+  static char ID;
+
+  HexagonQFPOptimizer() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB);
+
+  StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  const HexagonSubtarget *HST = nullptr;
+  const HexagonInstrInfo *HII = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+};
+
+char HexagonQFPOptimizer::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(HexagonQFPOptimizer, "hexagon-qfp-optimizer",
+                HEXAGON_QFP_OPTIMIZER, false, false)
+
+FunctionPass *llvm::createHexagonQFPOptimizer() {
+  return new HexagonQFPOptimizer();
+}
+
+bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
+                                      MachineBasicBlock *MBB) {
+
+  // Early exit if the instruction has too few operands (QFP ops need two
+  // sources plus one dest) or has no transformation mapping.
+ if (MI->getNumOperands() < 3) + return false; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; + + unsigned Op0F = 0; + unsigned Op1F = 0; + // Get the reaching defs of MI, DefMI1 and DefMI2 + MachineInstr *DefMI1 = nullptr; + MachineInstr *DefMI2 = nullptr; + + if (MI->getOperand(1).isReg()) + DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg()); + if (MI->getOperand(2).isReg()) + DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg()); + if (!DefMI1 || !DefMI2) + return false; + + MachineOperand &Res = MI->getOperand(0); + MachineInstr *Inst1 = nullptr; + MachineInstr *Inst2 = nullptr; + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); + DefMI2->dump()); + + // Get the reaching defs of DefMI + if (DefMI1->getNumOperands() > 1 && DefMI1->getOperand(1).isReg() && + DefMI1->getOperand(1).getReg().isVirtual()) + Inst1 = MRI->getVRegDef(DefMI1->getOperand(1).getReg()); + + if (DefMI2->getNumOperands() > 1 && DefMI2->getOperand(1).isReg() && + DefMI2->getOperand(1).getReg().isVirtual()) + Inst2 = MRI->getVRegDef(DefMI2->getOperand(1).getReg()); + + unsigned Def1OP = DefMI1->getOpcode(); + unsigned Def2OP = DefMI2->getOpcode(); + + MachineInstrBuilder MIB; + // Case 1: Both reaching defs of MI are qf to sf/hf conversions + if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && + Def2OP == Hexagon::V6_vconv_sf_qf32) || + (Def1OP == Hexagon::V6_vconv_hf_qf16 && + Def2OP == Hexagon::V6_vconv_hf_qf16)) { + + // If the reaching defs of DefMI are W register type, we return + if ((Inst1 && Inst1->getNumOperands() > 0 && Inst1->getOperand(0).isReg() && + MRI->getRegClass(Inst1->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) || + (Inst2 && Inst2->getNumOperands() > 0 && Inst2->getOperand(0).isReg() && + MRI->getRegClass(Inst2->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass)) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &Src1 = DefMI1->getOperand(1); + MachineOperand &Src2 = DefMI2->getOperand(1); + + Op0F = getKillRegState(Src1.isKill()); + Src1.setIsKill(false); + + Op1F = getKillRegState(Src2.isKill()); + Src2.setIsKill(false); + + if (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf) { + auto OuterIt = QFPInstMap.find(MI->getOpcode()); + if (OuterIt == QFPInstMap.end()) + return false; + auto InnerIt = QFPInstMap.find(OuterIt->second); + if (InnerIt == QFPInstMap.end()) + return false; + InstTy = InnerIt->second; + } + + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + + // Case 2: Left operand is conversion to sf/hf + } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && + Def2OP != Hexagon::V6_vconv_sf_qf32) || + (Def1OP == Hexagon::V6_vconv_hf_qf16 && + Def2OP != Hexagon::V6_vconv_hf_qf16)) && + !DefMI2->isPHI() && + (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { + + if (Inst1 && MRI->getRegClass(Inst1->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + MachineOperand &Src1 = DefMI1->getOperand(1); + MachineOperand &Src2 = MI->getOperand(2); + + Op0F = getKillRegState(Src1.isKill()); + Src1.setIsKill(false); + Op1F = getKillRegState(Src2.isKill()); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, 
Src2.getSubReg());
+    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+    return true;
+
+    // Case 3: Right operand is conversion to sf/hf
+  } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 &&
+               Def2OP == Hexagon::V6_vconv_sf_qf32) ||
+              (Def1OP != Hexagon::V6_vconv_hf_qf16 &&
+               Def2OP == Hexagon::V6_vconv_hf_qf16)) &&
+             !DefMI1->isPHI() &&
+             (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
+    // The second operand of the original instruction is converted.
+    // In "mix" instructions, the "qf" operand is always the first operand.
+
+    // Caveat: vsub is not commutative w.r.t. operands.
+    if (InstTy == Hexagon::V6_vsub_qf16_mix ||
+        InstTy == Hexagon::V6_vsub_qf32_mix)
+      return false;
+
+    if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
+                     &Hexagon::HvxWRRegClass)
+      return false;
+
+    MachineOperand &Src1 = MI->getOperand(1);
+    MachineOperand &Src2 = DefMI2->getOperand(1);
+
+    Op1F = getKillRegState(Src2.isKill());
+    Src2.setIsKill(false);
+    Op0F = getKillRegState(Src1.isKill());
+    MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+              .addReg(Src2.getReg(), Op1F,
+                      Src2.getSubReg()) // Notice the operands are flipped.
+              .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+    return true;
+  }
+
+  return false;
+}
+
+bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changed = false;
+
+  if (DisableQFOptimizer)
+    return Changed;
+
+  HST = &MF.getSubtarget<HexagonSubtarget>();
+  if (!HST->useHVXV68Ops() || !HST->usePackets() ||
+      skipFunction(MF.getFunction()))
+    return false;
+  HII = HST->getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  MachineFunction::iterator MBBI = MF.begin();
+  LLVM_DEBUG(dbgs() << "\n=== Running QFPOptimizer Pass for: " << MF.getName()
+                    << " Optimize intermediate conversions ===\n");
+  while (MBBI != MF.end()) {
+    MachineBasicBlock *MBB = &*MBBI;
+    MachineBasicBlock::iterator MII = MBBI->instr_begin();
+    while (MII != MBBI->instr_end()) {
+      MachineInstr *MI = &*MII;
+      ++MII; // As MI might be removed.
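// Illustrative sketch (not part of this patch): the increment above is the
// usual advance-before-erase idiom. MII is moved past MI before MI can be
// erased, so the iterator stays valid. A minimal standalone form, with
// shouldErase as a hypothetical predicate:
//
//   while (MII != MBB->instr_end()) {
//     MachineInstr *Cur = &*MII;
//     ++MII;                      // advance first...
//     if (shouldErase(Cur))
//       Cur->eraseFromParent();   // ...so erasing Cur cannot invalidate MII
//   }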
+ + if (QFPInstMap.count(MI->getOpcode()) && + MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && + MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } + } + } + ++MBBI; + } + return Changed; +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index f5d8b69..d9824a31 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -220,6 +220,7 @@ LLVMInitializeHexagonTarget() { initializeHexagonPeepholePass(PR); initializeHexagonSplitConst32AndConst64Pass(PR); initializeHexagonVectorPrintPass(PR); + initializeHexagonQFPOptimizerPass(PR); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, @@ -386,6 +387,7 @@ bool HexagonPassConfig::addInstSelector() { addPass(createHexagonGenInsert()); if (EnableEarlyIf) addPass(createHexagonEarlyIfConversion()); + addPass(createHexagonQFPOptimizer()); } return false; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8bf0d11..d477522 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && - Subtarget.hasFRE())) + if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && - Subtarget.hasFRES())) + !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { @@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::LRINT, MVT::f64, Legal); - setOperationAction(ISD::LRINT, MVT::f32, Legal); - setOperationAction(ISD::LLRINT, MVT::f64, Legal); - setOperationAction(ISD::LLRINT, MVT::f32, Legal); - setOperationAction(ISD::LROUND, MVT::f64, Legal); - setOperationAction(ISD::LROUND, MVT::f32, Legal); - setOperationAction(ISD::LLROUND, MVT::f64, Legal); - setOperationAction(ISD::LLROUND, MVT::f32, Legal); - } + + setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); @@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); // The nearbyint variants are not allowed 
to raise the inexact exception - // so we can only code-gen them with unsafe math. - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - } + // so we can only code-gen them with fpexcept.ignore. + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // be lost at this stage, but is below the single-precision rounding // position. // - // However, if -enable-unsafe-fp-math is in effect, accept double + // However, if afn is in effect, accept double // rounding to avoid the extra overhead. - if (Op.getValueType() == MVT::f32 && - !Subtarget.hasFPCVT() && - !DAG.getTarget().Options.UnsafeFPMath) { + // FIXME: Currently INT_TO_FP can't support fast math flags because + // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always + // false. + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && + !Op->getFlags().hasApproximateFuncs()) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit @@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerADDSUBO_CARRY(Op, DAG); case ISD::UCMP: return LowerUCMP(Op, DAG); + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_FNEARBYINT: + if (Op->getFlags().hasNoFPExcept()) + return Op; + return SDValue(); } } @@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR0) + .addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); @@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( .addReg(ZeroReg) .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(dest) .addReg(oldval); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(CrReg) .addMBB(exitMBB); BB->addSuccessor(loop2MBB); @@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(ptrA) .addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { const Function *F = I->getFunction(); const DataLayout &DL = F->getDataLayout(); Type *Ty = User->getOperand(0)->getType(); + bool AllowContract = I->getFastMathFlags().allowContract() && + User->getFastMathFlags().allowContract(); - return !( - isFMAFasterThanFMulAndFAdd(*F, Ty) && - isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && + 
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast)); } case Instruction::Load: { // Don't break "store (load float*)" pattern, this pattern will be combined diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 979ba31..885bed6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, // these need to be defined after the any_frint versions so ISEL will correctly // add the chain to the strict versions. -def : Pat<(f32 (fnearbyint f32:$S)), +// TODO: Match strict fp rounding intrinsics with instructions like xsrdpiz when +// rounding mode is propagated to CodeGen part. +def : Pat<(f32 (strict_fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f64 (fnearbyint f64:$S)), +def : Pat<(f64 (strict_fnearbyint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (fnearbyint v2f64:$S)), +def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)), (v2f64 (XVRDPIC $S))>; -def : Pat<(v4f32 (fnearbyint v4f32:$S)), +def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Materialize a zero-vector of long long @@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; // Rounding to integer. -def : Pat<(i64 (lrint f64:$S)), +def : Pat<(i64 (strict_lrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (lrint f32:$S)), +def : Pat<(i64 (strict_lrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (llrint f64:$S)), +def : Pat<(i64 (strict_llrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (llrint f32:$S)), +def : Pat<(i64 (strict_llrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (lround f64:$S)), +def : Pat<(i64 (strict_lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (lround f32:$S)), +def : Pat<(i64 (strict_lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i32 (lround f64:$S)), +def : Pat<(i32 (strict_lround f64:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; -def : Pat<(i32 (lround f32:$S)), +def : Pat<(i32 (strict_lround f32:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i64 (llround f64:$S)), +def : Pat<(i64 (strict_llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (llround f32:$S)), +def : Pat<(i64 (strict_llround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index e857b2d..edde7ac 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2406,7 +2406,8 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) { - if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa)) + if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI->hasFeature(RISCV::FeatureVendorXSfvfbfexp16e)) return Error( ErrorLoc, "operand must be " diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b8ec0bb..4bea4c4 
100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -654,7 +654,10 @@ static constexpr FeatureBitset XqciFeatureGroup = { static constexpr FeatureBitset XSfVectorGroup = { RISCV::FeatureVendorXSfvcp, RISCV::FeatureVendorXSfvqmaccdod, RISCV::FeatureVendorXSfvqmaccqoq, RISCV::FeatureVendorXSfvfwmaccqqq, - RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase}; + RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase, + RISCV::FeatureVendorXSfvfexpa, RISCV::FeatureVendorXSfvfexpa64e, + RISCV::FeatureVendorXSfvfbfexp16e, RISCV::FeatureVendorXSfvfexp16e, + RISCV::FeatureVendorXSfvfexp32e}; static constexpr FeatureBitset XSfSystemGroup = { RISCV::FeatureVendorXSiFivecdiscarddlone, RISCV::FeatureVendorXSiFivecflushdlone, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 50f5a5d..7b9c4b3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -220,7 +220,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || RISCVVType::getSEW(Imm) > 64 || (RISCVVType::isAltFmt(Imm) && - !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) || + !(STI.hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI.hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))) || (Imm >> 9) != 0) { O << formatImm(Imm); return; diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 5dd4bf4..98b636e 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -109,12 +109,70 @@ bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, // expanded instructions for each pseudo is correct in the Size field of the // tablegen definition for the pseudo. 
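// Illustrative sketch (not part of this patch): for the unmasked binary
// pseudos handled below, doAtomicBinOpExpansion emits the standard LR/SC
// retry loop. PseudoAtomicLoadAdd32, for example, becomes roughly:
//
//   .loop:
//     lr.w   dest, (addr)              // load-reserved
//     add    scratch, dest, incr       // the BinOp-specific step
//     sc.w   scratch, scratch, (addr)  // store-conditional
//     bnez   scratch, .loop            // retry if the reservation was lost
//
// (Xchg uses "addi scratch, incr, 0", i.e. a register move, in place of the
// add; the .aq/.rl ordering suffixes are chosen from the pseudo's
// AtomicOrdering operand.)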
switch (MBBI->getOpcode()) { + case RISCV::PseudoAtomicSwap32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 32, + NextMBBI); + case RISCV::PseudoAtomicSwap64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadSub32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadSub64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadOr32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 32, NextMBBI); + case RISCV::PseudoAtomicLoadOr64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadXor32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadXor64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 64, + NextMBBI); case RISCV::PseudoAtomicLoadNand32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32, NextMBBI); case RISCV::PseudoAtomicLoadNand64: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 64, + NextMBBI); case RISCV::PseudoMaskedAtomicSwap32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32, NextMBBI); @@ -277,6 +335,36 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, switch (BinOp) { default: llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(IncrReg) + .addImm(0); + break; + case AtomicRMWInst::Add: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Sub: + BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::And: + BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Or: + BuildMI(LoopMBB, DL, TII->get(RISCV::OR), 
ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Xor: + BuildMI(LoopMBB, DL, TII->get(RISCV::XOR), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; case AtomicRMWInst::Nand: BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) .addReg(DestReg) @@ -433,38 +521,85 @@ static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL, .addReg(ShamtReg); } -bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, - MachineBasicBlock::iterator &NextMBBI) { - assert(IsMasked == true && - "Should only need to expand masked atomic max/min"); - assert(Width == 32 && "Should never need to expand masked 64-bit operations"); +static void doAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + AtomicOrdering Ordering = + static_cast<AtomicOrdering>(MI.getOperand(4).getImm()); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - MachineFunction *MF = MBB.getParent(); - auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + // .loophead: + // lr.[w|d] dest, (addr) + // mv scratch, dest + // ifnochangeneeded scratch, incr, .looptail + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width, STI)), DestReg) + .addReg(AddrReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(DestReg) + .addImm(0); + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Max: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::Min: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::UMax: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } - // Insert new MBBs. - MF->insert(++MBB.getIterator(), LoopHeadMBB); - MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); - MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); - MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + // .loopifbody: + // mv scratch, incr + BuildMI(LoopIfBodyMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(IncrReg) + .addImm(0); - // Set up successors and transfer remaining instructions to DoneMBB. 
- LoopHeadMBB->addSuccessor(LoopIfBodyMBB); - LoopHeadMBB->addSuccessor(LoopTailMBB); - LoopIfBodyMBB->addSuccessor(LoopTailMBB); - LoopTailMBB->addSuccessor(LoopHeadMBB); - LoopTailMBB->addSuccessor(DoneMBB); - DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); - DoneMBB->transferSuccessors(&MBB); - MBB.addSuccessor(LoopHeadMBB); + // .looptail: + // sc.[w|d] scratch, scratch, (addr) + // bnez scratch, loop + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), + ScratchReg) + .addReg(ScratchReg) + .addReg(AddrReg); + BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(RISCV::X0) + .addMBB(LoopHeadMBB); +} +static void doMaskedAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + assert(Width == 32 && "Should never need to expand masked 64-bit operations"); Register DestReg = MI.getOperand(0).getReg(); Register Scratch1Reg = MI.getOperand(1).getReg(); Register Scratch2Reg = MI.getOperand(2).getReg(); @@ -541,6 +676,44 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( .addReg(Scratch1Reg) .addReg(RISCV::X0) .addMBB(LoopHeadMBB); +} + +bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs. + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); + MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); + MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. 
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB); + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopIfBodyMBB->addSuccessor(LoopTailMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); + LoopTailMBB->addSuccessor(DoneMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); + + if (!IsMasked) + doAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, LoopIfBodyMBB, + LoopTailMBB, DoneMBB, BinOp, Width, STI); + else + doMaskedAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, + LoopIfBodyMBB, LoopTailMBB, DoneMBB, BinOp, + Width, STI); NextMBBI = MBB.end(); MI.eraseFromParent(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 19992e6..9e6b7f0 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -218,6 +218,7 @@ def HasStdExtZaamo : Predicate<"Subtarget->hasStdExtZaamo()">, AssemblerPredicate<(any_of FeatureStdExtZaamo), "'Zaamo' (Atomic Memory Operations)">; +def NoStdExtZaamo : Predicate<"!Subtarget->hasStdExtZaamo()">; def FeatureStdExtZalrsc : RISCVExtension<1, 0, "Load-Reserved/Store-Conditional">; @@ -1334,6 +1335,44 @@ def HasVendorXSfvfnrclipxfqf AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; +// Note: XSfvfbfexp16e depends on either Zvfbfmin _or_ Zvfbfa, which cannot be expressed here in +// TableGen. Instead, we check that in RISCVISAInfo. +def FeatureVendorXSfvfbfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, BFloat16">; +def HasVendorXSfvfbfexp16e : Predicate<"Subtarget->hasVendorXSfvfbfexp16e()">; + +def FeatureVendorXSfvfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Half Precision", + [FeatureStdExtZvfh]>; +def HasVendorXSfvfexp16e : Predicate<"Subtarget->hasVendorXSfvfexp16e()">; + +def FeatureVendorXSfvfexp32e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Single Precision", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexp32e : Predicate<"Subtarget->hasVendorXSfvfexp32e()">; + +def HasVendorXSfvfexpAnyFloat : Predicate<"Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">; +def HasVendorXSfvfexpAny : Predicate<"Subtarget->hasVendorXSfvfbfexp16e() || Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">, + AssemblerPredicate<(any_of FeatureVendorXSfvfbfexp16e, FeatureVendorXSfvfexp16e, FeatureVendorXSfvfexp32e), + "'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction)">; + +def FeatureVendorXSfvfexpa + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexpa : Predicate<"Subtarget->hasVendorXSfvfexpa()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfexpa), + "'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)">; + +def FeatureVendorXSfvfexpa64e + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision", + [FeatureVendorXSfvfexpa, FeatureStdExtZve64d]>; +def HasVendorXSfvfexpa64e : Predicate<"Subtarget->hasVendorXSfvfexpa64e()">; + def FeatureVendorXSiFivecdiscarddlone : RISCVExtension<1, 0, "SiFive sf.cdiscard.d.l1 Instruction", []>; @@ -1864,7 +1903,7 @@ def FeatureForcedAtomics : SubtargetFeature< "forced-atomics", 
"HasForcedAtomics", "true", "Assume that lock-free native-width atomics are available">; def HasAtomicLdSt - : Predicate<"Subtarget->hasStdExtA() || Subtarget->hasForcedAtomics()">; + : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 169465e..26fe9ed 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, else if (Subtarget.hasStdExtZicbop()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); - if (Subtarget.hasStdExtA()) { + if (Subtarget.hasStdExtZalrsc()) { setMaxAtomicSizeInBitsSupported(Subtarget.getXLen()); if (Subtarget.hasStdExtZabha() && Subtarget.hasStdExtZacas()) setMinCmpXchgSizeInBits(8); @@ -1558,7 +1558,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } - if (Subtarget.hasStdExtA()) + if (Subtarget.hasStdExtZaamo()) setOperationAction(ISD::ATOMIC_LOAD_SUB, XLenVT, Expand); if (Subtarget.hasForcedAtomics()) { @@ -12649,10 +12649,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, LoVT, Lo); Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi); // Reassemble the low and high pieces reversed. - // FIXME: This is a CONCAT_VECTORS. - SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0); - return DAG.getInsertSubvector(DL, Res, Lo, - LoVT.getVectorMinNumElements()); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Hi, Lo); } // Just promote the int type to i16 which will double the LMUL. @@ -21878,7 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( // result is then sign extended to XLEN. With +A, the minimum width is // 32 for both 64 and 32. assert(getMinCmpXchgSizeInBits() == 32); - assert(Subtarget.hasStdExtA()); + assert(Subtarget.hasStdExtZalrsc()); return Op.getValueSizeInBits() - 31; } break; @@ -24047,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - std::pair<Register, const TargetRegisterClass *> Res = - TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - - // If we picked one of the Zfinx register classes, remap it to the GPR class. - // FIXME: When Zfinx is supported in CodeGen this will need to take the - // Subtarget into account. - if (Res.second == &RISCV::GPRF16RegClass || - Res.second == &RISCV::GPRF32RegClass || - Res.second == &RISCV::GPRPairRegClass) - return std::make_pair(Res.first, &RISCV::GPRRegClass); - - return Res; + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } InlineAsm::ConstraintCode @@ -24485,6 +24471,25 @@ ISD::NodeType RISCVTargetLowering::getExtendForAtomicCmpSwapArg() const { return Subtarget.hasStdExtZacas() ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; } +ISD::NodeType RISCVTargetLowering::getExtendForAtomicRMWArg(unsigned Op) const { + // Zaamo will use amo<op>.w which does not require extension. + if (Subtarget.hasStdExtZaamo() || Subtarget.hasForcedAtomics()) + return ISD::ANY_EXTEND; + + // Zalrsc pseudo expansions with comparison require sign-extension. 
+ assert(Subtarget.hasStdExtZalrsc()); + switch (Op) { + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + return ISD::SIGN_EXTEND; + default: + break; + } + return ISD::ANY_EXTEND; +} + Register RISCVTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return RISCV::X10; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3f81ed7..9e3e2a9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -245,6 +245,7 @@ public: } ISD::NodeType getExtendForAtomicCmpSwapArg() const override; + ISD::NodeType getExtendForAtomicRMWArg(unsigned Op) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 12f776b..912b82d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1689,42 +1689,44 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, // instruction opcode. Otherwise, return RISCV::INSTRUCTION_LIST_END. // TODO: Support more operations. unsigned getPredicatedOpcode(unsigned Opcode) { + // clang-format off switch (Opcode) { - case RISCV::ADD: return RISCV::PseudoCCADD; break; - case RISCV::SUB: return RISCV::PseudoCCSUB; break; - case RISCV::SLL: return RISCV::PseudoCCSLL; break; - case RISCV::SRL: return RISCV::PseudoCCSRL; break; - case RISCV::SRA: return RISCV::PseudoCCSRA; break; - case RISCV::AND: return RISCV::PseudoCCAND; break; - case RISCV::OR: return RISCV::PseudoCCOR; break; - case RISCV::XOR: return RISCV::PseudoCCXOR; break; - - case RISCV::ADDI: return RISCV::PseudoCCADDI; break; - case RISCV::SLLI: return RISCV::PseudoCCSLLI; break; - case RISCV::SRLI: return RISCV::PseudoCCSRLI; break; - case RISCV::SRAI: return RISCV::PseudoCCSRAI; break; - case RISCV::ANDI: return RISCV::PseudoCCANDI; break; - case RISCV::ORI: return RISCV::PseudoCCORI; break; - case RISCV::XORI: return RISCV::PseudoCCXORI; break; - - case RISCV::ADDW: return RISCV::PseudoCCADDW; break; - case RISCV::SUBW: return RISCV::PseudoCCSUBW; break; - case RISCV::SLLW: return RISCV::PseudoCCSLLW; break; - case RISCV::SRLW: return RISCV::PseudoCCSRLW; break; - case RISCV::SRAW: return RISCV::PseudoCCSRAW; break; - - case RISCV::ADDIW: return RISCV::PseudoCCADDIW; break; - case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break; - case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break; - case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; - - case RISCV::ANDN: return RISCV::PseudoCCANDN; break; - case RISCV::ORN: return RISCV::PseudoCCORN; break; - case RISCV::XNOR: return RISCV::PseudoCCXNOR; break; - - case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; break; - case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; break; + case RISCV::ADD: return RISCV::PseudoCCADD; + case RISCV::SUB: return RISCV::PseudoCCSUB; + case RISCV::SLL: return RISCV::PseudoCCSLL; + case RISCV::SRL: return RISCV::PseudoCCSRL; + case RISCV::SRA: return RISCV::PseudoCCSRA; + case RISCV::AND: return RISCV::PseudoCCAND; + case RISCV::OR: return RISCV::PseudoCCOR; + case RISCV::XOR: return RISCV::PseudoCCXOR; + + case RISCV::ADDI: return RISCV::PseudoCCADDI; + case RISCV::SLLI: return RISCV::PseudoCCSLLI; + case RISCV::SRLI: return RISCV::PseudoCCSRLI; + case RISCV::SRAI: return RISCV::PseudoCCSRAI; + case RISCV::ANDI: return 
RISCV::PseudoCCANDI; + case RISCV::ORI: return RISCV::PseudoCCORI; + case RISCV::XORI: return RISCV::PseudoCCXORI; + + case RISCV::ADDW: return RISCV::PseudoCCADDW; + case RISCV::SUBW: return RISCV::PseudoCCSUBW; + case RISCV::SLLW: return RISCV::PseudoCCSLLW; + case RISCV::SRLW: return RISCV::PseudoCCSRLW; + case RISCV::SRAW: return RISCV::PseudoCCSRAW; + + case RISCV::ADDIW: return RISCV::PseudoCCADDIW; + case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; + case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; + case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; + + case RISCV::ANDN: return RISCV::PseudoCCANDN; + case RISCV::ORN: return RISCV::PseudoCCORN; + case RISCV::XNOR: return RISCV::PseudoCCXNOR; + + case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; + case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; } + // clang-format on return RISCV::INSTRUCTION_LIST_END; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 571d72f..5c81a09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -158,9 +158,9 @@ class seq_cst_store<PatFrag base> } } // IsAtomic = 1 -// Atomic load/store are available under both +a and +force-atomics. -// Fences will be inserted for atomic load/stores according to the logic in -// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. +// Atomic load/store are available under +zalrsc (thus also +a) and +// +force-atomics. Fences will be inserted for atomic load/stores according to +// the logic in RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. // The normal loads/stores are relaxed (unordered) loads/stores that don't have // any ordering. This is necessary because AtomicExpandPass has added fences to // atomic load/stores and changed them to unordered ones. 
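The three-block loop emitted above for the unmasked min/max expansion (.loophead compare, .loopifbody move, .looptail store-conditional) has a direct C++ analog: an LR/SC pair behaves like a compare-exchange retry loop. A minimal sketch for AtomicRMWInst::Max, assuming nothing beyond the standard library (illustrative only, not the pass's API):

#include <atomic>

// .loophead:   lr.[w|d] dest, (addr)            -> initial load / CAS reload
// .loopifbody: mv scratch, incr                 -> pick the larger value
// .looptail:   sc.[w|d] scratch, scratch, (addr) -> compare_exchange_weak
//              bnez scratch, .loophead          -> retry on failure
int atomicMax(std::atomic<int> &Addr, int Incr) {
  int Dest = Addr.load(std::memory_order_relaxed);
  for (;;) {
    int Scratch = Dest;               // mv scratch, dest
    if (!(Scratch >= Incr))           // bge scratch, incr, .looptail
      Scratch = Incr;                 // .loopifbody taken
    if (Addr.compare_exchange_weak(Dest, Scratch))
      return Dest;                    // dest still holds the old value
  }
}

Because .loophead compares full-width registers (bge/bgeu), i32 min/max operands must be sign-extended on RV64 when only Zalrsc is available; that is what the getExtendForAtomicRMWArg hook above arranges, while Zaamo's amo<op>.w needs no extension.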
@@ -308,7 +308,65 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst> (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, timm:$ordering)>; -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo] in { + +let Size = 16 in { +def PseudoAtomicSwap32 : PseudoAMO; +def PseudoAtomicLoadAdd32 : PseudoAMO; +def PseudoAtomicLoadSub32 : PseudoAMO; +def PseudoAtomicLoadAnd32 : PseudoAMO; +def PseudoAtomicLoadOr32 : PseudoAMO; +def PseudoAtomicLoadXor32 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax32 : PseudoAMO; +def PseudoAtomicLoadMin32 : PseudoAMO; +def PseudoAtomicLoadUMax32 : PseudoAMO; +def PseudoAtomicLoadUMin32 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i32", PseudoAtomicSwap32>; +defm : PseudoAMOPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>; +defm : PseudoAMOPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>; +defm : PseudoAMOPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>; +defm : PseudoAMOPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>; +defm : PseudoAMOPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>; +defm : PseudoAMOPat<"atomic_load_max_i32", PseudoAtomicLoadMax32>; +defm : PseudoAMOPat<"atomic_load_min_i32", PseudoAtomicLoadMin32>; +defm : PseudoAMOPat<"atomic_load_umax_i32", PseudoAtomicLoadUMax32>; +defm : PseudoAMOPat<"atomic_load_umin_i32", PseudoAtomicLoadUMin32>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo] + +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] in { + +let Size = 16 in { +def PseudoAtomicSwap64 : PseudoAMO; +def PseudoAtomicLoadAdd64 : PseudoAMO; +def PseudoAtomicLoadSub64 : PseudoAMO; +def PseudoAtomicLoadAnd64 : PseudoAMO; +def PseudoAtomicLoadOr64 : PseudoAMO; +def PseudoAtomicLoadXor64 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax64 : PseudoAMO; +def PseudoAtomicLoadMin64 : PseudoAMO; +def PseudoAtomicLoadUMax64 : PseudoAMO; +def PseudoAtomicLoadUMin64 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i64", PseudoAtomicSwap64, i64>; +defm : PseudoAMOPat<"atomic_load_add_i64", PseudoAtomicLoadAdd64, i64>; +defm : PseudoAMOPat<"atomic_load_sub_i64", PseudoAtomicLoadSub64, i64>; +defm : PseudoAMOPat<"atomic_load_and_i64", PseudoAtomicLoadAnd64, i64>; +defm : PseudoAMOPat<"atomic_load_or_i64", PseudoAtomicLoadOr64, i64>; +defm : PseudoAMOPat<"atomic_load_xor_i64", PseudoAtomicLoadXor64, i64>; +defm : PseudoAMOPat<"atomic_load_max_i64", PseudoAtomicLoadMax64, i64>; +defm : PseudoAMOPat<"atomic_load_min_i64", PseudoAtomicLoadMin64, i64>; +defm : PseudoAMOPat<"atomic_load_umax_i64", PseudoAtomicLoadUMax64, i64>; +defm : PseudoAMOPat<"atomic_load_umin_i64", PseudoAtomicLoadUMin64, i64>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] + +let Predicates = [HasStdExtZalrsc] in { let Size = 20 in def PseudoAtomicLoadNand32 : PseudoAMO; @@ -347,14 +405,14 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax, PseudoMaskedAtomicLoadUMax32>; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin, PseudoMaskedAtomicLoadUMin32>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] -let Predicates = [HasStdExtA, IsRV64] in { +let Predicates = [HasStdExtZalrsc, IsRV64] in { let Size = 20 in def PseudoAtomicLoadNand64 : PseudoAMO; defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>; -} // Predicates = [HasStdExtA, IsRV64] +} // Predicates = [HasStdExtZalrsc, IsRV64] /// Compare and exchange @@ -385,17 +443,17 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo 
CmpXchgInst, (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; } -let Predicates = [HasStdExtA, NoStdExtZacas] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas] in { def PseudoCmpXchg32 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>; } -let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas, IsRV64] in { def PseudoCmpXchg64 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>; } -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc] in { def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, @@ -412,4 +470,4 @@ def : Pat<(XLenVT (int_riscv_masked_cmpxchg (XLenVT GPR:$mask), (XLenVT timm:$ordering))), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 6a4119a..4104abd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -217,6 +217,14 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } +let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { + def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +} + +let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { + def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +} + let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in { def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 40d7341..62b7bcd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -527,8 +527,8 @@ def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), let Predicates = [HasStdExtZbs] in { def : Pat<(XLenVT (and (not (shl 1, shiftMaskXLen:$rs2)), GPR:$rs1)), (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>; -def : Pat<(XLenVT (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)), - (BCLR GPR:$rs1, GPR:$rs2)>; +def : Pat<(XLenVT (and (rotl -2, shiftMaskXLen:$rs2), GPR:$rs1)), + (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>; def : Pat<(XLenVT (or (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)), (BSET GPR:$rs1, shiftMaskXLen:$rs2)>; def : Pat<(XLenVT (xor (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)), diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 5591d9f..021353a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -355,9 +355,9 @@ private: SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; - bool generateImageRead(Register &ResVReg, const SPIRVType *ResType, - Register ImageReg, Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const; + bool generateImageReadOrFetch(Register &ResVReg, const SPIRVType *ResType, + Register ImageReg, Register IdxReg, + DebugLoc Loc, MachineInstr &Pos) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool 
loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, Register ResVReg, const SPIRVType *ResType, @@ -1321,8 +1321,8 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg, } Register IdxReg = IntPtrDef->getOperand(3).getReg(); - return generateImageRead(ResVReg, ResType, NewHandleReg, IdxReg, - I.getDebugLoc(), I); + return generateImageReadOrFetch(ResVReg, ResType, NewHandleReg, IdxReg, + I.getDebugLoc(), I); } } @@ -3639,27 +3639,33 @@ bool SPIRVInstructionSelector::selectReadImageIntrinsic( DebugLoc Loc = I.getDebugLoc(); MachineInstr &Pos = I; - return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, Loc, Pos); + return generateImageReadOrFetch(ResVReg, ResType, NewImageReg, IdxReg, Loc, + Pos); } -bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, - const SPIRVType *ResType, - Register ImageReg, - Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const { +bool SPIRVInstructionSelector::generateImageReadOrFetch( + Register &ResVReg, const SPIRVType *ResType, Register ImageReg, + Register IdxReg, DebugLoc Loc, MachineInstr &Pos) const { SPIRVType *ImageType = GR.getSPIRVTypeForVReg(ImageReg); assert(ImageType && ImageType->getOpcode() == SPIRV::OpTypeImage && "ImageReg is not an image type."); + bool IsSignedInteger = sampledTypeIsSignedInteger(GR.getTypeForSPIRVType(ImageType)); + // Check if the "sampled" operand of the image type is 1. + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpImageFetch + auto SampledOp = ImageType->getOperand(6); + bool IsFetch = (SampledOp.getImm() == 1); uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend @@ -3668,11 +3674,13 @@ bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, SPIRVType *ReadType = widenTypeToVec4(ResType, Pos); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? 
SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend bool Succeed = BMI.constrainAllUses(TII, TRI, RBI); diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index cf85691..9bda8a4 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.X = F.getFnAttribute(Y).getValueAsBool(); \ } while (0) - RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b5f8ee5..b54a1e7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX)) return MinMax; - if (DAG.isKnownNeverNaN(NewX)) - NewX = NewY; - - SDValue IsNaN = - DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO); + SDValue NaNSrc = IsNum ? MinMax : NewX; + SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO); return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax); } diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 7882045..0fce5b9 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -567,8 +567,8 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT) { default: if (TT.isOSNetBSD()) return "apcs-gnu"; - if (TT.isOSFreeBSD() || TT.isOSOpenBSD() || TT.isOSHaiku() || - TT.isOHOSFamily()) + if (TT.isOSFreeBSD() || TT.isOSFuchsia() || TT.isOSOpenBSD() || + TT.isOSHaiku() || TT.isOHOSFamily()) return "aapcs-linux"; return "aapcs"; } @@ -648,6 +648,8 @@ StringRef ARM::getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch) { } case llvm::Triple::OpenBSD: return "cortex-a8"; + case llvm::Triple::Fuchsia: + return "cortex-a53"; default: switch (Triple.getEnvironment()) { case llvm::Triple::EABIHF: diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 31126cc..f08a0c0 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -765,6 +765,12 @@ Error RISCVISAInfo::checkDependency() { if (HasZvl && !HasVector) return getExtensionRequiresError("zvl*b", "v' or 'zve*"); + if (Exts.count("xsfvfbfexp16e") && + !(Exts.count("zvfbfmin") || Exts.count("zvfbfa"))) + return createStringError(errc::invalid_argument, + "'xsfvfbfexp16e' requires 'zvfbfmin' or " + "'zvfbfa' extension to also be specified"); + if (HasD && (HasC || Exts.count("zcd"))) for (auto Ext : ZcdOverlaps) if (Exts.count(Ext.str())) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 28ee444..a29faab 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1368,13 +1368,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest( FunctionImporter::ImportMapTy &ImportList) { for (const auto &GlobalList : Index) { // Ignore entries for undefined references. 
- if (GlobalList.second.SummaryList.empty()) + if (GlobalList.second.getSummaryList().empty()) continue; auto GUID = GlobalList.first; - assert(GlobalList.second.SummaryList.size() == 1 && + assert(GlobalList.second.getSummaryList().size() == 1 && "Expected individual combined index to have one summary per GUID"); - auto &Summary = GlobalList.second.SummaryList[0]; + auto &Summary = GlobalList.second.getSummaryList()[0]; // Skip the summaries for the importing module. These are included to // e.g. record required linkage changes. if (Summary->modulePath() == ModulePath) @@ -1423,7 +1423,7 @@ void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index, void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) { for (const auto &Entry : Index) { - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); } @@ -1456,7 +1456,7 @@ void llvm::computeDeadSymbolsAndUpdateIndirectCalls( // Add values flagged in the index as live roots to the worklist. for (const auto &Entry : Index) { auto VI = Index.getValueInfo(Entry); - for (const auto &S : Entry.second.SummaryList) { + for (const auto &S : Entry.second.getSummaryList()) { if (auto *FS = dyn_cast<FunctionSummary>(S.get())) updateValueInfoForIndirectCalls(Index, FS); if (S->isLive()) { @@ -2094,7 +2094,7 @@ static bool doImportingForModuleForTest( // is only enabled when testing importing via the 'opt' tool, which does // not do the ThinLink that would normally determine what values to promote. for (auto &I : *Index) { - for (auto &S : I.second.SummaryList) { + for (auto &S : I.second.getSummaryList()) { if (GlobalValue::isLocalLinkage(S->linkage())) S->setLinkage(GlobalValue::ExternalLinkage); } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index be6cba3..aa1346d 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1271,7 +1271,7 @@ bool LowerTypeTestsModule::hasBranchTargetEnforcement() { // the module flags. if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( M.getModuleFlag("branch-target-enforcement"))) - HasBranchTargetEnforcement = (BTE->getZExtValue() != 0); + HasBranchTargetEnforcement = !BTE->isZero(); else HasBranchTargetEnforcement = 0; } @@ -2130,7 +2130,7 @@ bool LowerTypeTestsModule::lower() { // A set of all functions that are address taken by a live global object. DenseSet<GlobalValue::GUID> AddressTaken; for (auto &I : *ExportSummary) - for (auto &GVS : I.second.SummaryList) + for (auto &GVS : I.second.getSummaryList()) if (GVS->isLive()) for (const auto &Ref : GVS->refs()) { AddressTaken.insert(Ref.getGUID()); @@ -2409,7 +2409,7 @@ bool LowerTypeTestsModule::lower() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { if (!ExportSummary->isGlobalValueLive(S.get())) continue; if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject())) diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 2d5cb82..76e588b 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -928,7 +928,7 @@ void llvm::updateVCallVisibilityInIndex( // linker, as we have no information on their eventual use. 
if (DynamicExportSymbols.count(P.first)) continue; - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *GVar = dyn_cast<GlobalVarSummary>(S.get()); if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) @@ -2413,7 +2413,7 @@ bool DevirtModule::run() { } for (auto &P : *ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; @@ -2564,7 +2564,7 @@ void DevirtIndex::run() { // Collect information from summary about which calls to try to devirtualize. for (auto &P : ExportSummary) { - for (auto &S : P.second.SummaryList) { + for (auto &S : P.second.getSummaryList()) { auto *FS = dyn_cast<FunctionSummary>(S.get()); if (!FS) continue; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index cdc559b..9b9fe26 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1643,33 +1643,46 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) { /// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. -static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static bool fitsInFPType(APFloat F, const fltSemantics &Sem) { bool losesInfo; - APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { - if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) - return nullptr; // No constant folding of this. +static Type *shrinkFPConstant(LLVMContext &Ctx, const APFloat &F, + bool PreferBFloat) { // See if the value can be truncated to bfloat and then reextended. - if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) - return Type::getBFloatTy(CFP->getContext()); + if (PreferBFloat && fitsInFPType(F, APFloat::BFloat())) + return Type::getBFloatTy(Ctx); // See if the value can be truncated to half and then reextended. - if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) - return Type::getHalfTy(CFP->getContext()); + if (!PreferBFloat && fitsInFPType(F, APFloat::IEEEhalf())) + return Type::getHalfTy(Ctx); // See if the value can be truncated to float and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEsingle())) - return Type::getFloatTy(CFP->getContext()); - if (CFP->getType()->isDoubleTy()) - return nullptr; // Won't shrink. - if (fitsInFPType(CFP, APFloat::IEEEdouble())) - return Type::getDoubleTy(CFP->getContext()); + if (fitsInFPType(F, APFloat::IEEEsingle())) + return Type::getFloatTy(Ctx); + if (&F.getSemantics() == &APFloat::IEEEdouble()) + return nullptr; // Won't shrink. + // See if the value can be truncated to double and then reextended. + if (fitsInFPType(F, APFloat::IEEEdouble())) + return Type::getDoubleTy(Ctx); // Don't try to shrink to various long double types. return nullptr; } +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { + Type *Ty = CFP->getType(); + if (Ty->getScalarType()->isPPC_FP128Ty()) + return nullptr; // No constant folding of this. 
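fitsInFPType now takes the APFloat by value, but the test is the same convert-and-round-trip check. The double-to-float case in plain C++ (a sketch; NaN payload subtleties aside, APFloat reports the same answer through losesInfo):

// A constant can shrink when converting down and back preserves its value.
bool fitsInFloat(double D) {
  float F = static_cast<float>(D);     // convert(IEEEsingle, rmNearestTiesToEven)
  return static_cast<double>(F) == D;  // round trip intact <=> !losesInfo
}
// fitsInFloat(0.5) == true; fitsInFloat(0.1) == false, since 0.1 has no
// exact binary32 representation and the round trip perturbs it.

The wrapper that continues below then re-wraps the shrunken scalar type in the original vector shape for splat constants.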
+ + Type *ShrinkTy = + shrinkFPConstant(CFP->getContext(), CFP->getValueAPF(), PreferBFloat); + if (ShrinkTy) + if (auto *VecTy = dyn_cast<VectorType>(Ty)) + ShrinkTy = VectorType::get(ShrinkTy, VecTy); + + return ShrinkTy; +} + // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { @@ -1720,10 +1733,10 @@ static Type *getMinimumFPType(Value *V, bool PreferBFloat) { // Try to shrink scalable and fixed splat vectors. if (auto *FPC = dyn_cast<Constant>(V)) - if (isa<VectorType>(V->getType())) + if (auto *VTy = dyn_cast<VectorType>(V->getType())) if (auto *Splat = dyn_cast_or_null<ConstantFP>(FPC->getSplatValue())) if (Type *T = shrinkFPConstant(Splat, PreferBFloat)) - return T; + return VectorType::get(T, VTy); // Try to shrink a vector of FP constants. This returns nullptr on scalable // vectors @@ -1796,10 +1809,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = - getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); - Type *RHSMinType = - getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); + bool PreferBFloat = Ty->getScalarType()->isBFloatTy(); + Type *LHSMinType = getMinimumFPType(BO->getOperand(0), PreferBFloat); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1), PreferBFloat); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 975498f..5aa8de3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3455,27 +3455,45 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { // select a, false, b -> select !a, b, false if (match(TrueVal, m_Specific(Zero))) { Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); - return SelectInst::Create(NotCond, FalseVal, Zero); + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + SelectInst::Create(NotCond, FalseVal, Zero, "", nullptr, MDFrom); + NewSI->swapProfMetadata(); + return NewSI; } // select a, b, true -> select !a, true, b if (match(FalseVal, m_Specific(One))) { Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); - return SelectInst::Create(NotCond, One, TrueVal); + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + SelectInst::Create(NotCond, One, TrueVal, "", nullptr, MDFrom); + NewSI->swapProfMetadata(); + return NewSI; } // DeMorgan in select form: !a && !b --> !(a || b) // select !a, !b, false --> not (select a, true, b) if (match(&SI, m_LogicalAnd(m_Not(m_Value(A)), m_Not(m_Value(B)))) && (CondVal->hasOneUse() || TrueVal->hasOneUse()) && - !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) - return BinaryOperator::CreateNot(Builder.CreateSelect(A, One, B)); + !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) { + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? 
nullptr : &SI; + SelectInst *NewSI = + cast<SelectInst>(Builder.CreateSelect(A, One, B, "", MDFrom)); + NewSI->swapProfMetadata(); + return BinaryOperator::CreateNot(NewSI); + } // DeMorgan in select form: !a || !b --> !(a && b) // select !a, true, !b --> not (select a, b, false) if (match(&SI, m_LogicalOr(m_Not(m_Value(A)), m_Not(m_Value(B)))) && (CondVal->hasOneUse() || FalseVal->hasOneUse()) && - !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) - return BinaryOperator::CreateNot(Builder.CreateSelect(A, B, Zero)); + !match(A, m_ConstantExpr()) && !match(B, m_ConstantExpr())) { + Instruction *MDFrom = ProfcheckDisableMetadataFixes ? nullptr : &SI; + SelectInst *NewSI = + cast<SelectInst>(Builder.CreateSelect(A, B, Zero, "", MDFrom)); + NewSI->swapProfMetadata(); + return BinaryOperator::CreateNot(NewSI); + } // select (select a, true, b), true, b -> select a, true, b if (match(CondVal, m_Select(m_Value(A), m_One(), m_Value(B))) && diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp index 40720ae..0873845 100644 --- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp +++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp @@ -31,10 +31,12 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/Support/AllocToken.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -53,29 +55,12 @@ #include <variant> using namespace llvm; +using TokenMode = AllocTokenMode; #define DEBUG_TYPE "alloc-token" namespace { -//===--- Constants --------------------------------------------------------===// - -enum class TokenMode : unsigned { - /// Incrementally increasing token ID. - Increment = 0, - - /// Simple mode that returns a statically-assigned random token ID. - Random = 1, - - /// Token ID based on allocated type hash. - TypeHash = 2, - - /// Token ID based on allocated type hash, where the top half ID-space is - /// reserved for types that contain pointers and the bottom half for types - /// that do not contain pointers. - TypeHashPointerSplit = 3, -}; - //===--- Command-line options ---------------------------------------------===// cl::opt<TokenMode> ClMode( @@ -131,7 +116,7 @@ cl::opt<uint64_t> ClFallbackToken( //===--- Statistics -------------------------------------------------------===// -STATISTIC(NumFunctionsInstrumented, "Functions instrumented"); +STATISTIC(NumFunctionsModified, "Functions modified"); STATISTIC(NumAllocationsInstrumented, "Allocations instrumented"); //===----------------------------------------------------------------------===// @@ -140,9 +125,19 @@ STATISTIC(NumAllocationsInstrumented, "Allocations instrumented"); /// /// Expected format is: !{<type-name>, <contains-pointer>} MDNode *getAllocTokenMetadata(const CallBase &CB) { - MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token); - if (!Ret) - return nullptr; + MDNode *Ret = nullptr; + if (auto *II = dyn_cast<IntrinsicInst>(&CB); + II && II->getIntrinsicID() == Intrinsic::alloc_token_id) { + auto *MDV = cast<MetadataAsValue>(II->getArgOperand(0)); + Ret = cast<MDNode>(MDV->getMetadata()); + // If the intrinsic has an empty MDNode, type inference failed. 
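getAllocTokenMetadata now accepts two sources of type information: the metadata argument of the alloc_token intrinsic and the !alloc_token attachment on ordinary allocation calls, both reducing to a node of the form !{!"TypeName", i1 ContainsPointer}. The intrinsic path, condensed into a standalone sketch (mirrors the logic above; not a separate API):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// An empty node on the intrinsic means type inference failed upstream, so
// callers fall back to the fallback token.
static MDNode *tokenNodeFromIntrinsic(const IntrinsicInst &II) {
  auto *MDV = cast<MetadataAsValue>(II.getArgOperand(0));
  auto *N = cast<MDNode>(MDV->getMetadata());
  return N->getNumOperands() == 0 ? nullptr : N;
}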
+ if (Ret->getNumOperands() == 0) + return nullptr; + } else { + Ret = CB.getMetadata(LLVMContext::MD_alloc_token); + if (!Ret) + return nullptr; + } assert(Ret->getNumOperands() == 2 && "bad !alloc_token"); assert(isa<MDString>(Ret->getOperand(0))); assert(isa<ConstantAsMetadata>(Ret->getOperand(1))); @@ -206,22 +201,19 @@ public: using ModeBase::ModeBase; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - const auto [N, H] = getHash(CB, ORE); - return N ? boundedToken(H) : H; - } -protected: - std::pair<MDNode *, uint64_t> getHash(const CallBase &CB, - OptimizationRemarkEmitter &ORE) { if (MDNode *N = getAllocTokenMetadata(CB)) { MDString *S = cast<MDString>(N->getOperand(0)); - return {N, getStableSipHash(S->getString())}; + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHash, Metadata, MaxTokens)) + return *Token; } // Fallback. remarkNoMetadata(CB, ORE); - return {nullptr, ClFallbackToken}; + return ClFallbackToken; } +protected: /// Remark that there was no precise type information. static void remarkNoMetadata(const CallBase &CB, OptimizationRemarkEmitter &ORE) { @@ -242,20 +234,18 @@ public: using TypeHashMode::TypeHashMode; uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) { - if (MaxTokens == 1) - return 0; - const uint64_t HalfTokens = MaxTokens / 2; - const auto [N, H] = getHash(CB, ORE); - if (!N) { - // Pick the fallback token (ClFallbackToken), which by default is 0, - // meaning it'll fall into the pointer-less bucket. Override by setting - // -alloc-token-fallback if that is the wrong choice. - return H; + if (MDNode *N = getAllocTokenMetadata(CB)) { + MDString *S = cast<MDString>(N->getOperand(0)); + AllocTokenMetadata Metadata{S->getString(), containsPointer(N)}; + if (auto Token = getAllocToken(TokenMode::TypeHashPointerSplit, Metadata, + MaxTokens)) + return *Token; } - uint64_t Hash = H % HalfTokens; // base hash - if (containsPointer(N)) - Hash += HalfTokens; - return Hash; + // Pick the fallback token (ClFallbackToken), which by default is 0, meaning + // it'll fall into the pointer-less bucket. Override by setting + // -alloc-token-fallback if that is the wrong choice. + remarkNoMetadata(CB, ORE); + return ClFallbackToken; } }; @@ -315,6 +305,9 @@ private: FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID, LibFunc OriginalFunc); + /// Lower alloc_token_* intrinsics. + void replaceIntrinsicInst(IntrinsicInst *II, OptimizationRemarkEmitter &ORE); + /// Return the token ID from metadata in the call. uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) { return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode); @@ -336,21 +329,32 @@ bool AllocToken::instrumentFunction(Function &F) { // Do not apply any instrumentation for naked functions. if (F.hasFnAttribute(Attribute::Naked)) return false; - if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) - return false; // Don't touch available_externally functions, their actual body is elsewhere. if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; - // Only instrument functions that have the sanitize_alloc_token attribute. 
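The operator() bodies above now defer to the shared getAllocToken helper, but the pointer-split arithmetic they previously implemented inline (visible in the removed lines) is easy to state on its own. A standalone sketch, names illustrative:

#include <cstdint>

// Bottom half of the token space for pointer-free types, top half for types
// containing pointers; mirrors the inline logic removed above.
uint64_t pointerSplitToken(uint64_t Hash, bool ContainsPointer,
                           uint64_t MaxTokens) {
  if (MaxTokens == 1)
    return 0;
  const uint64_t Half = MaxTokens / 2;
  uint64_t Token = Hash % Half;  // base hash within one half
  if (ContainsPointer)
    Token += Half;               // shift into the pointer bucket
  return Token;
}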
- if (!F.hasFnAttribute(Attribute::SanitizeAllocToken)) - return false; auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls; + SmallVector<IntrinsicInst *, 4> IntrinsicInsts; + + // Only instrument functions that have the sanitize_alloc_token attribute. + const bool InstrumentFunction = + F.hasFnAttribute(Attribute::SanitizeAllocToken) && + !F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation); // Collect all allocation calls to avoid iterator invalidation. for (Instruction &I : instructions(F)) { + // Collect all alloc_token_* intrinsics. + if (auto *II = dyn_cast<IntrinsicInst>(&I); + II && II->getIntrinsicID() == Intrinsic::alloc_token_id) { + IntrinsicInsts.emplace_back(II); + continue; + } + + if (!InstrumentFunction) + continue; + auto *CB = dyn_cast<CallBase>(&I); if (!CB) continue; @@ -359,11 +363,21 @@ bool AllocToken::instrumentFunction(Function &F) { } bool Modified = false; - for (auto &[CB, Func] : AllocCalls) - Modified |= replaceAllocationCall(CB, Func, ORE, TLI); - if (Modified) - NumFunctionsInstrumented++; + if (!AllocCalls.empty()) { + for (auto &[CB, Func] : AllocCalls) + Modified |= replaceAllocationCall(CB, Func, ORE, TLI); + if (Modified) + NumFunctionsModified++; + } + + if (!IntrinsicInsts.empty()) { + for (auto *II : IntrinsicInsts) + replaceIntrinsicInst(II, ORE); + Modified = true; + NumFunctionsModified++; + } + return Modified; } @@ -381,7 +395,7 @@ AllocToken::shouldInstrumentCall(const CallBase &CB, if (TLI.getLibFunc(*Callee, Func)) { if (isInstrumentableLibFunc(Func, CB, TLI)) return Func; - } else if (Options.Extended && getAllocTokenMetadata(CB)) { + } else if (Options.Extended && CB.getMetadata(LLVMContext::MD_alloc_token)) { return NotLibFunc; } @@ -528,6 +542,16 @@ FunctionCallee AllocToken::getTokenAllocFunction(const CallBase &CB, return TokenAlloc; } +void AllocToken::replaceIntrinsicInst(IntrinsicInst *II, + OptimizationRemarkEmitter &ORE) { + assert(II->getIntrinsicID() == Intrinsic::alloc_token_id); + + uint64_t TokenID = getToken(*II, ORE); + Value *V = ConstantInt::get(IntPtrTy, TokenID); + II->replaceAllUsesWith(V); + II->eraseFromParent(); +} + } // namespace AllocTokenPass::AllocTokenPass(AllocTokenOptions Opts) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index d18c0d0..80e77e09 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -2020,7 +2020,6 @@ static void moveFastMathFlags(Function &F, F.removeFnAttr(attr); \ FMF.set##setter(); \ } - MOVE_FLAG("unsafe-fp-math", Fast) MOVE_FLAG("no-infs-fp-math", NoInfs) MOVE_FLAG("no-nans-fp-math", NoNaNs) MOVE_FLAG("no-signed-zeros-fp-math", NoSignedZeros) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index febdc54..adf27be 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1011,6 +1011,10 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + // Truncs must truncate at most to their destination type. 
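A concrete, hypothetical case for the comment above: if MinBWs recorded 16 bits for %t = trunc i32 %x to i8, the destination type is only 8 bits wide, so "truncating" %t to 16 bits would actually widen it. The guard that follows rejects exactly this shape:

// Sketch: a trunc whose destination scalar width is already below the
// computed minimal bitwidth cannot participate in narrowing.
bool destNarrowerThanMinBW(unsigned DestScalarBits, unsigned MinBW) {
  return DestScalarBits < MinBW; // true => reject (8 < 16 in the example)
}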
+ if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) && + I->getType()->getScalarSizeInBits() < MinBWs.lookup(I)) + return false; return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); @@ -7227,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( return DenseMap<const SCEV *, Value *>(); } - VPlanTransforms::narrowInterleaveGroups( - BestVPlan, BestVF, - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -8198,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, if (CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, *Plan, CM.getMaxSafeElements()); + + if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI)) + VPlans.push_back(std::move(P)); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3f18bd7..cdb9e7e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5577,62 +5577,79 @@ private: } // Decrement the unscheduled counter and insert to ready list if // ready. - auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, - unsigned OpIdx) { - if (!ScheduleCopyableDataMap.empty()) { - const EdgeInfo EI = {UserTE, OpIdx}; - if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { - DecrUnsched(CD, /*IsControl=*/false); - return; - } - } - auto It = OperandsUses.find(I); - assert(It != OperandsUses.end() && "Operand not found"); - if (It->second > 0) { - --It->getSecond(); - assert(TotalOpCount > 0 && "No more operands to decrement"); - --TotalOpCount; - if (ScheduleData *OpSD = getScheduleData(I)) - DecrUnsched(OpSD, /*IsControl=*/false); - } - }; + auto DecrUnschedForInst = + [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx, + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> + &Checked) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = + getScheduleCopyableData(EI, I)) { + if (!Checked.insert(std::make_pair(CD, OpIdx)).second) + return; + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) { + if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second) + return; + DecrUnsched(OpSD, /*IsControl=*/false); + } + } + }; for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; // Need to search for the lane since the tree entry can be // reordered. 
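The Checked set threaded through DecrUnschedForInst above exists to deduplicate decrements: with copyable elements, the same instruction may now be reached through several lanes of one bundle, yet each (schedule entity, operand index) pair must be counted once. The guard in isolation, with illustrative types:

#include <set>
#include <utility>

bool decrementOnce(std::set<std::pair<const void *, unsigned>> &Checked,
                   const void *Entity, unsigned OpIdx, int &UnschedDeps) {
  if (!Checked.insert({Entity, OpIdx}).second)
    return false; // already visited for this operand index: skip
  --UnschedDeps;  // counted exactly once per (entity, operand) pair
  return true;
}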
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - if (isa<StoreInst>(In) && - !Bundle->getTreeEntry()->ReorderIndices.empty()) - Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; - assert(Lane < static_cast<int>( - Bundle->getTreeEntry()->Scalars.size()) && - "Couldn't find extract lane"); - - // Since vectorization tree is being built recursively this - // assertion ensures that the tree entry has all operands set before - // reaching this code. Couple of exceptions known at the moment are - // extracts where their second (immediate) operand is not added. - // Since immediates do not affect scheduler behavior this is - // considered okay. - assert(In && - (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands() || - Bundle->getTreeEntry()->isCopyableElement(In)) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast<Instruction>( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I - << "\n"); - DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); - } + auto *It = find(Bundle->getTreeEntry()->Scalars, In); + SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked; + do { + int Lane = + std::distance(Bundle->getTreeEntry()->Scalars.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<StoreInst>(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast<int>( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set + // before reaching this code. Couple of exceptions known at the + // moment are extracts where their second (immediate) operand is + // not added. Since immediates do not affect scheduler behavior + // this is considered okay. + assert(In && + (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast<Instruction>( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " + << *I << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); + } + // If the parent node is schedulable, it will be handled correctly. 
+ if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + break; + It = std::find(std::next(It), + Bundle->getTreeEntry()->Scalars.end(), In); + } while (It != Bundle->getTreeEntry()->Scalars.end()); } } else { // If BundleMember is a stand-alone instruction, no operand reordering diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d167009..c95c887 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -217,32 +217,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -bool VPBlockUtils::isHeader(const VPBlockBase *VPB, - const VPDominatorTree &VPDT) { - auto *VPBB = dyn_cast<VPBasicBlock>(VPB); - if (!VPBB) - return false; - - // If VPBB is in a region R, VPBB is a loop header if R is a loop region with - // VPBB as its entry, i.e., free of predecessors. - if (auto *R = VPBB->getParent()) - return !R->isReplicator() && !VPBB->hasPredecessors(); - - // A header dominates its second predecessor (the latch), with the other - // predecessor being the preheader - return VPB->getPredecessors().size() == 2 && - VPDT.dominates(VPB, VPB->getPredecessors()[1]); -} - -bool VPBlockUtils::isLatch(const VPBlockBase *VPB, - const VPDominatorTree &VPDT) { - // A latch has a header as its second successor, with its other successor - // leaving the loop. A preheader OTOH has a header as its first (and only) - // successor. - return VPB->getNumSuccessors() == 2 && - VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); -} - VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -768,8 +742,12 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) { VPRegionBlock *VPRegionBlock::clone() { const auto &[NewEntry, NewExiting] = cloneFrom(getEntry()); - auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting, - getName(), isReplicator()); + VPlan &Plan = *getPlan(); + VPRegionBlock *NewRegion = + isReplicator() + ? Plan.createReplicateRegion(NewEntry, NewExiting, getName()) + : Plan.createLoopRegion(getName(), NewEntry, NewExiting); + for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry)) Block->setParent(NewRegion); return NewRegion; @@ -1213,6 +1191,7 @@ VPlan *VPlan::duplicate() { } Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount; Old2NewVPValues[&VF] = &NewPlan->VF; + Old2NewVPValues[&UF] = &NewPlan->UF; Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF; if (BackedgeTakenCount) { NewPlan->BackedgeTakenCount = new VPValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fed04eb..167ba55 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4152,6 +4152,9 @@ class VPlan { /// Represents the vectorization factor of the loop. VPValue VF; + /// Represents the symbolic unroll factor of the loop. + VPValue UF; + /// Represents the loop-invariant VF * UF of the vector loop region. VPValue VFxUF; @@ -4305,6 +4308,9 @@ public: VPValue &getVF() { return VF; }; const VPValue &getVF() const { return VF; }; + /// Returns the symbolic UF of the vector loop region. + VPValue &getSymbolicUF() { return UF; }; + /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } @@ -4314,6 +4320,12 @@ public: void addVF(ElementCount VF) { VFs.insert(VF); } + /// Remove \p VF from the plan. 
+ void removeVF(ElementCount VF) { + assert(hasVF(VF) && "tried to remove VF not present in plan"); + VFs.remove(VF); + } + void setVF(ElementCount VF) { assert(hasVF(VF) && "Cannot set VF not already in plan"); VFs.clear(); @@ -4438,22 +4450,24 @@ public: return VPB; } - /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p - /// IsReplicator is true, the region is a replicate region. The returned block - /// is owned by the VPlan and deleted once the VPlan is destroyed. - VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, - const std::string &Name = "", - bool IsReplicator = false) { - auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); + /// Create a new loop region with \p Name and entry and exiting blocks set + /// to \p Entry and \p Exiting respectively, if set. The returned block is + /// owned by the VPlan and deleted once the VPlan is destroyed. + VPRegionBlock *createLoopRegion(const std::string &Name = "", + VPBlockBase *Entry = nullptr, + VPBlockBase *Exiting = nullptr) { + auto *VPB = Entry ? new VPRegionBlock(Entry, Exiting, Name) + : new VPRegionBlock(Name); CreatedBlocks.push_back(VPB); return VPB; } - /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set - /// to nullptr. The returned block is owned by the VPlan and deleted once the - /// VPlan is destroyed. - VPRegionBlock *createVPRegionBlock(const std::string &Name = "") { - auto *VPB = new VPRegionBlock(Name); + /// Create a new replicate region with \p Entry, \p Exiting and \p Name. The + /// returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. + VPRegionBlock *createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, + const std::string &Name = "") { + auto *VPB = new VPRegionBlock(Entry, Exiting, Name, true); CreatedBlocks.push_back(VPB); return VPB; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 332791a..65688a3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -406,7 +406,7 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { // LatchExitVPB, taking care to preserve the original predecessor & successor // order of blocks. Set region entry and exiting after both HeaderVPB and // LatchVPBB have been disconnected from their predecessors/successors. - auto *R = Plan.createVPRegionBlock(); + auto *R = Plan.createLoopRegion(); VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R); VPBlockUtils::disconnectBlocks(LatchVPBB, R); VPBlockUtils::connectBlocks(PreheaderVPBB, R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e060e70..ff25ef5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -372,7 +372,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); VPRegionBlock *Region = - Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); + Plan.createReplicateRegion(Entry, Exiting, RegionName); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. 
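After this split the region kind is encoded in the factory name rather than in a trailing bool. Hypothetical usage, with Plan, Entry, and Exiting assumed in scope:

// Loop regions may start empty and be populated during construction (as in
// createLoopRegion in VPlanConstruction.cpp above); replicate regions always
// carry their entry and exiting blocks up front.
VPRegionBlock *LoopR = Plan.createLoopRegion("vector loop");
VPRegionBlock *RepR =
    Plan.createReplicateRegion(Entry, Exiting, "pred.store");

Call sites also lose the easy-to-misread trailing literal, as in the createReplicateRegion change in VPlanTransforms.cpp above.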
@@ -1478,11 +1478,8 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
   if (!Plan.getVectorLoopRegion())
     return false;
 
-  if (!Plan.getTripCount()->isLiveIn())
-    return false;
-  auto *TC = dyn_cast_if_present<ConstantInt>(
-      Plan.getTripCount()->getUnderlyingValue());
-  if (!TC || !BestVF.isFixed())
+  const APInt *TC;
+  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
     return false;
 
   // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
@@ -1495,7 +1492,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
     return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
   };
   unsigned NewBitWidth =
-      ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
+      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
 
   LLVMContext &Ctx = Plan.getContext();
   auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
@@ -2092,8 +2089,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
     // Recipes in replicate regions implicitly depend on predicate. If either
     // recipe is in a replicate region, only consider them equal if both have
     // the same parent.
-    const VPRegionBlock *RegionL = L->getParent()->getParent();
-    const VPRegionBlock *RegionR = R->getParent()->getParent();
+    const VPRegionBlock *RegionL = L->getRegion();
+    const VPRegionBlock *RegionR = R->getRegion();
     if (((RegionL && RegionL->isReplicator()) ||
          (RegionR && RegionR->isReplicator())) &&
         L->getParent() != R->getParent())
@@ -3867,8 +3864,7 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
       // required lanes implicitly.
       // TODO: Remove once replicate regions are unrolled completely.
       auto IsCandidateUnpackUser = [Def](VPUser *U) {
-        VPRegionBlock *ParentRegion =
-            cast<VPRecipeBase>(U)->getParent()->getParent();
+        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
        return U->usesScalars(Def) &&
               (!ParentRegion || !ParentRegion->isReplicator());
      };
@@ -3960,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   // used.
   // TODO: Assert that they aren't used.
 
+  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+  Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
   // If there are no users of the runtime VF, compute VFxUF by constant folding
   // the multiplication of VF and UF.
   if (VF.getNumUsers() == 0) {
@@ -3979,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   }
   VF.replaceAllUsesWith(RuntimeVF);
 
-  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
   VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
   VFxUF.replaceAllUsesWith(MulByUF);
 }
@@ -4047,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
   return false;
 }
 
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
-                                         unsigned VF, VPTypeAnalysis &TypeInfo,
-                                         unsigned VectorRegWidth) {
+/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
+/// factor and number of members both equal to VF. The interleave group must
+/// also access the full vector width.
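+/// For example (illustrative numbers, not from the original patch): with i32
+/// group elements on a target with 128-bit vector registers, an interleave
+/// group with factor 4 and 4 members matches VF = 4, since 4 x 32 bits fill
+/// the full 128-bit register.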
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+    VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
   if (!InterleaveR)
-    return false;
+    return std::nullopt;
 
   Type *GroupElementTy = nullptr;
   if (InterleaveR->getStoredValues().empty()) {
@@ -4063,7 +4061,7 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   } else {
     GroupElementTy =
         TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
@@ -4071,13 +4069,27 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   }
 
-  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
-  auto IG = InterleaveR->getInterleaveGroup();
-  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
-         GroupSize == VectorRegWidth;
+  auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+    TypeSize Size = TTI.getRegisterBitWidth(
+        VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+                     : TargetTransformInfo::RGK_ScalableVector);
+    assert(Size.isScalable() == VF.isScalable() &&
+           "if Size is scalable, VF must be too, and vice versa");
+    return Size.getKnownMinValue();
+  };
+
+  for (ElementCount VF : VFs) {
+    unsigned MinVal = VF.getKnownMinValue();
+    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+    auto IG = InterleaveR->getInterleaveGroup();
+    if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
+        GroupSize == GetVectorWidthForVF(VF))
+      return {VF};
+  }
+  return std::nullopt;
 }
 
 /// Returns true if \p VPValue is a narrow VPValue.
@@ -4088,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
   return RepR && RepR->isSingleScalar();
 }
 
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                             unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+                                        const TargetTransformInfo &TTI) {
+  using namespace llvm::VPlanPatternMatch;
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
   if (!VectorLoop)
-    return;
+    return nullptr;
 
   VPTypeAnalysis TypeInfo(Plan);
-
-  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
+  std::optional<ElementCount> VFToOptimize;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
       continue;
@@ -4111,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     //   * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
     if (R.isPhi())
-      return;
+      return nullptr;
 
     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
     if (R.mayWriteToMemory() && !InterleaveR)
-      return;
-
-    // Do not narrow interleave groups if there are VectorPointer recipes and
-    // the plan was unrolled. The recipe implicitly uses VF from
-    // VPTransformState.
-    // TODO: Remove restriction once the VF for the VectorPointer offset is
-    // modeled explicitly as operand.
-    if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
-      return;
+      return nullptr;
 
     // All other ops are allowed, but we reject uses that cannot be converted
     // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;
 
-    // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
-                                      VectorRegWidth))
-      return;
-
+    // Try to find a single VF where all interleave groups are consecutive and
+    // saturate the full vector width. If we already have a candidate VF, check
+    // whether it is applicable to the current InterleaveR; otherwise look for
+    // a suitable VF across the Plan's VFs.
+    if (VFToOptimize) {
+      if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
+                                        TTI))
+        return nullptr;
+    } else {
+      if (auto VF = isConsecutiveInterleaveGroup(
+              InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
+        VFToOptimize = *VF;
+      else
+        return nullptr;
+    }
     // Skip read interleave groups.
     if (InterleaveR->getStoredValues().empty())
      continue;
@@ -4168,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
     if (!WideMember0)
-      return;
+      return nullptr;
     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
           R->getNumOperands() > 2)
-        return;
+        return nullptr;
       if (any_of(enumerate(R->operands()),
                  [WideMember0, Idx = I](const auto &P) {
                    const auto &[OpIdx, OpV] = P;
                    return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
                  }))
-        return;
+        return nullptr;
     }
     StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
-    return;
+    return nullptr;
+
+  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+  // original Plan in two: a) a new clone which contains all VFs of Plan except
+  // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
+  std::unique_ptr<VPlan> NewPlan;
+  if (size(Plan.vectorFactors()) != 1) {
+    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+    Plan.setVF(*VFToOptimize);
+    NewPlan->removeVF(*VFToOptimize);
+  }
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4256,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
   VPBuilder PHBuilder(Plan.getVectorPreheader());
 
-  VPValue *UF = Plan.getOrAddLiveIn(
-      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
-  if (VF.isScalable()) {
+  VPValue *UF = &Plan.getSymbolicUF();
+  if (VFToOptimize->isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
         CanIV->getScalarType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4270,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   }
   removeDeadRecipes(Plan);
+  assert(none_of(*VectorLoop->getEntryBasicBlock(),
+                 IsaPred<VPVectorPointerRecipe>) &&
+         "All VPVectorPointerRecipes should have been removed");
+  return NewPlan;
 }
 
 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b..ca8d956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -341,14 +341,20 @@ struct VPlanTransforms {
   static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                      ScalarEvolution &SE);
 
-  /// Try to convert a plan with interleave groups with VF elements to a plan
-  /// with the interleave groups replaced by wide loads and stores processing VF
-  /// elements, if all transformed interleave groups access the full vector
-  /// width (checked via \o VectorRegWidth). This effectively is a very simple
-  /// form of loop-aware SLP, where we use interleave groups to identify
-  /// candidates.
-  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                     unsigned VectorRegWidth);
+  /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with known minimum VF elements) can be replaced by wide loads and
+  /// stores processing VF elements, if all transformed interleave groups
+  /// access the full vector width (checked via the maximum vector register
+  /// width). If the transformation can be applied, the original \p Plan is
+  /// split in two:
+  /// 1. The original Plan with the single VF, containing the optimized recipes
+  ///    using wide loads instead of interleave groups.
+  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
+  ///
+  /// This effectively is a very simple form of loop-aware SLP, where we use
+  /// interleave groups to identify candidates.
+  static std::unique_ptr<VPlan>
+  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
 
   /// Predicate and linearize the control-flow in the only loop region of
  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 10801c0..32e4b88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -8,6 +8,7 @@
 
 #include "VPlanUtils.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "VPlanPatternMatch.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -253,3 +254,29 @@ vputils::getRecipesForUncountableExit(VPlan &Plan,
 
   return UncountableCondition;
 }
+
+bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
+                            const VPDominatorTree &VPDT) {
+  auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+  if (!VPBB)
+    return false;
+
+  // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
+  // VPBB as its entry, i.e., free of predecessors.
+  if (auto *R = VPBB->getParent())
+    return !R->isReplicator() && !VPBB->hasPredecessors();
+
+  // A header dominates its second predecessor (the latch), with the other
+  // predecessor being the preheader.
+  return VPB->getPredecessors().size() == 2 &&
+         VPDT.dominates(VPB, VPB->getPredecessors()[1]);
+}
+
+bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
+                           const VPDominatorTree &VPDT) {
+  // A latch has a header as its second successor, with its other successor
+  // leaving the loop. A preheader, on the other hand, has a header as its
+  // first (and only) successor.
+  return VPB->getNumSuccessors() == 2 &&
+         VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
+}
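A minimal sketch of how the relocated predicates can be queried, assuming an existing flattened VPlan `Plan`; this is illustrative usage, not code from the patch:

  // Compute dominance information for the plan's CFG, then classify blocks.
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);
  for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry())) {
    if (VPBlockUtils::isHeader(VPB, VPDT))
      dbgs() << VPB->getName() << " is a loop header\n";
    else if (VPBlockUtils::isLatch(VPB, VPDT))
      dbgs() << VPB->getName() << " is a loop latch\n";
  }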