Diffstat (limited to 'llvm/lib')
257 files changed, 6514 insertions, 2888 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index f2dc25f..26a5602 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -75,7 +75,7 @@ AAResults::AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {} AAResults::AAResults(AAResults &&Arg) : TLI(Arg.TLI), AAs(std::move(Arg.AAs)), AADeps(std::move(Arg.AADeps)) {} -AAResults::~AAResults() {} +AAResults::~AAResults() = default; bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index e9e2e7d..da32542 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), } Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) { - FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType()); - if (!VT) - return nullptr; - - // This isn't strictly necessary, but handle the special/common case of zero: - // all integer reductions of a zero input produce zero. - if (isa<ConstantAggregateZero>(Op)) - return ConstantInt::get(VT->getElementType(), 0); + auto *OpVT = cast<VectorType>(Op->getType()); // This is the same as the underlying binops - poison propagates. - if (isa<PoisonValue>(Op) || Op->containsPoisonElement()) - return PoisonValue::get(VT->getElementType()); + if (Op->containsPoisonElement()) + return PoisonValue::get(OpVT->getElementType()); + + // Shortcut non-accumulating reductions. + if (Constant *SplatVal = Op->getSplatValue()) { + switch (IID) { + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + return SplatVal; + case Intrinsic::vector_reduce_add: + if (SplatVal->isNullValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_mul: + if (SplatVal->isNullValue() || SplatVal->isOneValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_xor: + if (SplatVal->isNullValue()) + return SplatVal; + if (OpVT->getElementCount().isKnownMultipleOf(2)) + return Constant::getNullValue(OpVT->getElementType()); + break; + } + } + + FixedVectorType *VT = dyn_cast<FixedVectorType>(OpVT); + if (!VT) + return nullptr; // TODO: Handle undef. auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U)); diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 84ee8c0..11d8294 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -2854,14 +2854,18 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, banerjeeMIVtest(Src, Dst, Loops, Result); } -// Given a product, e.g., 10*X*Y, returns the first constant operand, -// in this case 10. If there is no constant part, returns std::nullopt. -static std::optional<APInt> getConstantPart(const SCEV *Expr) { +/// Given a SCEVMulExpr, returns its first operand if that operand is a +/// constant and the product doesn't overflow in a signed sense. Otherwise, +/// returns std::nullopt. For example, given (10 * X * Y)<nsw>, it returns 10. +/// Notably, if it doesn't have nsw, the multiplication may overflow, and if +/// so, it may not be a multiple of 10.
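+/// For example (illustrative numbers, not from the original code): with i8
+/// operands and no nsw flag, 10 * 26 = 260 wraps to 4, and 4 is not a
+/// multiple of 10, so using 10 in the GCD computation would be unsound.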
+static std::optional<APInt> getConstantCoefficient(const SCEV *Expr) { if (const auto *Constant = dyn_cast<SCEVConstant>(Expr)) return Constant->getAPInt(); if (const auto *Product = dyn_cast<SCEVMulExpr>(Expr)) if (const auto *Constant = dyn_cast<SCEVConstant>(Product->getOperand(0))) - return Constant->getAPInt(); + if (Product->hasNoSignedWrap()) + return Constant->getAPInt(); return std::nullopt; } @@ -2887,7 +2891,7 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, if (AddRec->getLoop() == CurLoop) { CurLoopCoeff = Step; } else { - std::optional<APInt> ConstCoeff = getConstantPart(Step); + std::optional<APInt> ConstCoeff = getConstantCoefficient(Step); // If the coefficient is the product of a constant and other stuff, we can // use the constant in the GCD computation. @@ -2940,7 +2944,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional<APInt> ConstCoeff = getConstantPart(Coeff); + std::optional<APInt> ConstCoeff = getConstantCoefficient(Coeff); if (!ConstCoeff) return false; RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2958,7 +2962,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional<APInt> ConstCoeff = getConstantPart(Coeff); + std::optional<APInt> ConstCoeff = getConstantCoefficient(Coeff); if (!ConstCoeff) return false; RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2979,7 +2983,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. - std::optional<APInt> ConstOp = getConstantPart(Product); + std::optional<APInt> ConstOp = getConstantCoefficient(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); @@ -3032,7 +3036,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional<APInt> ConstCoeff = getConstantPart(Delta); + std::optional<APInt> ConstCoeff = getConstantCoefficient(Delta); if (!ConstCoeff) // The difference of the two coefficients might not be a product // or constant, in which case we give up on this direction. diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 4529123..8974ce5 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -468,8 +468,11 @@ std::variant<PolynomialInfo, StringRef> HashRecognize::recognizeCRC() const { // Ensure that the PHIs have exactly two uses: // the bit-shift, and the XOR (or a cast feeding into the XOR). + // Also ensure that the SimpleRecurrence's evolution doesn't have stray + // users.
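+ // (The evolution is the value the PHI cycles through; a user other than
+ // the PHI itself would let the intermediate CRC value escape, so bail out
+ // conservatively in that case.)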
if (!ConditionalRecurrence.Phi->hasNUses(2) || - !SimpleRecurrence.Phi->hasNUses(2)) + !SimpleRecurrence.Phi->hasNUses(2) || + SimpleRecurrence.BO->getUniqueUndroppableUser() != SimpleRecurrence.Phi) return "Recurrences have stray uses"; // Check that the SelectInst ConditionalRecurrence.Step is conditional on diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7597f3a..a31f17b 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2424,10 +2424,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. -static SCEV::NoWrapFlags -StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, - const ArrayRef<const SCEV *> Ops, - SCEV::NoWrapFlags Flags) { +static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, + SCEVTypes Type, + ArrayRef<const SCEV *> Ops, + SCEV::NoWrapFlags Flags) { using namespace std::placeholders; using OBO = OverflowingBinaryOperator; @@ -2540,7 +2540,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, unsigned Idx = isa<SCEVConstant>(Ops[0]) ? 1 : 0; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags); }; @@ -3125,7 +3125,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, return Folded; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags); }; @@ -15510,6 +15510,78 @@ static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr, return SE.getConstant(*ExprVal + DivisorVal - Rem); } +static bool collectDivisibilityInformation( + ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, + DenseMap<const SCEV *, const SCEV *> &DivInfo, + DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) { + // If we have LHS == 0, check if LHS is computing a property of some unknown + // SCEV %v which we can rewrite %v to express explicitly. + if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero())) + return false; + // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to + // explicitly express that. + const SCEVUnknown *URemLHS = nullptr; + const SCEV *URemRHS = nullptr; + if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) + return false; + + const SCEV *Multiple = + SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS); + DivInfo[URemLHS] = Multiple; + if (auto *C = dyn_cast<SCEVConstant>(URemRHS)) + Multiples[URemLHS] = C->getAPInt(); + return true; +} + +// Check if the condition is a divisibility guard (A % B == 0). +static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS, + ScalarEvolution &SE) { + const SCEV *X, *Y; + return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero(); +} + +// Apply divisibility by \p Divisor on MinMaxExpr with constant values, +// recursively. This is done by aligning up/down the constant value to the +// Divisor. 
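+// For instance (hypothetical values): if umin(6, %x) is known to be
+// divisible by 4, the constant operand is aligned down to the previous
+// multiple, yielding umin(4, %x); for a umax, the constant 6 would instead
+// be aligned up to 8.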
+static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr, + APInt Divisor, + ScalarEvolution &SE) { + // Return true if \p Expr is a MinMax SCEV expression with a non-negative + // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS + // the non-constant operand and in \p LHS the constant operand. + auto IsMinMaxSCEVWithNonNegativeConstant = + [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, + const SCEV *&RHS) { + if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) { + if (MinMax->getNumOperands() != 2) + return false; + if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) { + if (C->getAPInt().isNegative()) + return false; + SCTy = MinMax->getSCEVType(); + LHS = MinMax->getOperand(0); + RHS = MinMax->getOperand(1); + return true; + } + } + return false; + }; + + const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; + SCEVTypes SCTy; + if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, + MinMaxRHS)) + return MinMaxExpr; + auto IsMin = isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); + assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); + auto *DivisibleExpr = + IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE) + : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE); + SmallVector<const SCEV *> Ops = { + applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr}; + return SE.getMinMaxExpr(SCTy, Ops); +} + void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, @@ -15520,19 +15592,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock( SmallVector<const SCEV *> ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, - DenseMap<const SCEV *, const SCEV *> - &RewriteMap) { + DenseMap<const SCEV *, const SCEV *> &RewriteMap, + const LoopGuards &DivGuards) { // WARNING: It is generally unsound to apply any wrap flags to the proposed // replacement SCEV which isn't directly implied by the structure of that // SCEV. In particular, using contextual facts to imply flags is *NOT* // legal. See the scoping rules for flags in the header to understand why. - // If LHS is a constant, apply information to the other expression. - if (isa<SCEVConstant>(LHS)) { - std::swap(LHS, RHS); - Predicate = CmpInst::getSwappedPredicate(Predicate); - } - // Check for a condition of the form (-C1 + X < C2). InstCombine will // create this form when combining two checks of the form (X u< C2 + C1) and // (X >=u C1). @@ -15565,67 +15631,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock( if (MatchRangeCheckIdiom()) return; - // Return true if \p Expr is a MinMax SCEV expression with a non-negative - // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS - // the non-constant operand and in \p LHS the constant operand. - auto IsMinMaxSCEVWithNonNegativeConstant = - [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, - const SCEV *&RHS) { - const APInt *C; - SCTy = Expr->getSCEVType(); - return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) && - match(LHS, m_scev_APInt(C)) && C->isNonNegative(); - }; - - // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, - // recursively. This is done by aligning up/down the constant value to the - // Divisor. 
- std::function<const SCEV *(const SCEV *, const SCEV *)> - ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr, - const SCEV *Divisor) { - auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor); - if (!ConstDivisor) - return MinMaxExpr; - const APInt &DivisorVal = ConstDivisor->getAPInt(); - - const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; - SCEVTypes SCTy; - if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, - MinMaxRHS)) - return MinMaxExpr; - auto IsMin = - isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); - assert(SE.isKnownNonNegative(MinMaxLHS) && - "Expected non-negative operand!"); - auto *DivisibleExpr = - IsMin - ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE) - : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE); - SmallVector<const SCEV *> Ops = { - ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; - return SE.getMinMaxExpr(SCTy, Ops); - }; - - // If we have LHS == 0, check if LHS is computing a property of some unknown - // SCEV %v which we can rewrite %v to express explicitly. - if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) { - // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to - // explicitly express that. - const SCEVUnknown *URemLHS = nullptr; - const SCEV *URemRHS = nullptr; - if (match(LHS, - m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) { - auto I = RewriteMap.find(URemLHS); - const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS; - RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); - const auto *Multiple = - SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS); - RewriteMap[URemLHS] = Multiple; - ExprsToRewrite.push_back(URemLHS); - return; - } - } - // Do not apply information for constants or if RHS contains an AddRec. if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS)) return; @@ -15655,7 +15660,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( }; const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); - const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS); + // Apply divisibility information when computing the constant multiple. + const APInt &DividesBy = + SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS)); // Collect rewrites for LHS and its transitive operands based on the // condition. @@ -15670,31 +15677,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // predicate. 
const SCEV *One = SE.getOne(RHS->getType()); switch (Predicate) { - case CmpInst::ICMP_ULT: - if (RHS->getType()->isPointerTy()) - return; - RHS = SE.getUMaxExpr(RHS, One); - [[fallthrough]]; - case CmpInst::ICMP_SLT: { - RHS = SE.getMinusSCEV(RHS, One); - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - } - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_SGT: - RHS = SE.getAddExpr(RHS, One); - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - default: - break; + case CmpInst::ICMP_ULT: + if (RHS->getType()->isPointerTy()) + return; + RHS = SE.getUMaxExpr(RHS, One); + [[fallthrough]]; + case CmpInst::ICMP_SLT: { + RHS = SE.getMinusSCEV(RHS, One); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + } + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + RHS = SE.getAddExpr(RHS, One); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + default: + break; } SmallVector<const SCEV *, 16> Worklist(1, LHS); @@ -15840,8 +15847,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // Now apply the information from the collected conditions to // Guards.RewriteMap. Conditions are processed in reverse order, so the - // earliest conditions is processed first. This ensures the SCEVs with the + // earliest condition is processed first, except guards with divisibility + // information, which are moved to the back. This ensures the SCEVs with the // shortest dependency chains are constructed first. + SmallVector<std::tuple<CmpInst::Predicate, const SCEV *, const SCEV *>> + GuardsToProcess; for (auto [Term, EnterIfTrue] : reverse(Terms)) { SmallVector<Value *, 8> Worklist; SmallPtrSet<Value *, 8> Visited; @@ -15856,7 +15866,14 @@ EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate(); const auto *LHS = SE.getSCEV(Cmp->getOperand(0)); const auto *RHS = SE.getSCEV(Cmp->getOperand(1)); - CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap); + // If LHS is a constant, apply information to the other expression. + // TODO: If LHS is not a constant, check if using CompareSCEVComplexity + // can improve results. + if (isa<SCEVConstant>(LHS)) { + std::swap(LHS, RHS); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + GuardsToProcess.emplace_back(Predicate, LHS, RHS); continue; } @@ -15869,6 +15886,31 @@ } } + // Process divisibility guards in reverse order to populate DivGuards early. + DenseMap<const SCEV *, APInt> Multiples; + LoopGuards DivGuards(SE); + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) { + if (!isDivisibilityGuard(LHS, RHS, SE)) + continue; + collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap, + Multiples, SE); + } + + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) + CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards); + + // Apply divisibility information last.
This ensures it is applied to the + // outermost expression after other rewrites for the given value. + for (const auto &[K, Divisor] : Multiples) { + const SCEV *DivisorSCEV = SE.getConstant(Divisor); + Guards.RewriteMap[K] = + SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr( + Guards.rewrite(K), Divisor, SE), + DivisorSCEV), + DivisorSCEV); + ExprsToRewrite.push_back(K); + } + // Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of // the replacement expressions are contained in the ranges of the replaced // expressions. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c47a1c1..0426ac7 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1353,9 +1353,9 @@ TargetTransformInfo::getInlineCallPenalty(const Function *F, return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty); } -bool TargetTransformInfo::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool TargetTransformInfo::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { return TTIImpl->areTypesABICompatible(Caller, Callee, Types); } diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 55fa2df..a6c7e6a 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -1076,10 +1076,3 @@ StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) = LNStandardString; StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) = IndexString; - -constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Form>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Index>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Tag>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[]; diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp index 3de3dcc..80b421d 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp @@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> { static void inputOne(IO &IO, StringRef Key, MapDocNode &M) { ScalarDocNode KeyObj = M.getDocument()->getNode(); KeyObj.fromString(Key, ""); - IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]); + IO.mapRequired(Key, M.getMap()[KeyObj]); } static void output(IO &IO, MapDocNode &M) { for (auto I : M.getMap()) { - IO.mapRequired(I.first.toString().c_str(), I.second); + IO.mapRequired(I.first.toString(), I.second); } } }; diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3..003c850 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ public: Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = 
CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. +class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
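+ // Passing a null value-check callback means only the structure of the
+ // records is validated, not the cached values themselves.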
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad..e9bc6d8 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), 
"OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d..4d2de66 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ public: "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000..f3f6fa0 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49..aad77dc 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70d..2d4eedd 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ public: return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50b..3110577 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream with depth-first ordering to ensure + // all the child nodes are available in the downstream CAS before inserting + // the current object. This uses a similar algorithm to + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the state of visitation for the current node and all of + /// its parents. An UpstreamCursor holds information only from the upstream + /// CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, with nodes + /// either just stored in the CAS or nodes that already exist in the current + /// CAS. + SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The top of \p PrimaryRefStack contains the refs for the + // current node. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Remove the current node and its IDs from the stack. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + CursorStack.pop_back(); + + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + continue; + } + + // Check if the node exists already.
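+ // Children are dequeued in ref order: each one either reuses its
+ // already-imported translation or becomes a new cursor, so PrimaryRefStack
+ // accumulates the translated refs in the order the parent expects.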
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If it already exists, just enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000..7d29f44 --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) {} + +
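+ // OwnedDB is only set when the graph DB is opened standalone; when
+ // constructed from a UnifiedOnDiskCache, UnifiedDB keeps the graph DB
+ // alive. DB points at whichever of the two is in use.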
std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9d..245b6fb 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 2186071..15656cb 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000..ae9d818 --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock" which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock it gives +/// up and lets the next \p UnifiedOnDiskCache instance that closes attempt +/// again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use.
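+///
+/// A minimal usage sketch (hypothetical client code; 32 is the BLAKE3 digest
+/// size, and the calls mirror those used elsewhere in this patch):
+///
+///   std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+///   if (Error E = ondisk::UnifiedOnDiskCache::open(
+///                     Path, /*SizeLimit=*/std::nullopt, "BLAKE3", 32)
+///                     .moveInto(UniDB))
+///     return E;
+///   OnDiskGraphDB &Graph = UniDB->getGraphDB();
+///   OnDiskKeyValueDB &KV = UniDB->getKeyValueDB();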
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // little endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // little endian encoded. + UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
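+/// For example, given sub-directories 'corrupt.v1.4', 'v1.6', and 'v1.5',
+/// it returns {'corrupt.v1.4', 'v1.5', 'v1.6'} when IncludeCorrupt is true.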
+static Expected<SmallVector<std::string, 4>>
+getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) {
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector<DBDir> FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+      FoundDBDirs.push_back({0, std::string(SubDir)});
+      continue;
+    }
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  if (EC)
+    return createFileError(Path, EC);
+
+  // Sort in ascending order of the integer suffix; corrupt directories were
+  // assigned order 0, so they come first. Note that the comparator must be a
+  // strict weak ordering, hence '<' rather than '<='.
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    return LHS.Order < RHS.Order;
+  });
+
+  SmallVector<std::string, 4> DBDirs;
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return DBDirs;
+}
+
+static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) {
+  auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true);
+  if (!DBDirs)
+    return DBDirs.takeError();
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up, we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  // Keep the two most recent non-corrupt directories (primary & upstream);
+  // everything else is garbage.
+  for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) {
+    StringRef Back(DBDirs->back());
+    if (Back.starts_with(CorruptPrefix))
+      break;
+    DBDirs->pop_back();
+  }
+  return *DBDirs;
+}
+
+/// Given a sub-directory named 'v<version>.<x>', writes the next
+/// 'v<version>.<x+1>' name to \p OS.
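+/// For example, given 'v1.8' it writes 'v1.9'.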
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status("/proc", Status))
+    return createFileError("/proc", EC);
+  return Status.getLastModificationTime().time_since_epoch().count();
+#else
+  llvm::report_fatal_error("getBootTime unimplemented");
+#endif
+}
+
+Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded(
+    StringRef RootPath, StringRef HashName, unsigned HashByteSize,
+    bool CheckHash, bool AllowRecovery, bool ForceValidation,
+    std::optional<StringRef> LLVMCasBinaryPath) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, ValidationFilename);
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(FD != -1);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+    return createFileError(PathBuf, EC);
+  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+  SmallString<8> Bytes;
+  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+    return createFileError(PathBuf, std::move(E));
+
+  uint64_t ValidationBootTime = 0;
+  if (!Bytes.empty() &&
+      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+    return createFileError(PathBuf, errc::illegal_byte_sequence,
+                           "expected integer");
+
+  static uint64_t BootTime = 0;
+  if (BootTime == 0)
+    if (Error E = getBootTime().moveInto(BootTime))
+      return std::move(E);
+
+  if (ValidationBootTime == BootTime && !ForceValidation)
+    return ValidationResult::Skipped;
+
+  // Validate!
+  bool NeedsRecovery = false;
+  if (Error E =
+          LLVMCasBinaryPath
+              ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash)
+              : validateInProcess(RootPath, HashName, HashByteSize,
+                                  CheckHash)) {
+    if (AllowRecovery) {
+      consumeError(std::move(E));
+      NeedsRecovery = true;
+    } else {
+      return std::move(E);
+    }
+  }
+
+  if (NeedsRecovery) {
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, "lock");
+
+    int LockFD = -1;
+    if (std::error_code EC = sys::fs::openFileForReadWrite(
+            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+      return createFileError(PathBuf, EC);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+      if (EC == std::errc::no_lock_available)
+        return createFileError(
+            PathBuf, EC,
+            "CAS validation requires exclusive access but CAS was in use");
+      return createFileError(PathBuf, EC);
+    }
+    auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+    auto DBDirs = getAllDBDirs(RootPath);
+    if (!DBDirs)
+      return DBDirs.takeError();
+
+    for (StringRef DBDir : *DBDirs) {
+      sys::path::remove_filename(PathBuf);
+      sys::path::append(PathBuf, DBDir);
+      std::error_code EC;
+      int Attempt = 0, MaxAttempts = 100;
+      SmallString<128> GCPath;
+      for (; Attempt < MaxAttempts; ++Attempt) {
+        GCPath.assign(RootPath);
+        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+                                      "." + DBDir);
+        EC = sys::fs::rename(PathBuf, GCPath);
+        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
+        if (EC != errc::directory_not_empty && EC != errc::file_exists)
+          break;
+      }
+      if (Attempt == MaxAttempts)
+        return createStringError(
+            EC, "rename " + PathBuf +
+                    " failed: too many CAS directories awaiting pruning");
+      if (EC)
+        return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+                                         " failed: " + EC.message());
+    }
+  }
+
+  if (ValidationBootTime != BootTime) {
+    // Fix the filename in case we have an error to report.
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, ValidationFilename);
+    if (std::error_code EC = sys::fs::resize_file(FD, 0))
+      return createFileError(PathBuf, EC);
+    raw_fd_ostream OS(FD, /*shouldClose=*/false);
+    OS.seek(0); // resize does not reset position
+    OS << BootTime << '\n';
+    if (OS.has_error())
+      return createFileError(PathBuf, OS.error());
+  }
+
+  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+Expected<std::unique_ptr<UnifiedOnDiskCache>>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+                         StringRef HashName, unsigned HashByteSize,
+                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, "lock");
+  int LockFD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(LockFD != -1);
+  // Lock the directory with a shared lock, which will prevent other processes
+  // from creating a new chain (essentially, while a \p UnifiedOnDiskCache
+  // instance holds a shared lock the storage for the primary directory will
+  // grow unrestricted).
+  if (std::error_code EC =
+          lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared))
+    return createFileError(PathBuf, EC);
+
+  auto DBDirs = getAllDBDirs(RootPath);
+  if (!DBDirs)
+    return DBDirs.takeError();
+  if (DBDirs->empty())
+    DBDirs->push_back((Twine(DBDirPrefix) + "1").str());
+
+  assert(!DBDirs->empty());
+
+  // If there is only one directory, open the databases on it. If there are 2
+  // or more directories, get the most recent directories and chain them, with
+  // the most recent being the primary one. The remaining directories are
+  // unused data that can be garbage-collected.
+  auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  if (DBDirs->size() > 1) {
+    StringRef UpstreamDir = *(DBDirs->end() - 2);
+    PathBuf = RootPath;
+    sys::path::append(PathBuf, UpstreamDir);
+    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                      /*UpstreamDB=*/nullptr, FaultInPolicy)
+                      .moveInto(UpstreamGraphDB))
+      return std::move(E);
+    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                         /*ValueName=*/"objectid",
+                                         /*ValueSize=*/sizeof(uint64_t))
+                      .moveInto(UpstreamKVDB))
+      return std::move(E);
+  }
+
+  StringRef PrimaryDir = *(DBDirs->end() - 1);
+  PathBuf = RootPath;
+  sys::path::append(PathBuf, PrimaryDir);
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                    UpstreamGraphDB.get(), FaultInPolicy)
+                    .moveInto(PrimaryGraphDB))
+    return std::move(E);
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+  // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+  // including an extra translation step of the value during fault-in.
+  if (Error E =
+          OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                 /*ValueName=*/"objectid",
+                                 /*ValueSize=*/sizeof(uint64_t), UniDB.get())
+              .moveInto(PrimaryKVDB))
+    return std::move(E);
+
+  UniDB->RootPath = RootPath;
+  UniDB->SizeLimit = SizeLimit.value_or(0);
+  UniDB->LockFD = LockFD;
+  UniDB->NeedsGarbageCollection = DBDirs->size() > 2;
+  UniDB->PrimaryDBDir = PrimaryDir;
+  UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB);
+  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+  UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+  UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+  return std::move(UniDB);
+}
+
+void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+  this->SizeLimit = SizeLimit.value_or(0);
+}
+
+uint64_t UnifiedOnDiskCache::getStorageSize() const {
+  uint64_t TotalSize = getPrimaryStorageSize();
+  if (UpstreamGraphDB)
+    TotalSize += UpstreamGraphDB->getStorageSize();
+  if (UpstreamKVDB)
+    TotalSize += UpstreamKVDB->getStorageSize();
+  return TotalSize;
+}
+
+uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const {
+  return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize();
+}
+
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+  uint64_t CurSizeLimit = SizeLimit;
+  if (!CurSizeLimit)
+    return false;
+
+  // If utilization of the hard storage limit is beyond 85%, declare the size
+  // limit exceeded and request cleanup.
+  unsigned CurrentPercent =
+      std::max(PrimaryGraphDB->getHardStorageLimitUtilization(),
+               PrimaryKVDB->getHardStorageLimitUtilization());
+  if (CurrentPercent > 85)
+    return true;
+
+  // We allow each of the directories in the chain to reach up to half the
+  // intended size limit. Check whether the primary directory has exceeded half
+  // the limit or not, in order to decide whether we need to start a new chain.
+  //
+  // We could check the size limit against the sum of sizes of both the primary
+  // and upstream directories, but then if the upstream is significantly larger
+  // than the intended limit, it would trigger a new chain to be created before
+  // the primary has reached its own limit. Essentially, in such a situation we
+  // prefer reclaiming the storage later in order to have more consistent
+  // cache-hit behavior.
+  return (CurSizeLimit / 2) < getPrimaryStorageSize();
+}
+
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+  if (LockFD == -1)
+    return Error::success(); // Already closed.
+  auto CloseLock = make_scope_exit([&]() {
+    assert(LockFD >= 0);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+    LockFD = -1;
+  });
+
+  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+  UpstreamKVDB.reset();
+  PrimaryKVDB.reset();
+  UpstreamGraphDB.reset();
+  PrimaryGraphDB.reset();
+  if (std::error_code EC = unlockFileThreadSafe(LockFD))
+    return createFileError(RootPath, EC);
+
+  if (!ExceededSizeLimit)
+    return Error::success();
+
+  // The primary directory exceeded its intended size limit. Try to get an
+  // exclusive lock in order to create a new primary directory for the next
+  // time this \p UnifiedOnDiskCache path is opened.
+
+  if (std::error_code EC = tryLockFileThreadSafe(
+          LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) {
+    if (EC == errc::no_lock_available)
+      return Error::success(); // Couldn't get the exclusive lock; give up.
+    return createFileError(RootPath, EC);
+  }
+  auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+  // We managed to get an exclusive lock, which means there are no other open
+  // \p UnifiedOnDiskCache instances for the same path, so we can safely start
+  // a new primary directory. To start a new primary directory, we just have
+  // to create a new empty directory with the next consecutive index; since
+  // this is an atomic operation, we will leave the top-level directory in a
+  // consistent state even if the process dies during this code path.
+
+  SmallString<256> PathBuf(RootPath);
+  raw_svector_ostream OS(PathBuf);
+  OS << sys::path::get_separator();
+  getNextDBDirName(PrimaryDBDir, OS);
+  if (std::error_code EC = sys::fs::create_directory(PathBuf))
+    return createFileError(PathBuf, EC);
+
+  NeedsGarbageCollection = true;
+  return Error::success();
+}
+
+UnifiedOnDiskCache::UnifiedOnDiskCache() = default;
+
+UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); }
+
+Error UnifiedOnDiskCache::collectGarbage(StringRef Path) {
+  auto DBDirs = getAllGarbageDirs(Path);
+  if (!DBDirs)
+    return DBDirs.takeError();
+
+  SmallString<256> PathBuf(Path);
+  for (StringRef UnusedSubDir : *DBDirs) {
+    sys::path::append(PathBuf, UnusedSubDir);
+    if (std::error_code EC = sys::fs::remove_directories(PathBuf))
+      return createFileError(PathBuf, EC);
+    sys::path::remove_filename(PathBuf);
+  }
+  return Error::success();
+}
+
+Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); }
diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
index cc76063..2b6e2f0 100644
--- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
+++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
@@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> {
 template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
   static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
     HashNodeStable NodeStable;
-    io.mapRequired(Key.str().c_str(), NodeStable);
+    io.mapRequired(Key, NodeStable);
     unsigned Id;
     if (Key.getAsInteger(0, Id)) {
       io.setError("Id not an integer");
@@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
 
   static void output(IO &io, IdHashNodeStableMapTy &V) {
     for (auto Iter = V.begin(); Iter != V.end(); ++Iter)
-      io.mapRequired(utostr(Iter->first).c_str(), Iter->second);
+      io.mapRequired(utostr(Iter->first), Iter->second);
   }
 };
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index e5c85d5..1ea30d8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -745,11 +745,6 @@ void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const {
   Asm->emitInt32(QualifiedNameHash);
 }
 
-constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[];
-constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[];
-
 #ifndef NDEBUG
 void AppleAccelTableWriter::Header::print(raw_ostream &OS) const {
   OS << "Magic: " << format("0x%x", Magic) << "\n"
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8aa488f..f65d88a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const
MachineFunction &MF, int NumMBBSectionRanges,
       MF.hasBBSections() && NumMBBSectionRanges > 1,
       // Use static_cast to avoid breakage of tests on windows.
       static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls,
-      static_cast<bool>(EmitBBHash)};
+      static_cast<bool>(EmitBBHash), false};
 }
 
 void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 171fb83..98cdada 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -112,8 +112,7 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) {
 /// to the first intersecting scope range if one exists.
 static std::optional<ArrayRef<InsnRange>::iterator>
 intersects(const MachineInstr *StartMI, const MachineInstr *EndMI,
-           const ArrayRef<InsnRange> &Ranges,
-           const InstructionOrdering &Ordering) {
+           ArrayRef<InsnRange> Ranges, const InstructionOrdering &Ordering) {
   for (auto RangesI = Ranges.begin(), RangesE = Ranges.end();
        RangesI != RangesE; ++RangesI) {
     if (EndMI && Ordering.isBefore(EndMI, RangesI->first))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 555c56f..b16e1315 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1120,7 +1120,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
         constructMemberDIE(Buffer, DDTy);
       }
     } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
-      DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
+      DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property);
       StringRef PropertyName = Property->getName();
       addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
       if (Property->getType())
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
index fbcd614..485b44ae 100644
--- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -287,6 +287,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() {
       }
       continue;
     }
+    case 'h': { // Basic block hash specifier.
+      // Skip the profile when the profile iterator (FI) refers to the
+      // past-the-end element.
+      if (FI == ProgramPathAndClusterInfo.end())
+        continue;
+      for (auto BBIDHashStr : Values) {
+        auto [BBIDStr, HashStr] = BBIDHashStr.split(':');
+        unsigned long long BBID = 0, Hash = 0;
+        if (getAsUnsignedInteger(BBIDStr, 10, BBID))
+          return createProfileParseError(Twine("unsigned integer expected: '") +
+                                         BBIDStr + "'");
+        if (getAsUnsignedInteger(HashStr, 16, Hash))
+          return createProfileParseError(
+              Twine("unsigned integer expected in hex format: '") + HashStr +
+              "'");
+        FI->second.BBHashes[BBID] = Hash;
+      }
+      continue;
+    }
     default:
       return createProfileParseError(Twine("invalid specifier: '") +
                                      Twine(Specifier) + "'");
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8ea1326..0309e22 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,7 +368,7 @@ class CodeGenPrepare {
   std::unique_ptr<DominatorTree> DT;
 
 public:
-  CodeGenPrepare(){};
+  CodeGenPrepare() = default;
   CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
 
   /// If encounter huge function, we need to limit the build time.
bool IsHugeFunc = false; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9ace7d6..ec4d13f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -589,8 +589,8 @@ bool CombinerHelper::matchCombineShuffleVector( return true; } -void CombinerHelper::applyCombineShuffleVector( - MachineInstr &MI, const ArrayRef<Register> Ops) const { +void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, + ArrayRef<Register> Ops) const { Register DstReg = MI.getOperand(0).getReg(); Builder.setInsertPt(*MI.getParent(), MI); Register NewDstReg = MRI.cloneVirtualRegister(DstReg); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index ca82857..5fab6ec 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1893,6 +1893,8 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_UADDSAT: case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: return false; case TargetOpcode::G_SSHLSAT: case TargetOpcode::G_USHLSAT: diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index bb9c76f..8c6d219 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -363,8 +363,9 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: { // Shallow compare of the two RegMasks - const uint32_t *RegMask = getRegMask(); - const uint32_t *OtherRegMask = Other.getRegMask(); + const uint32_t *RegMask = isRegMask() ? getRegMask() : getRegLiveOut(); + const uint32_t *OtherRegMask = + isRegMask() ? Other.getRegMask() : Other.getRegLiveOut(); if (RegMask == OtherRegMask) return true; @@ -434,7 +435,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { if (const MachineFunction *MF = getMFIfAvailable(MO)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - const uint32_t *RegMask = MO.getRegMask(); + const uint32_t *RegMask = + MO.isRegMask() ? 
MO.getRegMask() : MO.getRegLiveOut();
     std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize);
     return hash_combine(MO.getType(), MO.getTargetFlags(),
                         stable_hash_combine(RegMaskHashes));
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 3ed1045..f18c051 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -334,7 +334,7 @@ public:
     LiveIntervals &LIS;
   };
 
-  MachineSchedulerImpl() {}
+  MachineSchedulerImpl() = default;
   // Migration only
   void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
  void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
@@ -358,7 +358,7 @@ public:
     MachineLoopInfo &MLI;
     AAResults &AA;
   };
-  PostMachineSchedulerImpl() {}
+  PostMachineSchedulerImpl() = default;
   // Migration only
   void setLegacyPass(MachineFunctionPass *P) { this->P = P; }
   void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; }
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 9d56696..6da708d 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -136,7 +136,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
     const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
     unsigned RegMaskSize =
         MachineOperand::getRegMaskSize(TRI->getNumRegs());
-    const uint32_t *RegMask = MO.getRegMask();
+    const uint32_t *RegMask =
+        MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut();
     std::vector<llvm::stable_hash> RegMaskHashes(RegMask,
                                                  RegMask + RegMaskSize);
     return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 697b779..ec6ffd4 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -206,7 +206,7 @@ private:
     bool Error = false; ///< Could not allocate.
 
     explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {}
-    explicit LiveReg() {}
+    explicit LiveReg() = default;
 
     unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }
   };
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index e17a214b..38f6deb 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -378,7 +378,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
 
 public:
   // For legacy pass only.
-  RegisterCoalescer() {}
+  RegisterCoalescer() = default;
   RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default;
 
   RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1ef5dc2..46c4bb8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
         (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
     SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
     AddToWorklist(Add.getNode());
+    // We can't set InBounds even if both original ptradds were InBounds and
+    // NUW: SDAG usually represents pointers as integers; therefore, the
+    // matched pattern behaves as if it had implicit casts:
+    // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z)
+    // The outer inbounds ptradd might therefore rely on a provenance that x
+    // does not have.
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
   }
 }
@@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
       // that.
      SDNodeFlags Flags =
           (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+      // We can't set InBounds even if both original ptradds were InBounds and
+      // NUW: SDAG usually represents pointers as integers; therefore, the
+      // matched pattern behaves as if it had implicit casts:
+      // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c)
+      // The outer inbounds ptradd might therefore rely on a provenance that
+      // GA does not have.
       SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
       AddToWorklist(Inner.getNode());
       return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
@@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
   bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
 
   // If both additions in the original were NUW, reassociation preserves that.
-  SDNodeFlags ReassocFlags =
-      (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+  SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags();
+  SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap;
+  if (CommonFlags.hasNoUnsignedWrap()) {
+    // If both operations are NUW and the PTRADD is inbounds, the offsets are
+    // both non-negative, so the reassociated PTRADDs are also inbounds.
+    ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds;
+  }
 
   if (ZIsConstant != YIsConstant) {
     if (YIsConstant)
@@ -9357,7 +9374,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
 // Check if the bytes offsets we are looking at match with either big or
 // little endian value loaded. Return true for big endian, false for little
 // endian, and std::nullopt if match failed.
-static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
                                        int64_t FirstOffset) {
   // The endian can be decided only when it is 2 bytes at least.
   unsigned Width = ByteOffsets.size();
@@ -22743,7 +22760,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
     NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL);
     PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
   } else {
-    NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx);
+    // The original DAG loaded the entire vector from memory, so arithmetic
+    // within it must be inbounds.
+    NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(),
+                                                 Idx);
   }
 
   return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe..58983cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
   case ISD::FAKE_USE:
     Res = SoftenFloatOp_FAKE_USE(N);
     break;
+  case ISD::STACKMAP:
+    Res = SoftenFloatOp_STACKMAP(N, OpNo);
+    break;
+  case ISD::PATCHPOINT:
+    Res = SoftenFloatOp_PATCHPOINT(N, OpNo);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) {
                      N->getOperand(0), Op1);
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) {
+  assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) { + assert(OpNo >= 7); + SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Float Result Expansion //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b1776ea..44e5a18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9656a30..ede522e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -658,6 +658,8 @@ private: SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); SDValue SoftenFloatOp_FAKE_USE(SDNode *N); + SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e409..9bdf822 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, DAG.getConstant(MaxIndex, dl, IdxVT)); } -SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { return getVectorSubVecPointer( DAG, VecPtr, VecVT, EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1), - Index); + Index, PtrArithFlags); } -SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - EVT SubVecVT, - SDValue Index) const { +SDValue 
+TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { SDLoc dl(Index); // Make sure the index type is big enough to compute in. Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType()); @@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getMemBasePlusOffset(VecPtr, Index, dl); + return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags); } //===----------------------------------------------------------------------===// @@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, !IsFast) return SDValue(); - SDValue NewPtr = - getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + SDValue NewPtr = getInboundsVectorElementPointer( + DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); // We are replacing a vector load with a scalar load. The new load must have // identical memory op ordering to the original. diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp index 4785f26..92b7fad 100644 --- a/llvm/lib/CodeGenTypes/LowLevelType.cpp +++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp @@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const { dbgs() << '\n'; } #endif - -const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo; diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h index 84757ae..970abdc 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h @@ -28,7 +28,7 @@ using MacroOffset2UnitMapTy = DenseMap<uint64_t, DwarfUnit *>; /// Base class for all Dwarf units(Compile unit/Type table unit). class DwarfUnit : public OutputSections { public: - virtual ~DwarfUnit() {} + virtual ~DwarfUnit() = default; DwarfUnit(LinkingGlobalData &GlobalData, unsigned ID, StringRef ClangModuleName) : OutputSections(GlobalData), ID(ID), ClangModuleName(ClangModuleName), diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h index f67536e..8ccb4a5 100644 --- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h +++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h @@ -22,7 +22,7 @@ class StringEntryToDwarfStringPoolEntryMap { public: StringEntryToDwarfStringPoolEntryMap(LinkingGlobalData &GlobalData) : GlobalData(GlobalData) {} - ~StringEntryToDwarfStringPoolEntryMap() {} + ~StringEntryToDwarfStringPoolEntryMap() = default; /// Create DwarfStringPoolEntry for specified StringEntry if necessary. /// Initialize DwarfStringPoolEntry with initial values. 
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 6c23ba8..23ab534 100644 --- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) { return std::nullopt; } - assert(contains(Index)); + if (!contains(Index)) + return std::nullopt; return Records[Index.toArrayIndex()].Type; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index db5cc37..6c78ef0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -129,6 +129,25 @@ prettyLanguageVersionString(const DWARFAttribute &AttrValue, static_cast<SourceLanguageName>(*LName), *LVersion); } +static llvm::Expected<llvm::StringRef> +getApplePropertyName(const DWARFDie &PropDIE) { + if (!PropDIE) + return llvm::createStringError("invalid DIE"); + + if (PropDIE.getTag() != DW_TAG_APPLE_property) + return llvm::createStringError("not referencing a DW_TAG_APPLE_property"); + + auto PropNameForm = PropDIE.find(DW_AT_APPLE_property_name); + if (!PropNameForm) + return ""; + + auto NameOrErr = PropNameForm->getAsCString(); + if (!NameOrErr) + return NameOrErr.takeError(); + + return *NameOrErr; +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -233,6 +252,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Die.getAttributeValueAsReferencedDie(FormValue).getName( DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; + } else if (Attr == DW_AT_APPLE_property) { + auto PropDIE = Die.getAttributeValueAsReferencedDie(FormValue); + if (auto PropNameOrErr = getApplePropertyName(PropDIE)) + OS << Space << "\"" << *PropNameOrErr << '\"'; + else + DumpOpts.RecoverableErrorHandler(createStringError( + errc::invalid_argument, + llvm::formatv("decoding DW_AT_APPLE_property_name: {}", + toString(PropNameOrErr.takeError())))); } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { DWARFDie D = resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index 7e606c6a..4e7db82 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -27,7 +27,7 @@ namespace llvm { namespace orc { -MemoryMapper::~MemoryMapper() {} +MemoryMapper::~MemoryMapper() = default; InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize) : PageSize(PageSize) {} diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt index 9275586..ca8192b 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt @@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess ExecutorSharedMemoryMapperService.cpp DefaultHostBootstrapValues.cpp ExecutorResolver.cpp + LibraryResolver.cpp JITLoaderGDB.cpp JITLoaderPerf.cpp JITLoaderVTune.cpp + LibraryScanner.cpp OrcRTBootstrap.cpp RegisterEHFrames.cpp SimpleExecutorDylibManager.cpp @@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess LINK_COMPONENTS ${intel_jit_profiling} + BinaryFormat + Object OrcShared Support TargetParser diff --git 
a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp new file mode 100644 index 0000000..35da82a --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp @@ -0,0 +1,370 @@ +//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Library resolution impl for unresolved symbols +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" + +#include "llvm/ADT/StringSet.h" + +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" + +#include <mutex> +#include <thread> + +#define DEBUG_TYPE "orc-resolver" + +namespace llvm::orc { + +LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S) + : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()), + LibPathResolver(S.PResolver + ? S.PResolver + : std::make_shared<PathResolver>(LibPathCache)), + ScanHelper(S.BasePaths, LibPathCache, LibPathResolver), + FB(S.FilterBuilder), LibMgr(), + ShouldScanCall(S.ShouldScanCall ? S.ShouldScanCall + : [](StringRef) -> bool { return true; }), + scanBatchSize(S.ScanBatchSize) { + + if (ScanHelper.getAllUnits().empty()) { + LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n"); + } +} + +std::unique_ptr<LibraryResolutionDriver> +LibraryResolutionDriver::create(const LibraryResolver::Setup &S) { + auto LR = std::make_unique<LibraryResolver>(S); + return std::unique_ptr<LibraryResolutionDriver>( + new LibraryResolutionDriver(std::move(LR))); +} + +void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) { + LR->ScanHelper.addBasePath(Path, K); +} + +bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Loaded); + + return true; +} + +bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Unloaded); + + return true; +} + +void LibraryResolutionDriver::resolveSymbols( + std::vector<std::string> Syms, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config) { + LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config); +} + +static bool shouldIgnoreSymbol(const object::SymbolRef &Sym, + uint32_t IgnoreFlags) { + Expected<uint32_t> FlagsOrErr = Sym.getFlags(); + if (!FlagsOrErr) { + consumeError(FlagsOrErr.takeError()); + return true; + } + + uint32_t Flags = *FlagsOrErr; + + using Filter = SymbolEnumeratorOptions; + if ((IgnoreFlags & Filter::IgnoreUndefined) && + (Flags & object::SymbolRef::SF_Undefined)) + return true; + if ((IgnoreFlags & Filter::IgnoreIndirect) && + (Flags & object::SymbolRef::SF_Indirect)) + return true; + if ((IgnoreFlags & Filter::IgnoreWeak) && + (Flags & object::SymbolRef::SF_Weak)) + 
return true;
+
+  return false;
+}
+
+bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+                                        const SymbolEnumeratorOptions &Opts) {
+  if (Path.empty())
+    return false;
+
+  ObjectFileLoader ObjLoader(Path);
+
+  auto ObjOrErr = ObjLoader.getObjectFile();
+  if (!ObjOrErr) {
+    std::string ErrMsg;
+    handleAllErrors(ObjOrErr.takeError(),
+                    [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+    LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path
+                      << "\nError: " << ErrMsg << "\n");
+    return false;
+  }
+
+  object::ObjectFile *Obj = &ObjOrErr.get();
+
+  auto processSymbolRange =
+      [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult {
+    for (const auto &Sym : Range) {
+      if (shouldIgnoreSymbol(Sym, Opts.FilterFlags))
+        continue;
+
+      auto NameOrErr = Sym.getName();
+      if (!NameOrErr) {
+        consumeError(NameOrErr.takeError());
+        continue;
+      }
+
+      StringRef Name = *NameOrErr;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res;
+    }
+    return EnumerateResult::Continue;
+  };
+
+  EnumerateResult Res = processSymbolRange(Obj->symbols());
+  if (Res != EnumerateResult::Continue)
+    return Res == EnumerateResult::Stop;
+
+  if (Obj->isELF()) {
+    const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj);
+    Res = processSymbolRange(ElfObj->getDynamicSymbolIterators());
+    if (Res != EnumerateResult::Continue)
+      return Res == EnumerateResult::Stop;
+  } else if (Obj->isCOFF()) {
+    const auto *CoffObj = cast<object::COFFObjectFile>(Obj);
+    for (auto I = CoffObj->export_directory_begin(),
+              E = CoffObj->export_directory_end();
+         I != E; ++I) {
+      StringRef Name;
+      if (I->getSymbolName(Name))
+        continue;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res == EnumerateResult::Stop;
+    }
+  } else if (Obj->isMachO()) {
+    // No additional symbol tables to enumerate for Mach-O.
+  }
+
+  return true;
+}
+
+class SymbolSearchContext {
+public:
+  SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+  bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+  void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+
+  inline bool allResolved() const { return Q.allResolved(); }
+
+  SymbolQuery &query() { return Q; }
+
+private:
+  SymbolQuery &Q;
+  DenseSet<LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+    LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+    const SymbolEnumeratorOptions &Opts) {
+  LLVM_DEBUG(dbgs() << "Checking unresolved symbols in library: "
+                    << Lib.getFileName() << "\n";);
+  StringSet<> DiscoveredSymbols;
+
+  if (!UnresolvedSymbols.hasUnresolved()) {
+    LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath()
+                      << " - no unresolved symbols remain.\n";);
+    return;
+  }
+
+  bool HasEnumerated = false;
+  auto enumerateSymbolsIfNeeded = [&]() {
+    if (HasEnumerated)
+      return;
+
+    HasEnumerated = true;
+
+    LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+                      << "\n";);
+    SymbolEnumerator::enumerateSymbols(
+        Lib.getFullPath(),
+        [&](StringRef Sym) {
+          DiscoveredSymbols.insert(Sym);
+          return EnumerateResult::Continue;
+        },
+        Opts);
+
+    if (DiscoveredSymbols.empty()) {
+      LLVM_DEBUG(dbgs() << "No symbols found; removing library: "
+                        << Lib.getFullPath() << "\n";);
+      LibMgr.removeLibrary(Lib.getFullPath());
+      return;
+    }
+  };
+
+  if (!Lib.hasFilter()) {
+    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+                      << "\n";);
+    enumerateSymbolsIfNeeded();
+    SmallVector<StringRef>
SymbolVec;
+    SymbolVec.reserve(DiscoveredSymbols.size());
+    for (const auto &KV : DiscoveredSymbols)
+      SymbolVec.push_back(KV.first());
+
+    Lib.ensureFilterBuilt(FB, SymbolVec);
+    LLVM_DEBUG({
+      dbgs() << "DiscoveredSymbols: " << DiscoveredSymbols.size() << "\n";
+      for (const auto &KV : DiscoveredSymbols)
+        dbgs() << "  DiscoveredSymbol: " << KV.first() << "\n";
+    });
+  }
+
+  const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+  bool HadAnySym = false;
+  LLVM_DEBUG(dbgs() << "Total unresolved symbols: " << Unresolved.size()
+                    << "\n";);
+  for (const auto &Sym : Unresolved) {
+    if (Lib.mayContain(Sym)) {
+      LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+                        << "' in library: " << Lib.getFullPath() << "\n";);
+      enumerateSymbolsIfNeeded();
+      if (DiscoveredSymbols.count(Sym) > 0) {
+        LLVM_DEBUG(dbgs() << "Resolved symbol: " << Sym
+                          << " in library: " << Lib.getFullPath() << "\n";);
+        UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+        HadAnySym = true;
+      }
+    }
+  }
+
+  using LibraryState = LibraryManager::LibState;
+  if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+    Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+    std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+    const SearchConfig &Config) {
+  SymbolQuery Q(SymbolList);
+
+  using LibraryState = LibraryManager::LibState;
+  using LibraryType = PathType;
+  auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+    LLVM_DEBUG(dbgs() << "Trying to resolve from state=" << static_cast<int>(S)
+                      << " type=" << static_cast<int>(K) << "\n";);
+
+    SymbolSearchContext Ctx(Q);
+    while (!Ctx.allResolved()) {
+      for (auto &Lib : LibMgr.getView(S, K)) {
+        if (Ctx.hasSearched(Lib.get()))
+          continue;
+
+        // TODO: Consider searching libraries asynchronously.
+        resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+        Ctx.markSearched(Lib.get());
+
+        if (Ctx.allResolved())
+          return;
+      }
+
+      if (Ctx.allResolved())
+        return;
+
+      if (!scanLibrariesIfNeeded(K, scanBatchSize))
+        break; // No more new libraries to scan.
+    }
+  };
+
+  for (const auto &[St, Ty] : Config.Policy.Plan) {
+    tryResolveFrom(St, Ty);
+    if (Q.allResolved())
+      break;
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Search complete.\n";
+    for (const auto &R : Q.getAllResults())
+      dbgs() << "Resolved symbol: " << R->Name << " -> " << R->ResolvedLibPath
+             << "\n";
+  });
+
+  OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+                    << (PK == PathType::User ?
"User" : "System") + << " libraries\n";); + if (!ScanHelper.leftToScan(PK)) + return false; + + LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall); + Scanner.scanNext(PK, BatchSize); + return true; +} + +bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib, + StringRef SymName, + std::vector<std::string> *AllSyms) { + SymbolEnumeratorOptions Opts; + return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts); +} + +bool LibraryResolver::symbolExistsInLibrary( + const LibraryInfo &Lib, StringRef SymName, + std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) { + bool Found = false; + + SymbolEnumerator::enumerateSymbols( + Lib.getFullPath(), + [&](StringRef Sym) { + if (AllSyms) + AllSyms->emplace_back(Sym.str()); + + if (Sym == SymName) { + Found = true; + } + + return EnumerateResult::Continue; + }, + Opts); + + return Found; +} + +} // end namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp new file mode 100644 index 0000000..d93f686 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp @@ -0,0 +1,1161 @@ +//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" + +#ifdef LLVM_ON_UNIX +#include <sys/stat.h> +#include <unistd.h> +#endif // LLVM_ON_UNIX + +#ifdef __APPLE__ +#include <sys/stat.h> +#undef LC_LOAD_DYLIB +#undef LC_RPATH +#endif // __APPLE__ + +#define DEBUG_TYPE "orc-scanner" + +namespace llvm::orc { + +void handleError(Error Err, StringRef context = "") { + consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + dbgs() << "LLVM Error"; + if (!context.empty()) + dbgs() << " [" << context << "]"; + dbgs() << ": " << EIB.message() << "\n"; + })); +} + +bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) { + Triple HostTriple(sys::getDefaultTargetTriple()); + Triple ObjTriple = Obj.makeTriple(); + + LLVM_DEBUG({ + dbgs() << "Host triple: " << HostTriple.str() + << ", Object triple: " << ObjTriple.str() << "\n"; + }); + + if (ObjTriple.getArch() != Triple::UnknownArch && + HostTriple.getArch() != ObjTriple.getArch()) + return false; + + if (ObjTriple.getOS() != Triple::UnknownOS && + HostTriple.getOS() != ObjTriple.getOS()) + return false; + + if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != ObjTriple.getEnvironment()) + return false; + + return true; +} + 
+Expected<object::OwningBinary<object::ObjectFile>> +ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath + << "\n";); + auto BinOrErr = object::createBinary(FilePath); + if (!BinOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath + << "\n";); + return BinOrErr.takeError(); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath + << "\n";); + + auto OwningBin = BinOrErr->takeBinary(); + object::Binary *Bin = OwningBin.first.get(); + + if (Bin->isArchive()) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: " + << FilePath << "\n";); + return createStringError(std::errc::invalid_argument, + "Archive files are not supported: %s", + FilePath.str().c_str()); + } + +#if defined(__APPLE__) + if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: " + << FilePath << "\n";); + for (auto ObjForArch : UB->objects()) { + auto ObjOrErr = ObjForArch.getAsObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG( + dbgs() + << "ObjectFileLoader: Skipping invalid architecture slice\n";); + + consumeError(ObjOrErr.takeError()); + continue; + } + + std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get()); + if (isArchitectureCompatible(*Obj)) { + LLVM_DEBUG( + dbgs() << "ObjectFileLoader: Found compatible object slice\n";); + + return object::OwningBinary<object::ObjectFile>( + std::move(Obj), std::move(OwningBin.second)); + + } else { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture " + "slice skipped\n";); + } + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in " + "universal binary\n";); + return createStringError(inconvertibleErrorCode(), + "No compatible object found in fat binary: %s", + FilePath.str().c_str()); + } +#endif + + auto ObjOrErr = + object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef()); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";); + return ObjOrErr.takeError(); + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";); + + std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr); + if (!isArchitectureCompatible(*Obj)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Incompatible object file: %s", + FilePath.str().c_str()); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";); + + return object::OwningBinary<object::ObjectFile>(std::move(Obj), + std::move(OwningBin.second)); +} + +template <class ELFT> +bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) { + if (ELFObj.getHeader().e_type != ELF::ET_DYN) + return false; + + auto PHOrErr = ELFObj.program_headers(); + if (!PHOrErr) { + consumeError(PHOrErr.takeError()); + return true; + } + + for (auto Phdr : *PHOrErr) { + if (Phdr.p_type == ELF::PT_INTERP) + return false; + } + + return true; +} + +bool isSharedLibraryObject(object::ObjectFile &Obj) { + if (Obj.isELF()) { + if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32LE->getELFFile()); + if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF64LE->getELFFile()); + if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32BE->getELFFile()); + 
if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj))
+      return isELFSharedLibrary(ELF64BE->getELFFile());
+  } else if (Obj.isMachO()) {
+    const object::MachOObjectFile *MachO =
+        dyn_cast<object::MachOObjectFile>(&Obj);
+    if (!MachO) {
+      LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";);
+      return false;
+    }
+    LLVM_DEBUG({
+      bool Result =
+          MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+      dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype
+             << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB
+             << "), shared: " << Result << "\n";
+    });
+
+    return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+  } else if (Obj.isCOFF()) {
+    const object::COFFObjectFile *Coff =
+        dyn_cast<object::COFFObjectFile>(&Obj);
+    if (!Coff)
+      return false;
+    return Coff->getCharacteristics() & COFF::IMAGE_FILE_DLL;
+  } else {
+    LLVM_DEBUG(dbgs() << "Unrecognized object format.\n";);
+  }
+
+  return false;
+}
+
+bool DylibPathValidator::isSharedLibrary(StringRef Path) {
+  LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path
+                    << "\n";);
+
+  auto FileType = sys::fs::get_file_type(Path, /*Follow=*/true);
+  if (FileType != sys::fs::file_type::regular_file) {
+    LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path
+                      << "\n";);
+    return false;
+  }
+
+  file_magic MagicCode;
+  if (identify_magic(Path, MagicCode))
+    return false;
+
+  // Skip archives.
+  if (MagicCode == file_magic::archive)
+    return false;
+
+  // Universal binary handling.
+#if defined(__APPLE__)
+  if (MagicCode == file_magic::macho_universal_binary) {
+    ObjectFileLoader ObjLoader(Path);
+    auto ObjOrErr = ObjLoader.getObjectFile();
+    if (!ObjOrErr) {
+      consumeError(ObjOrErr.takeError());
+      return false;
+    }
+    return isSharedLibraryObject(ObjOrErr.get());
+  }
+#endif
+
+  // Object file inspection for PE/COFF, ELF, and Mach-O.
+  bool NeedsObjectInspection =
+#if defined(_WIN32)
+      (MagicCode == file_magic::pecoff_executable);
+#elif defined(__APPLE__)
+      (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib ||
+       MagicCode == file_magic::macho_dynamically_linked_shared_lib ||
+       MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub);
+#elif defined(LLVM_ON_UNIX)
+#ifdef __CYGWIN__
+      (MagicCode == file_magic::pecoff_executable);
+#else
+      (MagicCode == file_magic::elf_shared_object);
+#endif
+#else
+#error "Unsupported platform."
+#endif
+
+  if (NeedsObjectInspection) {
+    ObjectFileLoader ObjLoader(Path);
+    auto ObjOrErr = ObjLoader.getObjectFile();
+    if (!ObjOrErr) {
+      consumeError(ObjOrErr.takeError());
+      return false;
+    }
+    return isSharedLibraryObject(ObjOrErr.get());
+  }
+
+  LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path
+                    << "\n";);
+  return false;
+}
+
+void DylibSubstitutor::configure(StringRef LoaderPath) {
+  SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr));
+  sys::path::remove_filename(ExecPath);
+
+  SmallString<512> LoaderDir;
+  if (LoaderPath.empty()) {
+    LoaderDir = ExecPath;
+  } else {
+    LoaderDir = LoaderPath.str();
+    if (!sys::fs::is_directory(LoaderPath))
+      sys::path::remove_filename(LoaderDir);
+  }
+
+#ifdef __APPLE__
+  Placeholders["@loader_path"] = std::string(LoaderDir);
+  Placeholders["@executable_path"] = std::string(ExecPath);
+#else
+  Placeholders["$origin"] = std::string(LoaderDir);
+#endif
+}
+
+std::optional<std::string>
+SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst,
+                            DylibPathValidator &Validator) const {
+  for (const auto &SP : Paths) {
+    std::string Base = Subst.substitute(SP);
+
+    SmallString<512> FullPath(Base);
+    if (!PlaceholderPrefix.empty() &&
+        Stem.starts_with_insensitive(PlaceholderPrefix))
+      FullPath.append(Stem.drop_front(PlaceholderPrefix.size()));
+    else
+      sys::path::append(FullPath, Stem);
+
+    LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath
+                      << "\n";);
+
+    if (auto Valid = Validator.validate(FullPath.str()))
+      return Valid;
+  }
+
+  return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::tryWithExtensions(StringRef LibStem) const {
+  LLVM_DEBUG(dbgs() << "tryWithExtensions: stem = " << LibStem << "\n";);
+  SmallVector<SmallString<256>, 8> Candidates;
+
+  // Add the platform-specific extension.
+#if defined(__APPLE__)
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".dylib";
+#elif defined(_WIN32)
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".dll";
+#else
+  Candidates.emplace_back(LibStem);
+  Candidates.back() += ".so";
+#endif
+
+  // Also try a "lib" prefix if it is not already present.
+  StringRef FileName = sys::path::filename(LibStem);
+  StringRef Base = sys::path::parent_path(LibStem);
+  if (!FileName.starts_with("lib")) {
+    SmallString<256> WithPrefix(Base);
+    // sys::path::append supplies the separator that a plain += would lose.
+    sys::path::append(WithPrefix, Twine("lib") + FileName);
+
+#if defined(__APPLE__)
+    WithPrefix += ".dylib";
+#elif defined(_WIN32)
+    WithPrefix += ".dll";
+#else
+    WithPrefix += ".so";
+#endif
+
+    Candidates.push_back(std::move(WithPrefix));
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "  Candidates to try:\n";
+    for (const auto &C : Candidates)
+      dbgs() << "    " << C << "\n";
+  });
+
+  // Try each candidate against every resolver.
+  for (const auto &Name : Candidates) {
+    LLVM_DEBUG(dbgs() << "  Trying candidate: " << Name << "\n";);
+
+    for (const auto &R : Resolvers) {
+      if (auto Res = R.resolve(Name, Substitutor, Validator))
+        return Res;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> No candidate resolved.\n";);
+
+  return std::nullopt;
+}
+
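The candidate generation in tryWithExtensions is easiest to sanity-check in isolation. The sketch below is not part of the patch; it mirrors the same logic with an invented helper name (printCandidates) so the expected names per platform are visible at a glance:

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Print the file names that would be tried for a bare stem such as "z".
static void printCandidates(StringRef Stem) {
#if defined(__APPLE__)
  const char *Ext = ".dylib";
#elif defined(_WIN32)
  const char *Ext = ".dll";
#else
  const char *Ext = ".so";
#endif
  SmallString<256> WithExt(Stem);
  WithExt += Ext;
  outs() << WithExt << "\n"; // e.g. "z.so"

  StringRef File = sys::path::filename(Stem);
  if (!File.starts_with("lib")) {
    // sys::path::append inserts the separator, so "dir" + "libz" stays valid.
    SmallString<256> Prefixed(sys::path::parent_path(Stem));
    sys::path::append(Prefixed, Twine("lib") + File);
    Prefixed += Ext;
    outs() << Prefixed << "\n"; // e.g. "libz.so"
  }
}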
+std::optional<std::string>
+DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const {
+  LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";);
+
+  // If it is an absolute path, don't iterate over the search paths.
+  if (sys::path::is_absolute(LibStem)) {
+    LLVM_DEBUG(dbgs() << "  -> Absolute path detected.\n";);
+    return Validator.validate(LibStem);
+  }
+
+  if (!LibStem.starts_with_insensitive("@rpath")) {
+    if (auto Norm = Validator.validate(Substitutor.substitute(LibStem))) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved after substitution: " << *Norm
+                        << "\n";);
+
+      return Norm;
+    }
+  }
+
+  for (const auto &R : Resolvers) {
+    LLVM_DEBUG(dbgs() << "  -> Resolving via search path...\n";);
+    if (auto Result = R.resolve(LibStem, Substitutor, Validator)) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved via search path: " << *Result
+                        << "\n";);
+
+      return Result;
+    }
+  }
+
+  // Otherwise vary LibStem with prefixes and platform extensions.
+  if (VariateLibStem) {
+    LLVM_DEBUG(dbgs() << "  -> Trying with extensions...\n";);
+
+    if (auto Norm = tryWithExtensions(LibStem)) {
+      LLVM_DEBUG(dbgs() << "  -> Resolved via tryWithExtensions: " << *Norm
+                        << "\n";);
+
+      return Norm;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> Could not resolve: " << LibStem << "\n";);
+
+  return std::nullopt;
+}
+
+#ifndef _WIN32
+mode_t PathResolver::lstatCached(StringRef Path) {
+  // Return the cached result if present.
+  if (auto Cache = LibPathCache->read_lstat(Path))
+    return *Cache;
+
+  // Not cached: perform lstat and store the result.
+  struct stat buf{};
+  mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode;
+
+  LibPathCache->insert_lstat(Path, st_mode);
+
+  return st_mode;
+}
+
+std::optional<std::string> PathResolver::readlinkCached(StringRef Path) {
+  // Return the cached result if present.
+  if (auto Cache = LibPathCache->read_link(Path))
+    return Cache;
+
+  // Not cached: call readlink and cache the result. Reserve one byte for the
+  // terminator, since readlink does not append it.
+  char buf[PATH_MAX];
+  ssize_t len;
+  if ((len = readlink(Path.str().c_str(), buf, sizeof(buf) - 1)) != -1) {
+    buf[len] = '\0';
+    std::string s(buf);
+    LibPathCache->insert_link(Path, s);
+    return s;
+  }
+  return std::nullopt;
+}
+
+void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved,
+                     SmallVector<StringRef, 16> &Component) {
+  StringRef Separator = sys::path::get_separator();
+  if (!BaseIsResolved) {
+    if (Path[0] == '~' &&
+        (Path.size() == 1 || sys::path::is_separator(Path[1]))) {
+      static SmallString<128> HomeP;
+      if (HomeP.str().empty())
+        sys::path::home_directory(HomeP);
+      StringRef(HomeP).split(Component, Separator, /*MaxSplit=*/-1,
+                             /*KeepEmpty=*/false);
+    } else if (BasePath.empty()) {
+      static SmallString<256> CurrentPath;
+      if (CurrentPath.str().empty())
+        sys::fs::current_path(CurrentPath);
+      StringRef(CurrentPath)
+          .split(Component, Separator, /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+    } else {
+      BasePath.split(Component, Separator, /*MaxSplit=*/-1,
+                     /*KeepEmpty=*/false);
+    }
+  }
+
+  Path.split(Component, Separator, /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+}
+
+void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) {
+  SmallVector<StringRef, 16> NormalizedPath;
+  for (auto &Part : PathParts) {
+    if (Part == ".") {
+      continue;
+    } else if (Part == "..") {
+      if (!NormalizedPath.empty() && NormalizedPath.back() != "..") {
+        NormalizedPath.pop_back();
+      } else {
+        NormalizedPath.push_back("..");
+      }
+    } else {
+      NormalizedPath.push_back(Part);
+    }
+  }
+  PathParts.swap(NormalizedPath);
+}
+#endif
+
+std::optional<std::string> PathResolver::realpathCached(StringRef Path,
+                                                        std::error_code &EC,
+                                                        StringRef Base,
+                                                        bool BaseIsResolved,
+                                                        long SymLoopLevel) {
+  EC.clear();
+
+  if (Path.empty()) {
+    EC = std::make_error_code(std::errc::no_such_file_or_directory);
+    LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";);
+
+    return std::nullopt;
+  }
+
+  if (SymLoopLevel <= 0) {
+    EC = std::make_error_code(std::errc::too_many_symbolic_link_levels);
+    LLVM_DEBUG(
+        dbgs() << "PathResolver::realpathCached: Too many symlink levels: "
+               << Path << "\n";);
+
+    return std::nullopt;
+  }
+
+  // Return the cached result if present.
+  bool IsRelative = sys::path::is_relative(Path);
+  if (!IsRelative) {
+    if (auto Cached = LibPathCache->read_realpath(Path)) {
+      EC = Cached->ErrnoCode;
+      if (EC) {
+        LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for "
+                          << Path << "\n";);
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "PathResolver::realpathCached: Cached (success) for "
+                   << Path << " => " << Cached->canonicalPath << "\n";);
+      }
+      return Cached->canonicalPath.empty()
+                 ? std::nullopt
+                 : std::make_optional(Cached->canonicalPath);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path
+                    << "\n";);
+
+  // Not cached: resolve component by component and cache the result.
+  StringRef Separator(sys::path::get_separator());
+  SmallString<256> Resolved(Separator);
+#ifndef _WIN32
+  SmallVector<StringRef, 16> Components;
+
+  if (IsRelative) {
+    if (BaseIsResolved) {
+      Resolved.assign(Base);
+      LLVM_DEBUG(dbgs() << "  Using resolved base: " << Base << "\n";);
+    }
+    createComponent(Path, Base, BaseIsResolved, Components);
+  } else {
+    Path.split(Components, Separator, /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  }
+
+  normalizePathSegments(Components);
+  LLVM_DEBUG({
+    for (auto &C : Components)
+      dbgs() << " " << C << " ";
+
+    dbgs() << "\n";
+  });
+
+  // Walk the components, resolving symlinks as we go.
+  for (const auto &Component : Components) {
+    if (Component == ".")
+      continue;
+    if (Component == "..") {
+      // Collapse "a/b/../c" to "a/c".
+      size_t S = Resolved.rfind(Separator);
+      if (S != llvm::StringRef::npos)
+        Resolved.resize(S);
+      if (Resolved.empty())
+        Resolved = Separator;
+      continue;
+    }
+
+    size_t OldSize = Resolved.size();
+    sys::path::append(Resolved, Component);
+    const char *ResolvedPath = Resolved.c_str();
+    LLVM_DEBUG(dbgs() << "  Processing component: " << Component << " => "
+                      << ResolvedPath << "\n";);
+    mode_t st_mode = lstatCached(ResolvedPath);
+
+    if (S_ISLNK(st_mode)) {
+      LLVM_DEBUG(dbgs() << "  Found symlink: " << ResolvedPath << "\n";);
+
+      auto SymlinkOpt = readlinkCached(ResolvedPath);
+      if (!SymlinkOpt) {
+        EC = std::make_error_code(std::errc::no_such_file_or_directory);
+        LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+        LLVM_DEBUG(dbgs() << "  Failed to read symlink: " << ResolvedPath
+                          << "\n";);
+
+        return std::nullopt;
+      }
+
+      StringRef Symlink = *SymlinkOpt;
+      LLVM_DEBUG(dbgs() << "  Symlink points to: " << Symlink << "\n";);
+
+      std::string ResolvedBase;
+      if (sys::path::is_relative(Symlink)) {
+        Resolved.resize(OldSize);
+        ResolvedBase = Resolved.str().str();
+      }
+
+      auto RealSymlink =
+          realpathCached(Symlink, EC, ResolvedBase,
+                         /*BaseIsResolved=*/true, SymLoopLevel - 1);
+      if (!RealSymlink) {
+        LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+        LLVM_DEBUG(dbgs() << "  Failed to resolve symlink target: " << Symlink
+                          << "\n";);
+
+        return std::nullopt;
+      }
+
+      Resolved.assign(*RealSymlink);
+      LLVM_DEBUG(dbgs() << "  Symlink resolved to: " << Resolved << "\n";);
+
+    } else if (st_mode == 0) {
+      EC = std::make_error_code(std::errc::no_such_file_or_directory);
+      LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+      LLVM_DEBUG(dbgs() << "  Component does not exist: " << ResolvedPath
+                        << "\n";);
+
+      return std::nullopt;
+    }
+  }
+#else
+  EC = sys::fs::real_path(Path, Resolved); // Windows fallback
+  if (EC) {
+    LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+    return std::nullopt;
+  }
+#endif
+
+  std::string Canonical = Resolved.str().str();
+  LibPathCache->insert_realpath(
+      Path, LibraryPathCache::PathInfo{Canonical, std::error_code()});
+  LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final resolved: " << Path
+                    << " => " << Canonical << "\n";);
+  return Canonical;
+}
+
+void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) {
+  std::error_code EC;
+  std::string Canon = resolveCanonical(Path, EC);
+  if (EC) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LibraryScanHelper::addBasePath: Failed to canonicalize path: "
+        << Path << "\n";);
+    return;
+  }
+  std::unique_lock<std::shared_mutex> Lock(Mtx);
+  if (LibSearchPaths.count(Canon)) {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: "
+                      << Canon << "\n";);
+    return;
+  }
+  if (K == PathType::Unknown)
+    K = classifyKind(Canon);
+  auto SP = std::make_shared<LibrarySearchPath>(Canon, K);
+  LibSearchPaths[Canon] = SP;
+
+  if (K == PathType::User) {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: "
+                      << Canon << "\n";);
+    UnscannedUsr.push_back(StringRef(SP->BasePath));
+  } else {
+    LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: "
+                      << Canon << "\n";);
+    UnscannedSys.push_back(StringRef(SP->BasePath));
+  }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) {
+  std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+  auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys;
+
+  std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+  while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) {
+    StringRef Base = Queue.front();
+    auto It = LibSearchPaths.find(Base);
+    if (It != LibSearchPaths.end()) {
+      auto &SP = It->second;
+      ScanState Expected = ScanState::NotScanned;
+      if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) {
+        Result.push_back(SP);
+      }
+    }
+    Queue.pop_front();
+  }
+
+  return Result;
+}
+
+bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const {
+  std::error_code EC;
+  std::string Canon = resolveCanonical(Path, EC);
+  if (EC)
+    return false;
+
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  return LibSearchPaths.count(Canon) > 0;
+}
+
+bool LibraryScanHelper::leftToScan(PathType K) const {
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  for (const auto &KV : LibSearchPaths) {
+    const auto &SP = KV.second;
+    if (SP->Kind == K && SP->State == ScanState::NotScanned)
+      return true;
+  }
+  return false;
+}
+
+void LibraryScanHelper::resetToScan() {
+  // The unscanned queues are mutated below, so take the lock exclusively.
+  std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+  for (auto &[_, SP] : LibSearchPaths) {
+    ScanState Expected = ScanState::Scanned;
+
+    if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned))
+      continue;
+
+    auto &TargetList =
+        (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys;
+    TargetList.emplace_back(SP->BasePath);
+  }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getAllUnits() const {
+  std::shared_lock<std::shared_mutex> Lock(Mtx);
+  std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+  Result.reserve(LibSearchPaths.size());
+  for (const auto &[_, SP] : LibSearchPaths) {
+    Result.push_back(SP);
+  }
+  return Result;
+}
+
+std::string LibraryScanHelper::resolveCanonical(StringRef Path,
+                                                std::error_code &EC) const {
+  auto Canon = LibPathResolver->resolve(Path, EC);
+  return EC ? Path.str() : *Canon;
+}
+
+PathType LibraryScanHelper::classifyKind(StringRef Path) const {
+  // Paths under the user's home directory are user paths.
+  const char *Home = getenv("HOME");
+  if (Home && Path.starts_with(Home))
+    return PathType::User;
+
+  static const std::array<std::string, 5> UserPrefixes = {
+      "/usr/local",    // often used by users for manual installs
+      "/opt/homebrew", // common on macOS
+      "/opt/local",    // MacPorts
+      "/home",         // Linux home dirs
+      "/Users",        // macOS user dirs
+  };
+
+  for (const auto &Prefix : UserPrefixes) {
+    if (Path.starts_with(Prefix))
+      return PathType::User;
+  }
+
+  return PathType::System;
+}
+
+Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) {
+  LibraryDepsInfo Deps;
+  LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";);
+  for (const auto &Command : Obj.load_commands()) {
+    switch (Command.C.cmd) {
+    case MachO::LC_LOAD_DYLIB: {
+      MachO::dylib_command DylibCmd = Obj.getDylibIDLoadCommand(Command);
+      const char *Name = Command.Ptr + DylibCmd.dylib.name;
+      Deps.addDep(Name);
+      LLVM_DEBUG(dbgs() << "  Found LC_LOAD_DYLIB: " << Name << "\n";);
+    } break;
+    case MachO::LC_LOAD_WEAK_DYLIB:
+    case MachO::LC_REEXPORT_DYLIB:
+    case MachO::LC_LOAD_UPWARD_DYLIB:
+    case MachO::LC_LAZY_LOAD_DYLIB:
+      // Not tracked yet.
+      break;
+    case MachO::LC_RPATH: {
+      // Extract and split the RPATH entries.
+      MachO::rpath_command RPathCmd = Obj.getRpathCommand(Command);
+      const char *RPath = Command.Ptr + RPathCmd.path;
+      LLVM_DEBUG(dbgs() << "  Found LC_RPATH: " << RPath << "\n";);
+
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(StringRef(RPath), RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+
+      for (const auto &Raw : RawPaths) {
+        Deps.addRPath(Raw.str());
+        LLVM_DEBUG(dbgs() << "    Parsed RPATH entry: " << Raw << "\n";);
+      }
+      break;
+    }
+    }
+  }
+
+  return std::move(Deps);
+}
+
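Both dependency parsers record RPATH/RUNPATH strings verbatim; the entries only become usable once loader tokens are expanded (the patch routes this through DylibResolver::resolvelinkerFlag, which is not shown here). As a rough point of reference only, a minimal expansion of the common $ORIGIN token could look like the following sketch; expandOrigin is an invented name, not an API of this patch:

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Path.h"

using namespace llvm;

// Expand a leading "$ORIGIN" in an RPATH/RUNPATH entry against the directory
// of the library carrying the entry, roughly as the dynamic loader does.
static std::string expandOrigin(StringRef Entry, StringRef LibPath) {
  if (!Entry.consume_front("$ORIGIN"))
    return Entry.str();
  SmallString<256> Out(sys::path::parent_path(LibPath));
  Out += Entry; // The remainder keeps its leading '/', if any.
  return Out.str().str();
}

// expandOrigin("$ORIGIN/../lib", "/opt/app/bin/libfoo.so")
//   yields "/opt/app/bin/../lib".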
+template <class ELFT>
+static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) {
+  auto DynamicEntriesOrError = Elf.dynamicEntries();
+  if (!DynamicEntriesOrError)
+    return DynamicEntriesOrError.takeError();
+
+  for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+    if (Dyn.d_tag == ELF::DT_STRTAB) {
+      auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr());
+      if (!MappedAddrOrError)
+        return MappedAddrOrError.takeError();
+      return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+    }
+  }
+
+  // If the dynamic segment is not present, fall back to the sections.
+  auto SectionsOrError = Elf.sections();
+  if (!SectionsOrError)
+    return SectionsOrError.takeError();
+
+  for (const typename ELFT::Shdr &Sec : *SectionsOrError) {
+    if (Sec.sh_type == ELF::SHT_DYNSYM)
+      return Elf.getStringTableForSymtab(Sec);
+  }
+
+  return make_error<StringError>("dynamic string table not found",
+                                 inconvertibleErrorCode());
+}
+
+template <typename ELFT>
+Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) {
+  LibraryDepsInfo Deps;
+  Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+  if (!StrTabOrErr)
+    return StrTabOrErr.takeError();
+
+  const char *Data = StrTabOrErr->data();
+
+  auto DynamicEntriesOrError = Elf.dynamicEntries();
+  if (!DynamicEntriesOrError)
+    return DynamicEntriesOrError.takeError();
+
+  for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+    switch (Dyn.d_tag) {
+    case ELF::DT_NEEDED:
+      Deps.addDep(Data + Dyn.d_un.d_val);
+      break;
+    case ELF::DT_RPATH: {
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(Data + Dyn.d_un.d_val, RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+      for (const auto &Raw : RawPaths)
+        Deps.addRPath(Raw.str());
+      break;
+    }
+    case ELF::DT_RUNPATH: {
+      SmallVector<StringRef, 4> RawPaths;
+      SplitString(Data + Dyn.d_un.d_val, RawPaths,
+                  sys::EnvPathSeparator == ':' ? ":" : ";");
+      for (const auto &Raw : RawPaths)
+        Deps.addRunPath(Raw.str());
+      break;
+    }
+    case ELF::DT_FLAGS_1:
+      // DF_1_PIE marks a position-independent executable.
+      if (Dyn.d_un.d_val & ELF::DF_1_PIE)
+        Deps.isPIE = true;
+      break;
+    default:
+      break;
+    }
+  }
+
+  return std::move(Deps);
+}
+
+Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) {
+  using namespace object;
+  LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";);
+  if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+  if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+    return parseELF(ELF->getELFFile());
+
+  LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";);
+  return createStringError(std::errc::not_supported, "Unknown ELF format");
+}
+
+Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) {
+  LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath
+                    << "\n";);
+
+  ObjectFileLoader ObjLoader(FilePath);
+  auto ObjOrErr = ObjLoader.getObjectFile();
+  if (!ObjOrErr) {
+    LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";);
+    return ObjOrErr.takeError();
+  }
+
+  object::ObjectFile *Obj = &ObjOrErr.get();
+
+  if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj)) {
+    LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+                      << " is an ELF object\n";);
+
+    return parseELFDeps(*ELFObj);
+  }
+
+  if (auto *MachO = dyn_cast<object::MachOObjectFile>(Obj)) {
+    LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+                      << " is a Mach-O object\n";);
+    return parseMachODeps(*MachO);
+  }
+
+  if (Obj->isCOFF()) {
+    // TODO: COFF support.
+    return LibraryDepsInfo();
+  }
+
+  LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file "
+                    << FilePath << "\n";);
+  return createStringError(inconvertibleErrorCode(),
+                           "Unsupported binary format: %s",
+                           FilePath.str().c_str());
+}
+
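extractDeps reports failures through llvm::Expected, and its callers below either propagate with takeError() or deliberately discard with consumeError(). A self-contained sketch of that contract, with parseCount standing in as an invented Expected-returning API:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Invented stand-in for any Expected-returning API such as extractDeps().
static Expected<int> parseCount(StringRef S) {
  int N;
  if (S.getAsInteger(10, N))
    return createStringError(inconvertibleErrorCode(), "not an integer: %s",
                             S.str().c_str());
  return N;
}

int main() {
  if (Expected<int> C = parseCount("42"))
    outs() << "parsed " << *C << "\n";
  else
    logAllUnhandledErrors(C.takeError(), errs(), "parse failed: ");

  // Skipping a failed result still requires consuming the Error;
  // destroying an unchecked Error aborts in assertion-enabled builds.
  Expected<int> Bad = parseCount("nope");
  if (!Bad)
    consumeError(Bad.takeError());
  return 0;
}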
+std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) {
+  std::error_code EC;
+
+  LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";);
+
+  // [1] Check file existence early.
+  if (!sys::fs::exists(FilePath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: file does not exist.\n";);
+
+    return std::nullopt;
+  }
+
+  // [2] Resolve to a canonical path.
+  auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC);
+  if (EC || !CanonicalPathOpt) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: failed to resolve path (EC="
+                      << EC.message() << ").\n";);
+
+    return std::nullopt;
+  }
+
+  const std::string &CanonicalPath = *CanonicalPathOpt;
+  LLVM_DEBUG(dbgs() << "  -> Canonical path: " << CanonicalPath << "\n");
+
+  // [3] Skip directories.
+  if (sys::fs::is_directory(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: path is a directory.\n";);
+
+    return std::nullopt;
+  }
+
+  // [4] Skip anything that is not a shared library.
+  if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: not a shared library.\n";);
+    return std::nullopt;
+  }
+
+  // [5] Skip paths we have already seen (tracked in the cache).
+  if (ScanHelper.hasSeenOrMark(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: already seen.\n";);
+
+    return std::nullopt;
+  }
+
+  // [6] Skip libraries already tracked by the LibraryManager.
+  if (LibMgr.hasLibrary(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: already tracked by LibraryManager.\n";);
+
+    return std::nullopt;
+  }
+
+  // [7] Run the user-defined hook (default: always true).
+  if (!ShouldScanCall(CanonicalPath)) {
+    LLVM_DEBUG(dbgs() << "  -> Skipped: user-defined hook rejected.\n";);
+
+    return std::nullopt;
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> Accepted: ready to scan " << CanonicalPath
+                    << "\n";);
+  return CanonicalPath;
+}
+
+void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int Level) {
+  LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath
+                    << ", level=" << Level << "\n";);
+  auto CanonPathOpt = shouldScan(FilePath);
+  if (!CanonPathOpt) {
+    LLVM_DEBUG(dbgs() << "  Skipped (shouldScan rejected it): " << FilePath
+                      << "\n";);
+
+    return;
+  }
+  const std::string CanonicalPath = *CanonPathOpt;
+
+  auto DepsOrErr = extractDeps(CanonicalPath);
+  if (!DepsOrErr) {
+    LLVM_DEBUG(dbgs() << "  Failed to extract deps for: " << CanonicalPath
+                      << "\n";);
+    handleError(DepsOrErr.takeError());
+    return;
+  }
+
+  LibraryDepsInfo &Deps = *DepsOrErr;
+
+  LLVM_DEBUG({
+    dbgs() << "  Found deps:\n";
+    for (const auto &Dep : Deps.deps)
+      dbgs() << "    : " << Dep << "\n";
+    dbgs() << "  Found rpath entries: " << Deps.rpath.size() << "\n";
+    for (const auto &R : Deps.rpath)
+      dbgs() << "    : " << R << "\n";
+    dbgs() << "  Found runpath entries: " << Deps.runPath.size() << "\n";
+    for (const auto &R : Deps.runPath)
+      dbgs() << "    : " << R << "\n";
+  });
+
+  if (Deps.isPIE && Level == 0) {
+    LLVM_DEBUG(dbgs() << "  Skipped PIE executable at top level: "
+                      << CanonicalPath << "\n";);
+
+    return;
+  }
+
+  bool Added = LibMgr.addLibrary(CanonicalPath, K);
+  if (!Added) {
+    LLVM_DEBUG(dbgs() << "  Already added: " << CanonicalPath << "\n";);
+    return;
+  }
+
+  // Heuristic 1: without RPATH/RUNPATH entries, dependencies can only live in
+  // search paths that are scanned anyway, so skip them.
+  if (Deps.rpath.empty() && Deps.runPath.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): "
+               << CanonicalPath << "\n";);
+    return;
+  }
+
+  // Heuristic 2: if every RPATH/RUNPATH base is already tracked, the
+  // dependencies cannot lead anywhere new.
+  auto AllTracked = [&](const auto &Paths) {
+    LLVM_DEBUG(dbgs() << "  Checking: " << Paths.size() << "\n";);
+
    return llvm::all_of(Paths, [&](StringRef P) {
+      LLVM_DEBUG(dbgs() << "  Checking isTrackedBasePath: " << P << "\n";);
+      return ScanHelper.isTrackedBasePath(
+          DylibResolver::resolvelinkerFlag(P, CanonicalPath));
+    });
+  };
+
+  if (AllTracked(Deps.rpath) && AllTracked(Deps.runPath)) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): "
+               << CanonicalPath << "\n";);
+    return;
+  }
+
+  DylibPathValidator Validator(ScanHelper.getPathResolver());
+  DylibResolver Resolver(Validator);
+  Resolver.configure(CanonicalPath,
+                     {{Deps.rpath, SearchPathType::RPath},
+                      {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys},
+                      {Deps.runPath, SearchPathType::RunPath}});
+  for (StringRef Dep : Deps.deps) {
+    LLVM_DEBUG(dbgs() << "  Resolving dep: " << Dep << "\n";);
+    auto DepFullOpt = Resolver.resolve(Dep);
+    if (!DepFullOpt) {
+      LLVM_DEBUG(dbgs() << "  Failed to resolve dep: " << Dep << "\n";);
+
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "  Resolved dep to: " << *DepFullOpt << "\n";);
+
+    handleLibrary(*DepFullOpt, K, Level + 1);
+  }
+}
+
+void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) {
+  if (SP->BasePath.empty() || !sys::fs::is_directory(SP->BasePath)) {
+    LLVM_DEBUG(
+        dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: "
+               << SP->BasePath << "\n";);
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: "
+                    << SP->BasePath << "\n";);
+  std::error_code EC;
+
+  SP->State.store(ScanState::Scanning);
+
+  for (sys::fs::directory_iterator It(SP->BasePath, EC), End; It != End && !EC;
+       It.increment(EC)) {
+    auto Entry = *It;
+    if (!Entry.status())
+      continue;
+
+    auto Status = *Entry.status();
+    if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) {
+      LLVM_DEBUG(dbgs() << "  Found file: " << Entry.path() << "\n";);
+      // TODO: Consider scanning entries asynchronously.
+      handleLibrary(Entry.path(), SP->Kind);
+    }
+  }
+
+  SP->State.store(ScanState::Scanned);
+}
+
+void LibraryScanner::scanNext(PathType K, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size "
+                    << BatchSize << " for kind "
+                    << (K == PathType::User ?
"User" : "System") << "\n";); + + auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize); + for (auto &SP : SearchPaths) { + LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath + << "\n";); + + scanBaseDir(SP); + } +} + +} // end namespace llvm::orc diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index a08502e..5851cfc 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -528,7 +528,7 @@ public: SMRange getRange() const { return Range; } static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg, - SMRange Range = std::nullopt) { + SMRange Range = {}) { return make_error<ErrorDiagnostic>( SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range); } diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index df88490..b546e81 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -12,7 +12,6 @@ #include "llvm/TargetParser/Triple.h" namespace llvm { -extern llvm::cl::opt<bool> DebugInfoCorrelate; extern llvm::cl::opt<llvm::InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate; } // namespace llvm @@ -64,8 +63,7 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, } std::string getDefaultProfileGenName() { - return llvm::DebugInfoCorrelate || - llvm::ProfileCorrelate != InstrProfCorrelator::NONE + return llvm::ProfileCorrelate != InstrProfCorrelator::NONE ? "default_%m.proflite" : "default_%m.profraw"; } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0e5926f..fff9a81 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -528,7 +528,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION); Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems); auto Int32Ty = Type::getInt32Ty(Builder.getContext()); - constexpr const size_t MaxDim = 3; + constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b838e36..58b7ddd 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -730,7 +730,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // (arm|aarch64).neon.bfdot.*'. Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name) - .Cases("v2f32.v8i8", "v4f32.v16i8", + .Cases({"v2f32.v8i8", "v4f32.v16i8"}, IsArm ? 
(Intrinsic::ID)Intrinsic::arm_neon_bfdot : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot) .Default(Intrinsic::not_intrinsic); @@ -1456,7 +1456,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (F->arg_size() == 1) { Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name) - .Cases("brev32", "brev64", Intrinsic::bitreverse) + .Cases({"brev32", "brev64"}, Intrinsic::bitreverse) .Case("clz.i", Intrinsic::ctlz) .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); @@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("fabs.")) // nvvm.fabs.{f,ftz.f,d} Expand = Name == "f" || Name == "ftz.f" || Name == "d"; + else if (Name.consume_front("ex2.approx.")) + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Expand = + Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2"; else if (Name.consume_front("max.") || Name.consume_front("min.")) // nvvm.{min,max}.{i,ii,ui,ull} Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" || @@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz : Intrinsic::nvvm_fabs; Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); + } else if (Name.consume_front("ex2.approx.")) { + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Intrinsic::ID IID = Name.starts_with("ftz") ? Intrinsic::nvvm_ex2_approx_ftz + : Intrinsic::nvvm_ex2_approx; + Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); } else if (Name.starts_with("atomic.load.add.f32.p") || Name.starts_with("atomic.load.add.f64.p")) { Value *Ptr = CI->getArgOperand(0); diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 51fb40b..e3e8d89 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -535,7 +535,7 @@ struct ConstantPtrAuthKeyType { unsigned getHash() const { return hash_combine_range(Operands); } - using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass; + using TypeClass = ConstantInfo<ConstantPtrAuth>::TypeClass; ConstantPtrAuth *create(TypeClass *Ty) const { return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]), diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 62fd62c..3394754 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -34,8 +34,6 @@ static cl::opt<bool> ImportConstantsWithRefs( "import-constants-with-refs", cl::init(true), cl::Hidden, cl::desc("Import constant global variables with references")); -constexpr uint32_t FunctionSummary::ParamAccess::RangeWidth; - FunctionSummary FunctionSummary::ExternalNode = FunctionSummary::makeDummyFunctionSummary( SmallVector<FunctionSummary::EdgeTy, 0>()); @@ -88,8 +86,6 @@ std::pair<unsigned, unsigned> FunctionSummary::specialRefCounts() const { return {RORefCnt, WORefCnt}; } -constexpr uint64_t ModuleSummaryIndex::BitcodeSummaryVersion; - uint64_t ModuleSummaryIndex::getFlags() const { uint64_t Flags = 0; // Flags & 0x4 is reserved. DO NOT REUSE. 
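The definitions removed above (and the matching removals in APFloat.cpp, BranchProbability.cpp, StringRef.cpp, and raw_ostream.cpp further down) all rely on the same C++17 rule: a constexpr static data member is implicitly inline, so the out-of-line definition is redundant. A minimal illustration with invented names:

#include <cstdint>

struct Summary {
  // Since C++17 this declaration is also a definition (implicitly inline).
  static constexpr uint32_t RangeWidth = 64;
};

// Pre-C++17, odr-using the member (e.g. taking its address) required an
// out-of-line definition in exactly one translation unit:
//   constexpr uint32_t Summary::RangeWidth;
// That is the line these hunks delete.

const uint32_t *P = &Summary::RangeWidth; // OK without the definition
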
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b618222..23be42f 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1076,63 +1076,59 @@ Expected<ArrayRef<SymbolResolution>> LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { llvm::TimeTraceScope timeScope("LTO add thin LTO"); + const auto BMID = BM.getModuleIdentifier(); ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { assert(!ResTmp.empty()); const SymbolResolution &R = ResTmp.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && R.Prevailing) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (R.Prevailing) - ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()); + ThinLTO.setPrevailingModuleForGUID(GUID, BMID); } } - if (Error Err = - BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), - [&](GlobalValue::GUID GUID) { - return ThinLTO.isPrevailingModuleForGUID( - GUID, BM.getModuleIdentifier()); - })) + if (Error Err = BM.readSummary( + ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) { + return ThinLTO.isPrevailingModuleForGUID(GUID, BMID); + })) return Err; - LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); + LLVM_DEBUG(dbgs() << "Module " << BMID << "\n"); for (const InputFile::Symbol &Sym : Syms) { assert(!Res.empty()); const SymbolResolution &R = Res.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && + (R.Prevailing || R.FinalDefinitionInLinkageUnit)) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) { - assert( - ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier())); + assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID)); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. if (R.LinkerRedefined) - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) S->setLinkage(GlobalValue::WeakAnyLinkage); } // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. if (R.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) { S->setDSOLocal(true); } } } } - if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second) + if (!ThinLTO.ModuleMap.insert({BMID, BM}).second) return make_error<StringError>( "Expected at most one ThinLTO module per bitcode file", inconvertibleErrorCode()); @@ -1143,10 +1139,10 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // This is a fuzzy name matching where only modules with name containing the // specified switch values are going to be compiled. 
for (const std::string &Name : Conf.ThinLTOModulesToCompile) { - if (BM.getModuleIdentifier().contains(Name)) { - ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"); + if (BMID.contains(Name)) { + ThinLTO.ModulesToCompile->insert({BMID, BM}); + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BMID << " to compile\n"); + break; } } } diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 71bd397..a3eaaa7 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -520,7 +520,7 @@ GOFFObjectWriter::GOFFObjectWriter( std::unique_ptr<MCGOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : TargetObjectWriter(std::move(MOTW)), OS(OS) {} -GOFFObjectWriter::~GOFFObjectWriter() {} +GOFFObjectWriter::~GOFFObjectWriter() = default; uint64_t GOFFObjectWriter::writeObject() { uint64_t Size = GOFFWriter(OS, *Asm).writeObject(); diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp index 5eda039..ebed411 100644 --- a/llvm/lib/MC/MCDXContainerWriter.cpp +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -16,7 +16,7 @@ using namespace llvm; -MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {} +MCDXContainerTargetWriter::~MCDXContainerTargetWriter() = default; uint64_t DXContainerObjectWriter::writeObject() { auto &Asm = *this->Asm; diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index 8b228db..ad6397b 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; -MCGOFFStreamer::~MCGOFFStreamer() {} +MCGOFFStreamer::~MCGOFFStreamer() = default; GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index a6188f0..1af4a29 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SaveAndRestore.h" diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index dd1bc2b..3c9ab8e 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -228,11 +228,9 @@ public: AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; const AsmToken &Lex() override; @@ -312,7 +310,7 @@ private: void printMacroInstantiations(); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 1a3752f..911d92c 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ 
b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -695,15 +695,15 @@ bool ELFAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) { static MCSymbolAttr MCAttrForString(StringRef Type) { return StringSwitch<MCSymbolAttr>(Type) - .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) - .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) - .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) - .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) - .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) - .Cases("STT_GNU_IFUNC", "gnu_indirect_function", - MCSA_ELF_TypeIndFunction) - .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) - .Default(MCSA_Invalid); + .Cases({"STT_FUNC", "function"}, MCSA_ELF_TypeFunction) + .Cases({"STT_OBJECT", "object"}, MCSA_ELF_TypeObject) + .Cases({"STT_TLS", "tls_object"}, MCSA_ELF_TypeTLS) + .Cases({"STT_COMMON", "common"}, MCSA_ELF_TypeCommon) + .Cases({"STT_NOTYPE", "notype"}, MCSA_ELF_TypeNoType) + .Cases({"STT_GNU_IFUNC", "gnu_indirect_function"}, + MCSA_ELF_TypeIndFunction) + .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) + .Default(MCSA_Invalid); } /// parseDirectiveELFType diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 8a8f111..3a85770 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -483,11 +483,9 @@ public: AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; enum ExpandKind { ExpandMacros, DoNotExpandMacros }; const AsmToken &Lex(ExpandKind ExpandNextToken); @@ -592,7 +590,7 @@ private: bool expandStatement(SMLoc Loc); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } @@ -5325,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() { bool MasmParser::isMacroLikeDirective() { if (getLexer().is(AsmToken::Identifier)) { bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier()) - .CasesLower("repeat", "rept", true) + .CasesLower({"repeat", "rept"}, true) .CaseLower("while", true) - .CasesLower("for", "irp", true) - .CasesLower("forc", "irpc", true) + .CasesLower({"for", "irp"}, true) + .CasesLower({"forc", "irpc"}, true) .Default(false); if (IsMacroLike) return true; diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h index 66d7f01..3ee0e06 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.h +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h @@ -50,7 +50,7 @@ class COFFWriter { Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA); public: - virtual ~COFFWriter() {} + virtual ~COFFWriter() = default; Error write(); COFFWriter(Object &Obj, raw_ostream &Out) diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 4f6473f..2783ef27 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -134,7 +134,7 @@ private: using Elf_Sym = typename ELFT::Sym; public: - ~ELFSectionWriter() override {} + ~ELFSectionWriter() override = default; Error visit(const 
SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; Error visit(const GnuDebugLinkSection &Sec) override; @@ -180,7 +180,7 @@ public: class BinarySectionWriter : public SectionWriter { public: - ~BinarySectionWriter() override {} + ~BinarySectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; @@ -346,7 +346,7 @@ private: size_t totalSize() const; public: - ~ELFWriter() override {} + ~ELFWriter() override = default; bool WriteSectionHeaders; // For --only-keep-debug, select an alternative section/segment layout @@ -367,7 +367,7 @@ private: uint64_t TotalSize = 0; public: - ~BinaryWriter() override {} + ~BinaryWriter() override = default; Error finalize() override; Error write() override; BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config) @@ -784,7 +784,7 @@ private: SymbolTableSection *Symbols = nullptr; public: - ~SectionIndexSection() override {} + ~SectionIndexSection() override = default; void addIndex(uint32_t Index) { assert(Size > 0); Indexes.push_back(Index); diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h index e315e6fd..940ba4c 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.h +++ b/llvm/lib/ObjCopy/MachO/MachOReader.h @@ -23,7 +23,7 @@ namespace macho { // raw binaries and regular MachO object files. class Reader { public: - virtual ~Reader(){}; + virtual ~Reader() = default; virtual Expected<std::unique_ptr<Object>> create() const = 0; }; diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h index 8620548..47639ad 100644 --- a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h @@ -20,7 +20,7 @@ namespace xcoff { class XCOFFWriter { public: - virtual ~XCOFFWriter() {} + virtual ~XCOFFWriter() = default; XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} Error write(); diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6da97f9..354c51d 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, }; uint8_t Version = 0; - uint8_t Feature = 0; + uint16_t Feature = 0; BBAddrMap::Features FeatEnable{}; while (!ULEBSizeErr && !MetadataDecodeErr && Cur && Cur.tell() < Content.size()) { Version = Data.getU8(Cur); if (!Cur) break; - if (Version < 2 || Version > 4) + if (Version < 2 || Version > 5) return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + Twine(static_cast<int>(Version))); - Feature = Data.getU8(Cur); // Feature byte + Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur); if (!Cur) break; auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature); @@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, "basic block hash feature is enabled: version = " + Twine(static_cast<int>(Version)) + " feature = " + Twine(static_cast<int>(Feature))); + if (FeatEnable.PostLinkCfg && Version < 5) + return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when " + "post link cfg feature is enabled: version = " + + Twine(static_cast<int>(Version)) + + " feature = " + Twine(static_cast<int>(Feature))); uint32_t NumBlocksInBBRange = 0; uint32_t NumBBRanges = 1; typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0; @@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, uint64_t BBF = FeatEnable.BBFreq ? 
readULEB128As<uint64_t>(Data, Cur, ULEBSizeErr) : 0; + uint32_t PostLinkBBFreq = + FeatEnable.PostLinkCfg + ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; // Branch probability llvm::SmallVector<PGOAnalysisMap::PGOBBEntry::SuccessorEntry, 2> @@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, for (uint64_t I = 0; I < SuccCount; ++I) { uint32_t BBID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); uint32_t BrProb = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); + uint32_t PostLinkFreq = + FeatEnable.PostLinkCfg + ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; + if (PGOAnalyses) - Successors.push_back({BBID, BranchProbability::getRaw(BrProb)}); + Successors.push_back( + {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq}); } } if (PGOAnalyses) - PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)}); + PGOBBEntries.push_back( + {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)}); } if (PGOAnalyses) diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index caf357e8..14c14f6 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -23,8 +23,8 @@ using namespace llvm; COFF::MachineTypes llvm::getMachineType(StringRef S) { // Flags must be a superset of Microsoft lib.exe /machine flags. return StringSwitch<COFF::MachineTypes>(S.lower()) - .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) - .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) + .Cases({"x64", "amd64"}, COFF::IMAGE_FILE_MACHINE_AMD64) + .Cases({"x86", "i386"}, COFF::IMAGE_FILE_MACHINE_I386) .Case("arm", COFF::IMAGE_FILE_MACHINE_ARMNT) .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8b75fbe..8530785 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,13 +1465,19 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) { // Write version and feature values. 
if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { - if (E.Version > 4) + if (E.Version > 5) WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " << static_cast<int>(E.Version) << "; encoding using the most recent version"; CBA.write(E.Version); - CBA.write(E.Feature); - SHeader.sh_size += 2; + SHeader.sh_size += 1; + if (E.Version < 5) { + CBA.write(static_cast<uint8_t>(E.Feature)); + SHeader.sh_size += 1; + } else { + CBA.write<uint16_t>(E.Feature, ELFT::Endianness); + SHeader.sh_size += 2; + } } auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature); bool MultiBBRangeFeatureEnabled = false; @@ -1556,11 +1562,15 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &PGOBBE : PGOBBEntries) { if (PGOBBE.BBFreq) SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq); + if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0)); if (PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size()); - for (const auto &[ID, BrProb] : *PGOBBE.Successors) { + for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(ID); SHeader.sh_size += CBA.writeULEB128(BrProb); + if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0)); } } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f8a84b0..e5e5fc2 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1886,7 +1886,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping( IO &IO, ELFYAML::BBAddrMapEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("Version", E.Version); - IO.mapOptional("Feature", E.Feature, Hex8(0)); + IO.mapOptional("Feature", E.Feature, Hex16(0)); IO.mapOptional("NumBBRanges", E.NumBBRanges); IO.mapOptional("BBRanges", E.BBRanges); } @@ -1920,6 +1920,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry>::mapping( IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapOptional("BBFreq", E.BBFreq); + IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); IO.mapOptional("Successors", E.Successors); } @@ -1929,6 +1930,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>:: assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("ID", E.ID); IO.mapRequired("BrProb", E.BrProb); + IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); } void MappingTraits<ELFYAML::GnuHashHeader>::mapping(IO &IO, diff --git a/llvm/lib/ObjectYAML/GOFFYAML.cpp b/llvm/lib/ObjectYAML/GOFFYAML.cpp index 60bc1f7..ecd7fb6 100644 --- a/llvm/lib/ObjectYAML/GOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/GOFFYAML.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace GOFFYAML { -Object::Object() {} +Object::Object() = default; } // namespace GOFFYAML diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 7290a86..6b7e980 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -537,7 +537,7 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name, Out << "*** IR Dump After " << PassID << " on " << Name << " ***\n" << After; } -IRChangedTester::~IRChangedTester() {} +IRChangedTester::~IRChangedTester() = default; void 
IRChangedTester::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (TestChanged != "") @@ -1566,7 +1566,7 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC); } -TimeProfilingPassesHandler::TimeProfilingPassesHandler() {} +TimeProfilingPassesHandler::TimeProfilingPassesHandler() = default; void TimeProfilingPassesHandler::registerCallbacks( PassInstrumentationCallbacks &PIC) { diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp index 1c52e35..f9fd4af 100644 --- a/llvm/lib/Remarks/RemarkFormat.cpp +++ b/llvm/lib/Remarks/RemarkFormat.cpp @@ -19,7 +19,7 @@ using namespace llvm::remarks; Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) { auto Result = StringSwitch<Format>(FormatStr) - .Cases("", "yaml", Format::YAML) + .Cases({"", "yaml"}, Format::YAML) .Case("bitstream", Format::Bitstream) .Default(Format::Unknown); diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fb6ff62..6f5d072 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -637,7 +637,7 @@ Context::Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this), LLVMIRBuilder(LLVMCtx, ConstantFolder()) {} -Context::~Context() {} +Context::~Context() = default; void Context::clear() { // TODO: Ideally we should clear only function-scope objects, and keep global diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp index 4a6b2fd..be4d1f1 100644 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -67,8 +67,8 @@ StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) { } SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) { return StringSwitch<SubsectionType>(Type) - .Cases("uleb128", "ULEB128", ULEB128) - .Cases("ntbs", "NTBS", NTBS) + .Cases({"uleb128", "ULEB128"}, ULEB128) + .Cases({"ntbs", "NTBS"}, NTBS) .Default(TYPE_NOT_FOUND); } StringRef AArch64BuildAttributes::getSubsectionTypeUnknownError() { diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index e21cf8e..e2645fa 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A, A.precision <= B.precision; } -constexpr RoundingMode APFloatBase::rmNearestTiesToEven; -constexpr RoundingMode APFloatBase::rmTowardPositive; -constexpr RoundingMode APFloatBase::rmTowardNegative; -constexpr RoundingMode APFloatBase::rmTowardZero; -constexpr RoundingMode APFloatBase::rmNearestTiesToAway; - /* A tight upper bound on number of parts required to hold the value pow(5, power) is diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4c..d859abd 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp index e376344..143e58a 100644 --- a/llvm/lib/Support/BranchProbability.cpp +++ b/llvm/lib/Support/BranchProbability.cpp @@ -20,8 
+20,6 @@ using namespace llvm; -constexpr uint32_t BranchProbability::D; - raw_ostream &BranchProbability::print(raw_ostream &OS) const { if (isUnknown()) return OS << "?%"; @@ -111,3 +109,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const { uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { return ::scale<0>(Num, D, N); } + +BranchProbability BranchProbability::pow(unsigned N) const { + BranchProbability Res = BranchProbability::getOne(); + for (unsigned I = 0; I < N; ++I) + Res *= *this; + return Res; +} diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 9491ec0..dab8bee 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -382,7 +382,7 @@ public: RegisteredSubCommands.erase(sub); } - iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> + iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> getRegisteredSubcommands() { return make_range(RegisteredSubCommands.begin(), RegisteredSubCommands.end()); @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) @@ -2830,7 +2830,7 @@ StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) { return Sub.OptionsMap; } -iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> +iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> cl::getRegisteredSubcommands() { return GlobalParser->getRegisteredSubcommands(); } diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 98153647..3bfae14 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c0..61566d3 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. 
class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index b6a2f8a..2e8fba8 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -17,11 +17,6 @@ using namespace llvm; -// MSVC emits references to this into the translation units which reference it. -#ifndef _MSC_VER -constexpr size_t StringRef::npos; -#endif - // strncasecmp() is not available on non-POSIX systems, so define an // alternative function here. static int ascii_strncasecmp(StringRef LHS, StringRef RHS) { diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096..b08f508 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp index 6f8e091..8f0d24e 100644 --- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -251,10 +251,10 @@ constexpr const char *const HangulSyllables[][3] = { // Unicode 15.0 // 3.12 Conjoining Jamo Behavior Common constants -constexpr const char32_t SBase = 0xAC00; -constexpr const uint32_t LCount = 19; -constexpr const uint32_t VCount = 21; -constexpr const uint32_t TCount = 28; +constexpr char32_t SBase = 0xAC00; +constexpr uint32_t LCount = 19; +constexpr uint32_t VCount = 21; +constexpr uint32_t TCount = 28; static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column) { diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 648d6a5..da68994 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -421,8 +421,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) { return true; } - if (FilesToRemove == NULL) + if (FilesToRemove == NULL) { FilesToRemove = new std::vector<std::string>; + std::atexit([]() { + delete FilesToRemove; + FilesToRemove = NULL; + }); + } FilesToRemove->push_back(std::string(Filename)); diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 07b9989..d6f27fb 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -61,17 +61,6 @@ using namespace llvm; -constexpr raw_ostream::Colors raw_ostream::BLACK; -constexpr raw_ostream::Colors raw_ostream::RED; -constexpr raw_ostream::Colors raw_ostream::GREEN; -constexpr raw_ostream::Colors raw_ostream::YELLOW; -constexpr raw_ostream::Colors raw_ostream::BLUE; -constexpr raw_ostream::Colors raw_ostream::MAGENTA; -constexpr raw_ostream::Colors raw_ostream::CYAN; -constexpr raw_ostream::Colors raw_ostream::WHITE; -constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR; -constexpr raw_ostream::Colors raw_ostream::RESET; - raw_ostream::~raw_ostream() { // raw_ostream's subclasses should take care to flush the buffer // in their destructors. 
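The Support changes above are mechanical C++ modernizations: empty constructors and destructors become = default, typedef becomes using, and the out-of-line definitions of constexpr static data members (StringRef::npos, BranchProbability::D, the raw_ostream colors, the APFloatBase rounding modes) can simply be deleted because, since C++17, such members are implicitly inline and the in-class declaration is already a definition. A minimal standalone sketch of the three idioms, using a hypothetical Table type that is not from the tree:

#include <cstddef>
#include <vector>

struct Table {
  // Implicitly inline since C++17, so no out-of-line
  // "constexpr std::size_t Table::npos;" definition is needed in a .cpp
  // file, even when npos is ODR-used (e.g. its address is taken).
  static constexpr std::size_t npos = ~std::size_t(0);

  using HandleList = std::vector<void *>; // preferred over typedef

  Table() = default;  // preferred over "Table() {}"
  ~Table() = default; // preferred over "~Table() {}"
};

int main() {
  const std::size_t *P = &Table::npos; // ODR-use of npos
  return (*P == Table::npos) ? 0 : 1;
}

Under C++14 the &Table::npos line would fail to link without the out-of-line definition; that is exactly why those definitions (and the MSVC-specific #ifndef around StringRef::npos) existed in the first place.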
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index 3b510d3..f716317 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() { raw_socket_stream::raw_socket_stream(int SocketFD) : raw_fd_stream(SocketFD, true) {} -raw_socket_stream::~raw_socket_stream() {} +raw_socket_stream::~raw_socket_stream() = default; Expected<std::unique_ptr<raw_socket_stream>> raw_socket_stream::createConnectedUnix(StringRef SocketPath) { diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 30eae6e..e8e6469 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -682,8 +682,10 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("instances", tgtok::XInstances) .Case("substr", tgtok::XSubstr) .Case("find", tgtok::XFind) - .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. - .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. + .Cases({"setdagop", "setop"}, + tgtok::XSetDagOp) // !setop is deprecated. + .Cases({"getdagop", "getop"}, + tgtok::XGetDagOp) // !getop is deprecated. .Case("setdagopname", tgtok::XSetDagOpName) .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7e03b97..45b7120 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -370,6 +370,22 @@ SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; } +SVEStackAllocations AArch64PrologueEpilogueCommon::getSVEStackAllocations( + SVEFrameSizes const &SVE) { + StackOffset AfterZPRs = SVE.ZPR.LocalsSize; + StackOffset BeforePPRs = SVE.ZPR.CalleeSavesSize + SVE.PPR.CalleeSavesSize; + StackOffset AfterPPRs = {}; + if (SVELayout == SVEStackLayout::Split) { + BeforePPRs = SVE.PPR.CalleeSavesSize; + // If there are no ZPR CSRs, place all local allocations after the ZPRs. + if (SVE.ZPR.CalleeSavesSize) + AfterPPRs += SVE.PPR.LocalsSize + SVE.ZPR.CalleeSavesSize; + else + AfterZPRs += SVE.PPR.LocalsSize; // Group allocation of locals. + } + return {BeforePPRs, AfterPPRs, AfterZPRs}; +} + struct SVEPartitions { struct { MachineBasicBlock::iterator Begin, End; @@ -687,16 +703,19 @@ void AArch64PrologueEmitter::emitPrologue() { // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); + auto [PPR, ZPR] = getSVEStackFrameSizes(); + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); + MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // // Windows unwind requires that the scalable size is a multiple of 16; // that's handled when the callee-saved size is computed. 
- auto SaveSize = - StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + - StackOffset::getFixed(FixedObject); + auto SaveSize = SVEAllocs.BeforePPRs + StackOffset::getFixed(FixedObject); allocateStackSpace(PrologueBeginI, 0, SaveSize, false, StackOffset{}, /*FollowupAllocs=*/true); NumBytes -= FixedObject; @@ -764,12 +783,11 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); + SVEAllocs.AfterZPRs += NonSVELocalsSize; + StackOffset CFAOffset = StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; - MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; // Allocate space for the callee saves and PPR locals (if any). if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -780,31 +798,23 @@ void AArch64PrologueEmitter::emitPrologue() { if (EmitAsyncCFI) emitCalleeSavedSVELocations(AfterSVESavesI); - StackOffset AllocateBeforePPRs = SVECalleeSavesSize; - StackOffset AllocateAfterPPRs = PPR.LocalsSize; - if (SVELayout == SVEStackLayout::Split) { - AllocateBeforePPRs = PPR.CalleeSavesSize; - AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; - } - allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, + allocateStackSpace(PPRRange.Begin, 0, SVEAllocs.BeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || AllocateAfterPPRs || - ZPR.LocalsSize || NonSVELocalsSize); - CFAOffset += AllocateBeforePPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterPPRs || + SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, + allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPR.LocalsSize || - NonSVELocalsSize); - CFAOffset += AllocateAfterPPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.AfterPPRs; } else { assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); - // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been - // allocated (and separate PPR locals are not supported, all SVE locals, - // both PPR and ZPR, are within the ZPR locals area). - assert(!PPR.LocalsSize && "Unexpected PPR locals!"); - CFAOffset += SVECalleeSavesSize; + // Note: With CalleeSavesAboveFrameRecord, the SVE CS (BeforePPRs) have + // already been allocated. PPR locals (included in AfterPPRs) are not + // supported (note: this is asserted above). + CFAOffset += SVEAllocs.BeforePPRs; } // Allocate space for the rest of the frame including ZPR locals. Align the @@ -815,9 +825,9 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the // correct value here, as NumBytes also includes padding bytes, which // shouldn't be counted here. - allocateStackSpace( - AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, - EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + allocateStackSpace(AfterSVESavesI, RealignmentPadding, SVEAllocs.AfterZPRs, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. 
It's whatever the value of the @@ -1472,27 +1482,26 @@ void AArch64EpilogueEmitter::emitEpilogue() { assert(NumBytes >= 0 && "Negative stack allocation size!?"); StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; - StackOffset SVEStackSize = - SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; - MachineBasicBlock::iterator RestoreEnd = PPRRange.End; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { - StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them // explicitly. if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - SVELocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } else if (AFI->hasSVEStackSize()) { // If we have stack realignment or variable-sized objects we must use the FP // to restore SVE callee saves (as there is an unknown amount of @@ -1524,46 +1533,33 @@ void AArch64EpilogueEmitter::emitEpilogue() { emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - - if (SVECalleeSavesSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= NonSVELocals; - NumBytes = 0; - } - - if (ZPR.LocalsSize) { - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPR.LocalsSize; + auto NonSVELocals = StackOffset::getFixed(NumBytes); + auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + + SVEAllocs.totalSize(); + + if (SVECalleeSavesSize || SVELayout == SVEStackLayout::Split) { + // Deallocate non-SVE locals now. This is needed to reach the SVE callee + // saves, but may also allow combining stack hazard bumps for split SVE. 
+ SVEAllocs.AfterZPRs += NonSVELocals; + NumBytes -= NonSVELocals.getFixed(); } - - StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; - if (SVELayout == SVEStackLayout::Split && - (PPR.LocalsSize || ZPR.CalleeSavesSize)) { - assert(PPRRange.Begin == ZPRRange.End && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, - PPR.LocalsSize + ZPR.CalleeSavesSize, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; - SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; - } - - // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: - if (SVECalleeSavesToDealloc) - emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesToDealloc, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + // To deallocate the SVE stack adjust by the allocations in reverse. + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterZPRs; + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterPPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterPPRs; + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); } if (EmitCFI) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index bccadda..6e0e283 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -33,6 +33,11 @@ struct SVEFrameSizes { } PPR, ZPR; }; +struct SVEStackAllocations { + StackOffset BeforePPRs, AfterPPRs, AfterZPRs; + StackOffset totalSize() const { return BeforePPRs + AfterPPRs + AfterZPRs; } +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, @@ -66,6 +71,7 @@ protected: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; SVEFrameSizes getSVEStackFrameSizes() const; + SVEStackAllocations getSVEStackAllocations(SVEFrameSizes const &); MachineFunction &MF; MachineBasicBlock &MBB; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index fede586..5b5565a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -308,9 +308,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits; } -bool AArch64TTIImpl::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; @@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_vector_extract_last_active: + if 
(ST->isSVEorStreamingSVEAvailable()) { + auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); + // This should turn into chained clastb instructions. + return LegalCost; + } + break; default: break; } @@ -2220,7 +2227,7 @@ static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, return std::nullopt; } -template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> +template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc> static std::optional<Instruction *> instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp) { @@ -6650,10 +6657,15 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( Ops.push_back(&Ext->getOperandUse(0)); Ops.push_back(&Op); - if (isa<SExtInst>(Ext)) + if (isa<SExtInst>(Ext)) { NumSExts++; - else + } else { NumZExts++; + // A zext(a) is also a sext(zext(a)), if we take more than 2 steps. + if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 < + I->getType()->getScalarSizeInBits()) + NumSExts++; + } continue; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index fe2e849..b39546a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -84,7 +84,7 @@ public: const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index cd8b249..67042b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -371,13 +371,13 @@ public: class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57..dad94b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -96,7 +96,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f..8669978 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1555,7 +1555,7 @@ private: AMDGPU::ClusterDimsAttr Attr; - static constexpr const char 
AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9ce1224..0c97741 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -221,12 +221,21 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); + Register VCCReg = I.getOperand(1).getReg(); + MachineInstr *Cmp; + + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); + } else { + Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) + .addReg(VCCReg) + .addReg(VCCReg); + } - unsigned CmpOpc = - STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; - MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) - .addReg(I.getOperand(1).getReg()) - .addImm(0); if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589e..d7d0292 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector<unsigned, BitsPerField, std::bitset<BitsPerField * NumFields>>; @@ -82,12 +84,12 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +110,13 @@ private: MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -130,38 +135,43 @@ private: /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. 
- MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); + return true; } + // Record previous mode into high 8 bits of the immediate. + int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. 
- MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. + resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb..844649ebb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab825..a3be0f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ private: FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..907f830 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..dc8fa7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { MI.eraseFromParent(); } 
+void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { + auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); + auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), + {ResLo.getReg(0), ResHi.getReg(0)}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -616,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -688,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -804,6 +833,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } break; } + case UnpackAExt: + return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast<GAnyLoad>(MI)); } @@ -837,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: @@ -960,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1092,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); @@ -1120,7 +1154,8 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(SgprRB_S32); Op.setReg(NewDst); - B.buildTrunc(Reg, NewDst); + if (!MRI.use_empty(Reg)) + B.buildTrunc(Reg, NewDst); break; } case InvalidMapping: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..e7598f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 
= {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,9 +122,11 @@ private: void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); + void lowerUnpackAExt(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..1e5885a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -202,7 +202,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + + addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_UADDE, G_USUBE}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -901,14 +913,28 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..e6df5d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,13 +221,15 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, WidenLoad, - WidenMMOToS32 + WidenMMOToS32, + UnpackAExt }; enum FastRulesTypes { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75a94ac..b87b54f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -816,7 +816,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) @@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); + if (EnableUniformIntrinsicCombine) + addPass(AMDGPUUniformIntrinsicCombinePass()); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d5..ddf9a24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -225,8 +225,8 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); + DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); Type *RetTy = F.getReturnType(); Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd..ded2f5a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -54,7 +54,7 @@ public: bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. 
static std::optional<unsigned> sgprNumber(Register Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781f..f357981 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -183,7 +183,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -217,7 +217,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb..28b4da8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d950131..65dce74 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. - let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b34ab2a..8bb2808 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { // (ballot 0) -> 0 diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced..b7fa899 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + // On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + auto applyPendingXcntGroup = [this](unsigned E) { + unsigned LowerBound = getScoreLB(X_CNT); + applyWaitcnt(X_CNT, 0); + PendingEvents |= (1 << E); + setScoreLB(X_CNT, LowerBound); + }; + // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { + if (hasPendingEvent(VMEM_GROUP)) + applyPendingXcntGroup(VMEM_GROUP); + else + applyWaitcnt(X_CNT, 0); + return; + } // If we have a pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads return in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + !hasPendingEvent(STORE_CNT)) { + if (hasPendingEvent(SMEM_GROUP)) + applyPendingXcntGroup(SMEM_GROUP); + else + applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + return; + } applyWaitcnt(X_CNT, Wait.XCnt); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21..45f5919 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6153,7 +6153,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6215,8 +6215,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -10618,6 +10618,42 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid.
If there are no intervening SCC conflicts, delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10637,19 +10673,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!Def || Def->getParent() != CmpInstr.getParent()) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
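The refactor above hoists the duplicated SCC clobber/kill scan out of optimizeCompareInstr into the optimizeSCC helper. The invariant being checked is narrow: the redundant redefinition of SCC may only be deleted if no instruction between the valid definition and the redefinition writes SCC, and any intervening kill flag has to be cleared because SCC now stays live past that point. A host-side model of that scan, using a hypothetical Instr type rather than LLVM's MachineInstr:

#include <list>

struct Instr {
  bool ModifiesSCC = false; // instruction writes SCC
  bool KillsSCC = false;    // instruction is marked as SCC's last use
};

// Mirrors the loop in optimizeSCC: fail on any intervening clobber,
// otherwise report the instruction whose kill flag must be cleared
// (the analogue of clearRegisterKills above).
static bool sccFreeBetween(std::list<Instr>::iterator Begin,
                           std::list<Instr>::iterator End,
                           Instr *&LastKill) {
  LastKill = nullptr;
  for (auto It = Begin; It != End; ++It) {
    if (It->ModifiesSCC)
      return false; // SCC is redefined in between: the fold is unsafe
    if (It->KillsSCC)
      LastKill = &*It; // kill flag becomes stale once the redef is deleted
  }
  return true;
}

int main() {
  std::list<Instr> Range(3);
  Range.back().KillsSCC = true;
  Instr *Kill = nullptr;
  return (sccFreeBetween(Range.begin(), Range.end(), Kill) && Kill) ? 0 : 1;
}

The payoff of the refactor is that both call sites in optimizeCompareInstr now share one implementation of this scan instead of two slightly divergent copies.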
@@ -10660,24 +10683,12 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); return true; }; @@ -10755,21 +10766,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fdba454..6b06534 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) + if (!Subtarget->hasVFP2Base()) { setAllExpand(MVT::f32); - if (!Subtarget->hasFP64()) + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f32, Legal); + } + if (!Subtarget->hasFP64()) { setAllExpand(MVT::f64); + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f64, Legal); + } } if (Subtarget->hasFullFP16()) { @@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall); } // Strict floating-point comparisons need custom lowering. 
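The ARM changes in this region mark strict (constrained) FP operations Legal, LibCall, Promote or Custom instead of leaving them to the default handling. What the strict nodes protect is sensitivity to the dynamic rounding mode, held in FPSCR.RM on ARM and registered via the getRoundingControlRegisters override added further down in this patch: a strict add may not be constant-folded or moved across a mode change. A standalone C++ illustration of the behavior at stake, assuming the host toolchain honors FENV_ACCESS (not derived from the patch itself):

#include <cfenv>
#include <cstdio>
#pragma STDC FENV_ACCESS ON

int main() {
  volatile float A = 1.0f;
  volatile float B = 0x1p-25f; // less than half an ULP of 1.0f
  std::fesetround(FE_UPWARD);
  float Up = A + B;            // rounds up, to 1.0f + 2^-23
  std::fesetround(FE_DOWNWARD);
  float Down = A + B;          // rounds down, to exactly 1.0f
  std::fesetround(FE_TONEAREST);
  std::printf("Up=%a Down=%a\n", Up, Down); // two different values
  // Folding both additions into one result would be incorrect, which is
  // what the STRICT_* nodes communicate to the backend.
  return Up == Down; // 0 (success) when the rounding modes were respected
}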
@@ -1333,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } // FP16 ops often need to be promoted to call lib functions + // clang-format off if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FTAN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FEXP10, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::LROUND, MVT::f16, Expand); - - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS, + ISD::FASIN, ISD::FATAN, ISD::FATAN2, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN, + ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH, + ISD::STRICT_FTANH, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, + ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, + ISD::STRICT_FTAN}) { + setOperationAction(Op, MVT::f16, Promote); + } + + // Round-to-integer ops need custom lowering for fp16, as Promote doesn't work + // because the result type is integer.
+ for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); + + for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, + ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR, + ISD::FCEIL, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL}) { + setOperationAction(Op, MVT::f16, Legal); + } + // clang-format on } if (Subtarget->hasNEON()) { @@ -10725,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerCMP(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert((Op.getOperand(1).getValueType() == MVT::f16 || + Op.getOperand(1).getValueType() == MVT::bf16) && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } @@ -22071,6 +22109,11 @@ bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( ScalarTy->isIntegerTy(32)); } +ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters() const { + static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM}; + return RCRegs; +} + Value *ARMTargetLowering::createComplexDeinterleavingIR( IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 357d2c5..bf3438b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -1009,6 +1009,8 @@ class VectorType; bool isUnsupportedFloatingType(EVT VT) const; + ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue Flags, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 10d4cd5..f7176a6 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>; // An 'fmul' node with a single use. let HasOneUse = 1 in -def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>; +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>; // An 'fadd' node which checks for single non-hazardous use. -def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; // An 'fsub' node which checks for single non-hazardous use. 
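 // (any_fadd, any_fmul and the other any_* operators used in these fragments
 // are PatFrags that match both the plain node and its strict_ counterpart,
 // so one set of patterns covers regular and constrained FP alike.)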
-def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 6771106..e2cc97b 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]>{ // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
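(The Uses = [FPSCR_RM] on these defs records that results depend on the
dynamic rounding mode, and mayRaiseFPException marks them as possibly setting
the cumulative FPSCR flags; both are what make the any_* patterns safe for
constrained FP. A hedged illustration of the rounding-mode dependence,
assuming an fenv-aware build such as clang's -frounding-math:)

    #include <cfenv>
    #include <cstdio>

    int main() {
      volatile float a = 1.0f, b = 0x1p-40f; // sum is inexact in float
      std::fesetround(FE_UPWARD);
      volatile float up = a + b;             // rounds up to 1 + 2^-23
      std::fesetround(FE_TONEAREST);
      volatile float nr = a + b;             // rounds to 1.0
      std::printf("upward=%.9g nearest=%.9g\n", up, nr);
      return 0;
    }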
@@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, + [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>, + [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, + [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst<string op, bits<2> opc, int CC> { @@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>; defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; -multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { +multiclass vmaxmin_inst<string op, bit opc, PatFrags SD> { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, @@ -610,8 +610,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; -defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; +defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>; +defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", - [(set DPR:$Dd, (fpextend SPR:$Sm))]>, + [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>, Sched<[WriteFPCVT]> { // Instruction operands. bits<5> Dd; @@ -766,7 +766,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", - [(set SPR:$Sd, (fpround DPR:$Dm))]>, + [(set SPR:$Sd, (any_fpround DPR:$Dm))]>, Sched<[WriteFPCVT]> { // Instruction operands. 
bits<5> Sd; @@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), +def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))), (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f16 (fpround SPR:$Sm)), +def : FP16Pat<(f16 (any_fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>; -def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; @@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))), +def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))), (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), @@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f16 (fpround DPR:$Dm)), +def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)), (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 
(f64 DPR:$a)), @@ -1007,41 +1007,41 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Predicates = [HasFPARMv8] in { let Predicates = [HasFullFP16] in { - def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)), GPR)>; } - def : Pat<(i32 (fp_to_sint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_sint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SS") SPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_uint (node SPR:$a))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"US") SPR:$a), GPR)>; } let Predicates = [HasFPARMv8, HasDPVFP] in { - def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SD") DPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"UD") DPR:$a), GPR)>; } } -defm VCVTA : vcvt_inst<"a", 0b00, fround>; +defm VCVTA : vcvt_inst<"a", 0b00, any_fround>; defm VCVTN : vcvt_inst<"n", 0b01>; -defm VCVTP : vcvt_inst<"p", 0b10, fceil>; -defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; +defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>; +defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>; def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), @@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm<string opc, bits<2> rm, SDPatternOperator node = null_frag> { @@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>; -defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; -defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; -defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>; +defm VRINTN : vrint_inst_anpm<"n", 0b01, any_froundeven>; +defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>; +defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>; + let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", - [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", - [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - [(set (f16 HPR:$Sd), (fsqrt 
(f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { @@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (sint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; let mayRaiseFPException = 1 in @@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (uint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)), (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; // FP -> Int: @@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; @@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : 
VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), @@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; @@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; @@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), @@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; @@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma x, y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma (fneg x), y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, 
SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fneg (fma x, y, z)) -> (vfnma z, x, y) -def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), +def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), +def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), +def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fma x, y, (fneg z)) -> (vfnms z, x, y)) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) -def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), +def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), +def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), +def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index ebfa593..bf7c962f 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -47,9 +47,7 @@ SDValue 
ARMSelectionDAGInfo::EmitSpecializedLibcall( // Only use a specialized AEABI function if the default version of this // Libcall is an AEABI function. - if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) - return SDValue(); - + // // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be // able to translate memset to memclr and use the value to index the function // name array. @@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( } AEABILibcall; switch (LC) { case RTLIB::MEMCPY: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy) + return SDValue(); + AEABILibcall = AEABI_MEMCPY; break; case RTLIB::MEMMOVE: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove) + return SDValue(); + AEABILibcall = AEABI_MEMMOVE; break; case RTLIB::MEMSET: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset) + return SDValue(); + AEABILibcall = AEABI_MEMSET; if (isNullConstant(Src)) AEABILibcall = AEABI_MEMCLR; diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f60660b..1bb670d 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser { VPTState.CurPosition = ~0U; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Note(L, Msg, Range); } - bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Warning(L, Msg, Range); } - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Error(L, Msg, Range); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index e5b4f6e..08f196b 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -884,13 +884,13 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t4}", CSKY::R20) .Case("{t5}", CSKY::R21) .Case("{t6}", CSKY::R22) - .Cases("{t7}", "{fp}", CSKY::R23) - .Cases("{t8}", "{top}", CSKY::R24) - .Cases("{t9}", "{bsp}", CSKY::R25) + .Cases({"{t7}", "{fp}"}, CSKY::R23) + .Cases({"{t8}", "{top}"}, CSKY::R24) + .Cases({"{t9}", "{bsp}"}, CSKY::R25) .Case("{r26}", CSKY::R26) .Case("{r27}", CSKY::R27) - .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28) - .Cases("{tb}", "{rtb}", CSKY::R29) + .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28) + .Cases({"{tb}", "{rtb}"}, CSKY::R29) .Case("{svbr}", CSKY::R30) .Case("{tls}", CSKY::R31) .Default(CSKY::NoRegister); @@ -907,38 +907,38 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. 
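   // (Note: the StringSwitch::Cases overload used below takes the alternative
   // spellings as a single braced list, e.g.
   //   .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32)
   // rather than as separate variadic string arguments.)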
if (Subtarget.useHardFloat()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{fr0}", "{vr0}", CSKY::F0_32) - .Cases("{fr1}", "{vr1}", CSKY::F1_32) - .Cases("{fr2}", "{vr2}", CSKY::F2_32) - .Cases("{fr3}", "{vr3}", CSKY::F3_32) - .Cases("{fr4}", "{vr4}", CSKY::F4_32) - .Cases("{fr5}", "{vr5}", CSKY::F5_32) - .Cases("{fr6}", "{vr6}", CSKY::F6_32) - .Cases("{fr7}", "{vr7}", CSKY::F7_32) - .Cases("{fr8}", "{vr8}", CSKY::F8_32) - .Cases("{fr9}", "{vr9}", CSKY::F9_32) - .Cases("{fr10}", "{vr10}", CSKY::F10_32) - .Cases("{fr11}", "{vr11}", CSKY::F11_32) - .Cases("{fr12}", "{vr12}", CSKY::F12_32) - .Cases("{fr13}", "{vr13}", CSKY::F13_32) - .Cases("{fr14}", "{vr14}", CSKY::F14_32) - .Cases("{fr15}", "{vr15}", CSKY::F15_32) - .Cases("{fr16}", "{vr16}", CSKY::F16_32) - .Cases("{fr17}", "{vr17}", CSKY::F17_32) - .Cases("{fr18}", "{vr18}", CSKY::F18_32) - .Cases("{fr19}", "{vr19}", CSKY::F19_32) - .Cases("{fr20}", "{vr20}", CSKY::F20_32) - .Cases("{fr21}", "{vr21}", CSKY::F21_32) - .Cases("{fr22}", "{vr22}", CSKY::F22_32) - .Cases("{fr23}", "{vr23}", CSKY::F23_32) - .Cases("{fr24}", "{vr24}", CSKY::F24_32) - .Cases("{fr25}", "{vr25}", CSKY::F25_32) - .Cases("{fr26}", "{vr26}", CSKY::F26_32) - .Cases("{fr27}", "{vr27}", CSKY::F27_32) - .Cases("{fr28}", "{vr28}", CSKY::F28_32) - .Cases("{fr29}", "{vr29}", CSKY::F29_32) - .Cases("{fr30}", "{vr30}", CSKY::F30_32) - .Cases("{fr31}", "{vr31}", CSKY::F31_32) + .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32) + .Cases({"{fr1}", "{vr1}"}, CSKY::F1_32) + .Cases({"{fr2}", "{vr2}"}, CSKY::F2_32) + .Cases({"{fr3}", "{vr3}"}, CSKY::F3_32) + .Cases({"{fr4}", "{vr4}"}, CSKY::F4_32) + .Cases({"{fr5}", "{vr5}"}, CSKY::F5_32) + .Cases({"{fr6}", "{vr6}"}, CSKY::F6_32) + .Cases({"{fr7}", "{vr7}"}, CSKY::F7_32) + .Cases({"{fr8}", "{vr8}"}, CSKY::F8_32) + .Cases({"{fr9}", "{vr9}"}, CSKY::F9_32) + .Cases({"{fr10}", "{vr10}"}, CSKY::F10_32) + .Cases({"{fr11}", "{vr11}"}, CSKY::F11_32) + .Cases({"{fr12}", "{vr12}"}, CSKY::F12_32) + .Cases({"{fr13}", "{vr13}"}, CSKY::F13_32) + .Cases({"{fr14}", "{vr14}"}, CSKY::F14_32) + .Cases({"{fr15}", "{vr15}"}, CSKY::F15_32) + .Cases({"{fr16}", "{vr16}"}, CSKY::F16_32) + .Cases({"{fr17}", "{vr17}"}, CSKY::F17_32) + .Cases({"{fr18}", "{vr18}"}, CSKY::F18_32) + .Cases({"{fr19}", "{vr19}"}, CSKY::F19_32) + .Cases({"{fr20}", "{vr20}"}, CSKY::F20_32) + .Cases({"{fr21}", "{vr21}"}, CSKY::F21_32) + .Cases({"{fr22}", "{vr22}"}, CSKY::F22_32) + .Cases({"{fr23}", "{vr23}"}, CSKY::F23_32) + .Cases({"{fr24}", "{vr24}"}, CSKY::F24_32) + .Cases({"{fr25}", "{vr25}"}, CSKY::F25_32) + .Cases({"{fr26}", "{vr26}"}, CSKY::F26_32) + .Cases({"{fr27}", "{vr27}"}, CSKY::F27_32) + .Cases({"{fr28}", "{vr28}"}, CSKY::F28_32) + .Cases({"{fr29}", "{vr29}"}, CSKY::F29_32) + .Cases({"{fr30}", "{vr30}"}, CSKY::F30_32) + .Cases({"{fr31}", "{vr31}"}, CSKY::F31_32) .Default(CSKY::NoRegister); if (FReg != CSKY::NoRegister) { assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg"); diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp index 15def36..b6bbb20 100644 --- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp +++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp @@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { emitGlobalConstant(GV->getDataLayout(), GV->getInitializer()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXAsmPrinter() { 
RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget()); } diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index bcf8440..84b1a31 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -53,7 +53,8 @@ using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTarget() { RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget()); auto *PR = PassRegistry::getPassRegistry(); initializeDXILIntrinsicExpansionLegacyPass(*PR); diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 9a14c01..62ad014 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) { static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetMC() { Target &T = getTheDirectXTarget(); RegisterMCAsmInfo<DirectXMCAsmInfo> X(T); TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo); diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp index ae01626..934bd1b 100644 --- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp @@ -24,7 +24,8 @@ Target &getTheDirectXTarget() { using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetInfo() { RegisterTarget<Triple::dxil, /*HasJIT=*/false> X( getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL"); } diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index 3b810d0..79863e1 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -34,7 +34,7 @@ class HexagonCopyHoisting : public MachineFunctionPass { public: static char ID; - HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {} + HexagonCopyHoisting() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon Copy Hoisting"; } @@ -56,8 +56,8 @@ public: void moveCopyInstr(MachineBasicBlock *DestBB, std::pair<Register, Register> Key, MachineInstr *MI); - MachineFunction *MFN; - MachineRegisterInfo *MRI; + MachineFunction *MFN = nullptr; + MachineRegisterInfo *MRI = nullptr; std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>> CopyMIList; }; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index f4e36fa7..e661c94 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -26,6 +26,7 @@ def tc_20a4bbec : InstrItinClass; def tc_227864f7 : InstrItinClass; def tc_257f6f7c : InstrItinClass; def tc_26a377fe : InstrItinClass; +def tc_2a698a03 : InstrItinClass; def tc_2b4c548e : InstrItinClass; def tc_2c745bb8 : InstrItinClass; def tc_2d4051cd : InstrItinClass; @@ -52,6 +53,7 @@ def tc_561aaa58 : 
InstrItinClass; def tc_56c4f9fe : InstrItinClass; def tc_56e64202 : InstrItinClass; def tc_58d21193 : InstrItinClass; +def tc_57a4709c : InstrItinClass; def tc_5bf8afbb : InstrItinClass; def tc_5cdf8c84 : InstrItinClass; def tc_61bf7c03 : InstrItinClass; @@ -220,6 +222,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -356,6 +363,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -812,6 +824,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -948,6 +965,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1404,6 +1426,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -1540,6 +1567,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1996,6 +2028,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2132,6 +2169,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 
7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -2588,6 +2630,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2724,6 +2771,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3180,6 +3232,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3316,6 +3373,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3772,6 +3834,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3908,6 +3975,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -4364,6 +4436,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, 
SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -4500,6 +4577,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -4956,6 +5038,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5092,6 +5179,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -5548,6 +5640,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5684,6 +5781,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6140,6 +6242,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6276,6 +6383,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6732,6 +6844,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, 
SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6868,6 +6985,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -7324,6 +7446,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -7460,6 +7587,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index f8f1c2a..b188134 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -29939,6 +29939,58 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vabs_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vabs($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vabs($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vabs_sf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -31302,6 +31354,21 @@ let isPseudo = 1; let 
isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_valign4 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), +"$Vd32 = valign4($Vu32,$Vv32,$Rt8)", +tc_57a4709c, TypeCVI_VA>, Enc_a30110, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b0; +let Inst{31-24} = 0b00011000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_valignb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), @@ -32583,6 +32650,32 @@ let isCVI = 1; let hasHvxTmp = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_bf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32), +"$Vd32.bf = $Vuu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_f8_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.f8 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_h_hf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32596,6 +32689,19 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_h_hf_rnd : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.h = $Vu32.hf:rnd", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_hf_h : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32635,6 +32741,71 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_qf16_f8 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.qf16 = $Vu32.f8", +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.hf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf32_sf : HInst< 
+(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.sf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_sf_qf32 : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -33720,6 +33891,122 @@ let isHVXALU2SRC = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_veqhf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqhf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqsf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_xor 
: HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} def V6_veqw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -34538,6 +34825,58 @@ let Inst{31-24} = 0b00011110; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vilog2_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vinsertwr : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, IntRegs:$Rt32), @@ -37170,6 +37509,58 @@ let isCVI = 1; let isHVXALU = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vneg_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let 
Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vnormamth : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 23f4b3a..c11483b 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3830,6 +3830,122 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2), // V81 HVX Instructions. +def: Pat<(int_hexagon_V6_vabs_qf16_hf HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_hf_128B HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16 HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16_128B HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32 HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32_128B HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf_128B HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valign4_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32 HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32_128B HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16 HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16_128B HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd_128B HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8 HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8_128B HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_hf HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_hf_128B HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16 HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, 
UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16_128B HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32 HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32_128B HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf_128B HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: 
Pat<(int_hexagon_V6_vilog2_hf HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_hf_128B HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16 HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16_128B HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32 HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32_128B HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf_128B HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf_128B HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16 HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16_128B HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32 HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32_128B HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf_128B HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp index 93418f7..a10c937 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -34,13 +34,13 @@ STATISTIC(HexagonNumStoreAbsConversions, namespace { class HexagonGenMemAbsolute : public MachineFunctionPass { - const HexagonInstrInfo *TII; - MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; + const HexagonInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; public: static char ID; - HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {} + HexagonGenMemAbsolute() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon Generate Load/Store Set Absolute Address Instruction"; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 7ee280d..eadf020 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1815,7 +1815,7 @@ 
struct WeightedLeaf { int Weight; int InsertionOrder; - WeightedLeaf() {} + WeightedLeaf() = default; WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) : Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) { diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 85ce944..e40dbd2 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in { (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>; } +multiclass FloatClass<SDPatternOperator IntOp, InstHexagon MI, + PatFrag RegPred> { + let AddedComplexity = 100 in { + def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>; + def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>; + } +} + +defm : FloatClass<int_hexagon_F2_sfclass, F2_sfclass, F32>; +defm : FloatClass<int_hexagon_F2_dfclass, F2_dfclass, F64>; + def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I), (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 1637b91..d19920c 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -612,6 +612,9 @@ let Predicates = [UseHVX] in { (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ32 (trunc HVI32:$Vs)), (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; + def: Pat<(VecQ16 (trunc HWI32:$Vss)), + (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))), + (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>; } let Predicates = [UseHVX] in { diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index b9cdd6a..ce2de75 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -544,7 +544,7 @@ int HexagonSubtarget::updateLatency(MachineInstr &SrcInst, if (!hasV60Ops()) return Latency; - auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo()); + const HexagonInstrInfo &QII = *getInstrInfo(); // BSB scheduling. 
if (QII.isHVXVec(SrcInst) || useBSBScheduling()) Latency = (Latency + 1) >> 1; diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp index 71bdfc66..5a85f34 100644 --- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -43,7 +43,7 @@ namespace { class HexagonTfrCleanup : public MachineFunctionPass { public: static char ID; - HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {} + HexagonTfrCleanup() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -52,8 +52,8 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; private: - const HexagonInstrInfo *HII; - const TargetRegisterInfo *TRI; + const HexagonInstrInfo *HII = nullptr; + const TargetRegisterInfo *TRI = nullptr; typedef DenseMap<unsigned, uint64_t> ImmediateMap; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 690dd73..e86b21c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))), // FP Rounding let Predicates = [HasBasicF, IsLA64] in { def : PatFpr<frint, FRINT_S, FPR32>; +def : PatFpr<flog2, FLOGB_S, FPR32>; } // Predicates = [HasBasicF, IsLA64] let Predicates = [HasBasicF, IsLA32] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index daefbaa..2e88254 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; // FP Rounding let Predicates = [HasBasicD, IsLA64] in { def : PatFpr<frint, FRINT_D, FPR64>; +def : PatFpr<flog2, FLOGB_D, FPR64>; } // Predicates = [HasBasicD, IsLA64] /// Pseudo-instructions needed for the soft-float ABI with LA32D diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6..fe700e1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f32, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + } if (!Subtarget.hasBasicD()) { setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); @@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f64, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FLOG2, MVT::f64, Legal); + } } // Set operations for 'LSX' feature. 
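The hunks above teach LA64 to treat scalar ISD::FLOG2 as Legal and pair it with the new PatFpr<flog2, FLOGB_{S/D}> patterns, so @llvm.log2.f32/f64 select flogb.s/flogb.d directly instead of going through a libm call. A minimal sketch of source that should exercise the new lowering — assuming clang targeting loongarch64 with -fno-math-errno (needed for the builtins to become llvm.log2 intrinsics); the function names are illustrative, not from the patch:

// Hypothetical demo, not part of this patch: under -fno-math-errno these
// builtins lower to @llvm.log2.f32 / @llvm.log2.f64, which the hunks
// above mark Legal so instruction selection can emit flogb.s / flogb.d.
float log2f32(float x) { return __builtin_log2f(x); }  // expect flogb.s
double log2f64(double x) { return __builtin_log2(x); } // expect flogb.d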
@@ -362,10 +366,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } setOperationAction(ISD::CTPOP, GRLenVT, Legal); setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal); @@ -443,10 +454,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6..b502b056 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1558,6 +1558,10 @@ defm : PatXrXrF<fmul, "XVFMUL">; // XVFDIV_{S/D} defm : PatXrXrF<fdiv, "XVFDIV">; +// XVFMAX_{S/D}, XVFMIN_{S/D} +defm : PatXrXrF<fmaxnum, "XVFMAX">; +defm : PatXrXrF<fminnum, "XVFMIN">; + // XVFMADD_{S/D} def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; @@ -1593,6 +1597,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa), // XVFSQRT_{S/D} defm : PatXrF<fsqrt, "XVFSQRT">; +// XVFLOGB_{S/D} +defm : PatXrF<flog2, "XVFLOGB">; + // XVRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), (XVFRECIP_S v8f32:$xj)>; @@ -2024,6 +2031,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "XVAVG_B", v32i8>; +defm : VAvgPat<sra, "XVAVG_H", v16i16>; +defm : VAvgPat<sra, "XVAVG_W", v8i32>; +defm : VAvgPat<sra, "XVAVG_D", v4i64>; +defm : VAvgPat<srl, "XVAVG_BU", v32i8>; +defm : VAvgPat<srl, "XVAVG_HU", v16i16>; +defm : VAvgPat<srl, "XVAVG_WU", v8i32>; +defm : VAvgPat<srl, "XVAVG_DU", v4i64>; +defm : VAvgrPat<sra, "XVAVGR_B", v32i8>; +defm : VAvgrPat<sra, "XVAVGR_H", v16i16>; +defm : VAvgrPat<sra, "XVAVGR_W", v8i32>; +defm : VAvgrPat<sra, "XVAVGR_D", v4i64>; +defm : VAvgrPat<srl, "XVAVGR_BU", v32i8>; +defm : VAvgrPat<srl, "XVAVGR_HU", v16i16>; +defm : VAvgrPat<srl, "XVAVGR_WU", v8i32>; +defm : VAvgrPat<srl, "XVAVGR_DU", v4i64>; + // abs def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>; def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>; @@ -2403,6 +2428,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), (XVPICKVE_D 
v4f64:$xj, (to_valid_timm timm:$imm))>; +// Vector floating-point conversion +defm : PatXrF<fceil, "XVFRINTRP">; +defm : PatXrF<ffloor, "XVFRINTRM">; +defm : PatXrF<ftrunc, "XVFRINTRZ">; +defm : PatXrF<froundeven, "XVFRINTRNE">; + // load def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), (XVLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 4619c6b..6b74a4b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { } } +multiclass VAvgPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + +multiclass VAvgrPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)), + (vt (vsplat_imm_eq_1)))), + (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1748,6 +1760,10 @@ defm : PatVrVrF<fmul, "VFMUL">; // VFDIV_{S/D} defm : PatVrVrF<fdiv, "VFDIV">; +// VFMAX_{S/D}, VFMIN_{S/D} +defm : PatVrVrF<fmaxnum, "VFMAX">; +defm : PatVrVrF<fminnum, "VFMIN">; + // VFMADD_{S/D} def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; @@ -1783,6 +1799,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va), // VFSQRT_{S/D} defm : PatVrF<fsqrt, "VFSQRT">; +// VFLOGB_{S/D} +defm : PatVrF<flog2, "VFLOGB">; + // VFRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), (VFRECIP_S v4f32:$vj)>; @@ -2154,6 +2173,24 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; +// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "VAVG_B", v16i8>; +defm : VAvgPat<sra, "VAVG_H", v8i16>; +defm : VAvgPat<sra, "VAVG_W", v4i32>; +defm : VAvgPat<sra, "VAVG_D", v2i64>; +defm : VAvgPat<srl, "VAVG_BU", v16i8>; +defm : VAvgPat<srl, "VAVG_HU", v8i16>; +defm : VAvgPat<srl, "VAVG_WU", v4i32>; +defm : VAvgPat<srl, "VAVG_DU", v2i64>; +defm : VAvgrPat<sra, "VAVGR_B", v16i8>; +defm : VAvgrPat<sra, "VAVGR_H", v8i16>; +defm : VAvgrPat<sra, "VAVGR_W", v4i32>; +defm : VAvgrPat<sra, "VAVGR_D", v2i64>; +defm : VAvgrPat<srl, "VAVGR_BU", v16i8>; +defm : VAvgrPat<srl, "VAVGR_HU", v8i16>; +defm : VAvgrPat<srl, "VAVGR_WU", v4i32>; +defm : VAvgrPat<srl, "VAVGR_DU", v2i64>; + // abs def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>; def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>; @@ -2519,6 +2556,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)), (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>; +defm : PatVrF<fceil, "VFRINTRP">; +defm : PatVrF<ffloor, "VFRINTRM">; +defm : PatVrF<ftrunc, "VFRINTRZ">; +defm : PatVrF<froundeven, "VFRINTRNE">; + // load def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), (VLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7d54565..6d69af5 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -39,7 +39,7 @@ 
LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, /*HasRelocationAddend=*/true) {} -LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} +LoongArchELFObjectWriter::~LoongArchELFObjectWriter() = default; unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index f0e2bc4..08fa51d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -38,7 +38,7 @@ public: LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII) : Ctx(ctx), MCII(MCII) {} - ~LoongArchMCCodeEmitter() override {} + ~LoongArchMCCodeEmitter() override = default; void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index e37f3a66..fb5cd5c2 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -690,9 +690,9 @@ bool M68kAsmParser::parseRegisterName(MCRegister &RegNo, SMLoc Loc, } else { // Floating point control register. RegNo = StringSwitch<unsigned>(RegisterNameLower) - .Cases("fpc", "fpcr", M68k::FPC) - .Cases("fps", "fpsr", M68k::FPS) - .Cases("fpi", "fpiar", M68k::FPIAR) + .Cases({"fpc", "fpcr"}, M68k::FPC) + .Cases({"fps", "fpsr"}, M68k::FPS) + .Cases({"fpi", "fpiar"}, M68k::FPIAR) .Default(M68k::NoRegister); assert(RegNo != M68k::NoRegister && "Unrecognized FP control register name"); diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index fe83dc6..51bafe4 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -49,7 +49,7 @@ public: M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI) : MCAsmBackend(llvm::endianness::big), Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU()) - .CasesLower("m68020", "m68030", "m68040", true) + .CasesLower({"m68020", "m68030", "m68040"}, true) .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 97379d7..f588e56 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -6176,7 +6176,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Cases("at", "AT", 1) + .Cases({"at", "AT"}, 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index caef8fe7..b832b82 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -20,7 +20,7 @@ class MemoryLocation; class NVPTXAAResult : public AAResultBase { public: - NVPTXAAResult() {} + NVPTXAAResult() = default; NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {} /// Handle invalidation events from the new pass manager. 
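Several hunks in this patch — the M68k and Mips assembler parsers above, M68kAsmBackend's CasesLower, and the RISC-V inline-asm register tables below — move llvm::StringSwitch::Cases from variadic string arguments to the overload taking an initializer list. A minimal usage sketch of the new spelling; the helper name and mapped values are hypothetical, only the .Cases({...}, value) shape is taken from the patch:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Hypothetical helper mirroring the M68k hunk: several alias spellings
// of a register map to one value via the initializer-list Cases overload.
static unsigned classifyFPControlReg(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Cases({"fpc", "fpcr"}, 1)
      .Cases({"fps", "fpsr"}, 2)
      .Cases({"fpi", "fpiar"}, 3)
      .Default(0); // unrecognized name
}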
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7e7ee75..c667a09 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, ))) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ - [&]() -> auto { \ - if (is_mc && is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \ - if (is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \ - if (is_mc) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ - }() - static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, @@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, } } -static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, - bool IsMultiCast, - bool IsCacheHint, bool IsIm2Col) { - if (IsIm2Col) { - switch (Dim) { - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable("Invalid Dimension in im2col mode for " - "GetCpAsyncBulkTensorG2SOpcode."); - } - } else { - switch (Dim) { - case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable( - "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode."); - } - } -} - -static size_t GetDimsFromIntrinsic(unsigned IID) { - switch (IID) { - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: - return 3; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: - return 4; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: - return 5; - default: - llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic."); - } -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} - // multicast, cache_hint, - // multicast_flag, cache_hint_flag, cta_group_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {8 + dims + im2col_offsets} - size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? 
GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 10); - // Offsets is always 'NumDims - 2' and only for im2col mode - size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; - size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} - size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID - - unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); - if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) - report_fatal_error( - formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", - Subtarget->getSmVersion())); - - SDLoc DL(N); - SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs)); - - // Push MultiCast operand, if available - if (IsMultiCast) - Ops.push_back(N->getOperand(MultiCastIdx)); - - // Push CacheHint operand, if available - if (IsCacheHint) - Ops.push_back(N->getOperand(MultiCastIdx + 1)); - - // Flag for CTA Group - Ops.push_back(getI32Imm(CTAGroupVal, DL)); - - // Finally, the chain operand - Ops.push_back(N->getOperand(0)); - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode( - NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col) { @@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: - SelectCpAsyncBulkTensorG2SCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c912e70..1cb579b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -86,7 +86,6 @@ private: bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); - void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dfde0cc..b260221 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def hasTcgen05MMAScaleInputDImm : 
Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">; -def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c923f0e..50827bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> { string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", "); } -// From Global to Shared memory (G2S) -class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> { - string prefix = "cp.async.bulk.tensor"; - string dir = "shared::cluster.global"; - string completion = "mbarrier::complete_tx::bytes"; - string inst_name = prefix - # "." # dim # "d" - # "." # dir - # "." # mode - # "." # completion - # !if(mc, ".multicast::cluster", "") - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_" - # dim # "D" - # !if(is_shared32, "_SHARED32", "") - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - def CTAGroupFlags : Operand<i32> { let PrintMethod = "printCTAGroup"; } -multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> { - defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; - defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; - defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; - defvar rc = !if(is_shared32, B32, B64); - - defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); - defvar im2col_dag = !if(!eq(mode, "im2col"), - !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), - (ins)); - defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); - defvar im2col_asm_str = ", {{" # im2col_str # "}}"; - - defvar asm_str = !if(!eq(mode, "im2col"), - !strconcat(asm_str_default, im2col_asm_str), asm_str_default); +def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>; +def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>; - def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; -} - -foreach dim = [1, 2, 3, 4, 5] in { - foreach shared32 = [true, false] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name : - CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>; - } - } -} - -multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> { 
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred, + TImmLeaf cta_group_type = tma_cta_group_imm_any> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; @@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> !setdagop(dims_dag, intr), !setdagop(im2col_dag, intr), (intr B16:$mc, B64:$ch)); - defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); - defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); - defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); - defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, cta_group_type:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg)); def "" : NVPTXInst<(outs), ins_dag, inst_name # asm_str # ";", @@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> [intr_dag_with_mc_ch]>, Requires<pred>; } + +foreach dim = 1...5 in { + defm TMA_G2S_TILE_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_TILE_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", + [callSubtarget<"hasTMABlackwellSupport">]>; +} foreach dim = 3...5 in { + defm TMA_G2S_IM2COL_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", + [callSubtarget<"hasTMABlackwellSupport">]>; foreach mode = ["im2col_w", "im2col_w_128"] in { defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" - : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_INTR<dim, mode, + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; @@ -784,7 +740,8 @@ foreach dim = 3...5 in { : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>; defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" - : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", + [callSubtarget<"hasTMABlackwellSupport">]>; } defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", [hasPTX<86>, hasSM<100>]>; @@ -835,7 +792,7 @@ foreach dim = 1...5 in { } } defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; def TMAReductionFlags : Operand<i32> { let PrintMethod = "printTmaReductionMode"; @@ -930,11 +887,11 @@ foreach dim = 3...5 in { foreach mode = ["im2col_w", "im2col_w_128"] in { defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode, - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; //Prefetchu and 
Prefetch @@ -1605,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>; // Exp2 Log2 // -def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>; -def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>; let Predicates = [hasPTX<70>, hasSM<75>] in { - def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>; - def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>; + def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>; + def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>; +} + +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>; + def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>; } def LG2_APPROX_f32 : diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 194dbdc..021b1f6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -166,18 +166,15 @@ public: // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; - // TMA G2S copy with cta_group::1/2 support - bool hasCpAsyncBulkTensorCTAGroupSupport() const { - // TODO: Update/tidy-up after the family-conditional support arrives - switch (FullSmVersion) { - case 1003: - case 1013: - return PTXVersion >= 86; - case 1033: - return PTXVersion >= 88; - default: - return false; - } + // Checks support for the following in TMA: + // - cta_group::1/2 support + // - im2col_w/w_128 mode support + // - tile_gather4 mode support + // - tile_scatter4 mode support + bool hasTMABlackwellSupport() const { + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 729c077..64593e6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, // answer.
These include: // // - nvvm_cos_approx_{f,ftz_f} - // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_ex2_approx(_ftz) // - nvvm_lg2_approx_{d,f,ftz_f} // - nvvm_sin_approx_{f,ftz_f} // - nvvm_sqrt_approx_{f,ftz_f} diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index bcb3f50..780e124 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2702,7 +2702,7 @@ static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) { static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { return StringSwitch<bool>(GV->getName()) - .Cases("llvm.global_ctors", "llvm.global_dtors", true) + .Cases({"llvm.global_ctors", "llvm.global_dtors"}, true) .Default(false); } diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc..0c2e44e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 3640d25..70df59d 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { // useless and possible to break some original well-form addressing mode // to make this pre-inc prep for it. 
if (PointerElementType->isIntegerTy(64)) { - const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L); + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); if (!LARSCEV || LARSCEV->getLoop() != L) return false; diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 000d296..4ff489d 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, - computeFSAdditions(FS, OL, TT), Options, + : CodeGenTargetMachineImpl(T, + TT.computeDataLayout(Options.MCOptions.ABIName), + TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2fba090..b04e887 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -912,7 +912,7 @@ bool PPCTTIImpl::areInlineCompatible(const Function *Caller, bool PPCTTIImpl::areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { // We need to ensure that argument promotion does not // attempt to promote pointers to MMA types (__vector_pair diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 475472a..8d7f255 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -147,7 +147,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; bool supportsTailCallFor(const CallBase *CB) const override; private: diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 8198173..282cf5d 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,10 @@ private: void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; + void addVectorLoadStoreOperands(MachineInstr &I, + SmallVectorImpl<SrcOp> &SrcOps, + unsigned &CurOp, bool IsMasked, + bool IsStrided) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -716,6 +720,26 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +void RISCVInstructionSelector::addVectorLoadStoreOperands( + MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, + bool IsMasked, bool IsStrided) const { + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride + if (IsStrided) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + } + + // Mask + if (IsMasked) { + auto 
MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } +} + bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineIRBuilder &MIB) const { // Find the intrinsic ID. @@ -752,21 +776,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( SrcOps.push_back(Register(RISCV::NoRegister)); } - // Base Pointer - auto PtrReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(PtrReg); - - // Stride - if (IsStrided) { - auto StrideReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(StrideReg); - } - - // Mask - if (IsMasked) { - auto MaskReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(MaskReg); - } + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); const RISCV::VLEPseudo *P = @@ -795,6 +805,48 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsm: + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vsse_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vse_mask || + IntrinID == Intrinsic::riscv_vsse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vsse || + IntrinID == Intrinsic::riscv_vsse_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VSEPseudo *P = RISCV::getVSEPseudo( + IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 4105618..526675a 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -127,6 +127,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCMAX: + case RISCV::PseudoCCMAXU: + case RISCV::PseudoCCMIN: + case RISCV::PseudoCCMINU: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -217,6 +221,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, .addImm(0); } else { unsigned NewOpc; + // clang-format off switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -228,6 +233,10 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: NewOpc = RISCV::AND; break; case RISCV::PseudoCCOR: NewOpc = RISCV::OR; break; case RISCV::PseudoCCXOR: NewOpc = RISCV::XOR; break; + case RISCV::PseudoCCMAX: NewOpc = RISCV::MAX; break; + case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; + case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; + case RISCV::PseudoCCMINU: NewOpc = 
RISCV::MINU; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; @@ -250,6 +259,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break; case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break; } + // clang-format on if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b4556f6..cfee6ab 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1851,6 +1851,11 @@ def TuneShortForwardBranchOpt def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneShortForwardBranchIMinMax + : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax", + "true", "Enable short forward branch optimization for min,max instructions in Zbb", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9a6afa1..b25a054 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3995,6 +3995,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, case RISCV::CTZW: case RISCV::CPOPW: case RISCV::SLLI_UW: + case RISCV::ABSW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: case RISCV::FCVT_H_W_INX: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1c930ac..e0cf739 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -433,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtP() || (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) { setOperationAction(ISD::ABS, XLenVT, Legal); + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS, MVT::i32, Custom); } else if (Subtarget.hasShortForwardBranchOpt()) { // We can use PseudoCCSUB to implement ABS. setOperationAction(ISD::ABS, XLenVT, Legal); @@ -14816,8 +14818,16 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); + if (Subtarget.hasStdExtP()) { + SDValue Src = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs)); + return; + } + if (Subtarget.hasStdExtZbb()) { - // Emit a special ABSW node that will be expanded to NEGW+MAX at isel. + // Emit a special node that will be expanded to NEGW+MAX at isel. // This allows us to remember that the result is sign extended. Expanding // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits. SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, @@ -19784,7 +19794,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index, // LLVM's legalization take care of the splitting. // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet. 
Index = DAG.getNode(ISD::SIGN_EXTEND, DL, - IndexVT.changeVectorElementType(XLenVT), Index); + EVT::getVectorVT(*DAG.getContext(), XLenVT, + IndexVT.getVectorElementCount()), + Index); } IndexType = ISD::UNSIGNED_SCALED; return true; @@ -20290,6 +20302,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } + case RISCVISD::ABSW: case RISCVISD::CLZW: case RISCVISD::CTZW: { // Only the lower 32 bits of the first operand are read @@ -21862,6 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::REMUW: case RISCVISD::ROLW: case RISCVISD::RORW: + case RISCVISD::ABSW: case RISCVISD::FCVT_W_RV64: case RISCVISD::FCVT_WU_RV64: case RISCVISD::STRICT_FCVT_W_RV64: @@ -23932,7 +23946,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t0}", RISCV::X5) .Case("{t1}", RISCV::X6) .Case("{t2}", RISCV::X7) - .Cases("{s0}", "{fp}", RISCV::X8) + .Cases({"{s0}", "{fp}"}, RISCV::X8) .Case("{s1}", RISCV::X9) .Case("{a0}", RISCV::X10) .Case("{a1}", RISCV::X11) @@ -23969,38 +23983,38 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. if (Subtarget.hasStdExtF()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{f0}", "{ft0}", RISCV::F0_F) - .Cases("{f1}", "{ft1}", RISCV::F1_F) - .Cases("{f2}", "{ft2}", RISCV::F2_F) - .Cases("{f3}", "{ft3}", RISCV::F3_F) - .Cases("{f4}", "{ft4}", RISCV::F4_F) - .Cases("{f5}", "{ft5}", RISCV::F5_F) - .Cases("{f6}", "{ft6}", RISCV::F6_F) - .Cases("{f7}", "{ft7}", RISCV::F7_F) - .Cases("{f8}", "{fs0}", RISCV::F8_F) - .Cases("{f9}", "{fs1}", RISCV::F9_F) - .Cases("{f10}", "{fa0}", RISCV::F10_F) - .Cases("{f11}", "{fa1}", RISCV::F11_F) - .Cases("{f12}", "{fa2}", RISCV::F12_F) - .Cases("{f13}", "{fa3}", RISCV::F13_F) - .Cases("{f14}", "{fa4}", RISCV::F14_F) - .Cases("{f15}", "{fa5}", RISCV::F15_F) - .Cases("{f16}", "{fa6}", RISCV::F16_F) - .Cases("{f17}", "{fa7}", RISCV::F17_F) - .Cases("{f18}", "{fs2}", RISCV::F18_F) - .Cases("{f19}", "{fs3}", RISCV::F19_F) - .Cases("{f20}", "{fs4}", RISCV::F20_F) - .Cases("{f21}", "{fs5}", RISCV::F21_F) - .Cases("{f22}", "{fs6}", RISCV::F22_F) - .Cases("{f23}", "{fs7}", RISCV::F23_F) - .Cases("{f24}", "{fs8}", RISCV::F24_F) - .Cases("{f25}", "{fs9}", RISCV::F25_F) - .Cases("{f26}", "{fs10}", RISCV::F26_F) - .Cases("{f27}", "{fs11}", RISCV::F27_F) - .Cases("{f28}", "{ft8}", RISCV::F28_F) - .Cases("{f29}", "{ft9}", RISCV::F29_F) - .Cases("{f30}", "{ft10}", RISCV::F30_F) - .Cases("{f31}", "{ft11}", RISCV::F31_F) + .Cases({"{f0}", "{ft0}"}, RISCV::F0_F) + .Cases({"{f1}", "{ft1}"}, RISCV::F1_F) + .Cases({"{f2}", "{ft2}"}, RISCV::F2_F) + .Cases({"{f3}", "{ft3}"}, RISCV::F3_F) + .Cases({"{f4}", "{ft4}"}, RISCV::F4_F) + .Cases({"{f5}", "{ft5}"}, RISCV::F5_F) + .Cases({"{f6}", "{ft6}"}, RISCV::F6_F) + .Cases({"{f7}", "{ft7}"}, RISCV::F7_F) + .Cases({"{f8}", "{fs0}"}, RISCV::F8_F) + .Cases({"{f9}", "{fs1}"}, RISCV::F9_F) + .Cases({"{f10}", "{fa0}"}, RISCV::F10_F) + .Cases({"{f11}", "{fa1}"}, RISCV::F11_F) + .Cases({"{f12}", "{fa2}"}, RISCV::F12_F) + .Cases({"{f13}", "{fa3}"}, RISCV::F13_F) + .Cases({"{f14}", "{fa4}"}, RISCV::F14_F) + .Cases({"{f15}", "{fa5}"}, RISCV::F15_F) + .Cases({"{f16}", "{fa6}"}, RISCV::F16_F) + .Cases({"{f17}", "{fa7}"}, RISCV::F17_F) + .Cases({"{f18}", "{fs2}"}, RISCV::F18_F) + .Cases({"{f19}", "{fs3}"}, RISCV::F19_F) + .Cases({"{f20}", "{fs4}"}, RISCV::F20_F) + .Cases({"{f21}", "{fs5}"}, RISCV::F21_F) + .Cases({"{f22}", "{fs6}"}, 
RISCV::F22_F) + .Cases({"{f23}", "{fs7}"}, RISCV::F23_F) + .Cases({"{f24}", "{fs8}"}, RISCV::F24_F) + .Cases({"{f25}", "{fs9}"}, RISCV::F25_F) + .Cases({"{f26}", "{fs10}"}, RISCV::F26_F) + .Cases({"{f27}", "{fs11}"}, RISCV::F27_F) + .Cases({"{f28}", "{ft8}"}, RISCV::F28_F) + .Cases({"{f29}", "{ft9}"}, RISCV::F29_F) + .Cases({"{f30}", "{ft10}"}, RISCV::F30_F) + .Cases({"{f31}", "{ft11}"}, RISCV::F31_F) .Default(RISCV::NoRegister); if (FReg != RISCV::NoRegister) { assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg"); diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index a1c8e23..c58a5c0 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -48,7 +48,7 @@ class VXRMInfo { } State = Uninitialized; public: - VXRMInfo() {} + VXRMInfo() = default; static VXRMInfo getUnknown() { VXRMInfo Info; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 912b82d..c9df787 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -869,7 +869,7 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction &MF, MachineInstr &MI, } } -// This is the version used during inline spilling +// This is the version used during InlineSpiller::spillAroundUses MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, @@ -1699,6 +1699,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::AND: return RISCV::PseudoCCAND; case RISCV::OR: return RISCV::PseudoCCOR; case RISCV::XOR: return RISCV::PseudoCCXOR; + case RISCV::MAX: return RISCV::PseudoCCMAX; + case RISCV::MAXU: return RISCV::PseudoCCMAXU; + case RISCV::MIN: return RISCV::PseudoCCMIN; + case RISCV::MINU: return RISCV::PseudoCCMINU; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1735,7 +1739,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) { /// return the defining instruction. static MachineInstr *canFoldAsPredicatedOp(Register Reg, const MachineRegisterInfo &MRI, - const TargetInstrInfo *TII) { + const TargetInstrInfo *TII, + const RISCVSubtarget &STI) { if (!Reg.isVirtual()) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) @@ -1743,6 +1748,12 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return nullptr; + + if (!STI.hasShortForwardBranchIMinMax() && + (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN || + MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
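// Rough illustration of the fold this gate protects (a sketch, not taken
// from the patch): with short-forward-branch-i-minmax enabled, a select
// whose chosen value is a Zbb MAX, e.g.
//   %m = MAX %a, %b
//   %r = select (%x == %y), %m, %a
// can fold into PseudoCCMAX, which expandCCOp later turns into a copy of
// the false value plus a short forward branch that conditionally
// overwrites %r with a single MAX in the fall-through path.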
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; @@ -1806,10 +1817,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI, MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = - canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this); + canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI); bool Invert = !DefMI; if (!DefMI) - DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this); + DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI); if (!DefMI) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7c89686..9cb53fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { +let canFoldAsLoad = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; +} def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>; let IsSignExtendingOpW = 1 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index afac37d..4ffe3e6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { +let canFoldAsLoad = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 6571d99..b30f8ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,6 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { +let canFoldAsLoad = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index cc085bb..4cbbba3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1461,5 +1461,10 @@ let Predicates = [HasStdExtP, IsRV32] in { // Codegen patterns //===----------------------------------------------------------------------===// +def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; + let Predicates = [HasStdExtP] in def : PatGpr<abs, ABS>; + +let Predicates = [HasStdExtP, IsRV64] in +def : PatGpr<riscv_absw, ABSW>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 0114fbd..5a67a5a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -106,6 
+106,10 @@ def PseudoCCSRA : SFBALU_rr; def PseudoCCAND : SFBALU_rr; def PseudoCCOR : SFBALU_rr; def PseudoCCXOR : SFBALU_rr; +def PseudoCCMAX : SFBALU_rr; +def PseudoCCMIN : SFBALU_rr; +def PseudoCCMAXU : SFBALU_rr; +def PseudoCCMINU : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index d08115b..ea98cdb 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -172,6 +172,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::CTZW: case RISCV::CPOPW: case RISCV::SLLI_UW: + case RISCV::ABSW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: case RISCV::FCVT_H_W_INX: diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index e9f43b9..84bb294 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs); Register VLENB = 0; - unsigned PreHandledNum = 0; + unsigned VLENBShift = 0; + unsigned PrevHandledNum = 0; unsigned I = 0; while (I != NumRegs) { auto [LMulHandled, RegClass, Opcode] = getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill); auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled); bool IsLast = I + RegNumHandled == NumRegs; - if (PreHandledNum) { + if (PrevHandledNum) { Register Step; // Optimize for constant VLEN. if (auto VLEN = STI.getRealVLen()) { - int64_t Offset = *VLEN / 8 * PreHandledNum; + int64_t Offset = *VLEN / 8 * PrevHandledNum; Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset); } else { @@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass); BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB); } - uint32_t ShiftAmount = Log2_32(PreHandledNum); - if (ShiftAmount == 0) - Step = VLENB; - else { - Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step) - .addReg(VLENB, getKillRegState(IsLast)) - .addImm(ShiftAmount); + uint32_t ShiftAmount = Log2_32(PrevHandledNum); + // To avoid using an extra register, we shift the VLENB register and + // remember how much it has been shifted. We can then use relative + // shifts to adjust to the desired shift amount. 
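// Worked example (illustrative; assumes a 7-register spill split into
// LMUL groups of 4, 2 and 1): between groups the base must advance by
// PrevHandledNum * VLENB.
//   before group 2: PrevHandledNum = 4, ShiftAmount = 2, VLENBShift = 0
//                   -> SLLI vlenb, vlenb, 2 (register now holds VLENB << 2)
//   before group 3: PrevHandledNum = 2, ShiftAmount = 1, VLENBShift = 2
//                   -> SRLI vlenb, vlenb, 1 (back down to VLENB << 1)
// The single scratch register is adjusted in place instead of
// materializing a freshly shifted copy of VLENB for every group.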
+ if (VLENBShift > ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(VLENBShift - ShiftAmount); + } else if (VLENBShift < ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(ShiftAmount - VLENBShift); } + VLENBShift = ShiftAmount; + Step = VLENB; } BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase) @@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, if (IsSpill) MIB.addReg(Reg, RegState::Implicit); - PreHandledNum = RegNumHandled; + PrevHandledNum = RegNumHandled; RegEncoding += RegNumHandled; I += RegNumHandled; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp index 0a318e0..ed6d355 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp @@ -15,4 +15,4 @@ using namespace llvm; SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} -SPIRVTargetStreamer::~SPIRVTargetStreamer() {} +SPIRVTargetStreamer::~SPIRVTargetStreamer() = default; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a..dd57b74 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. + if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3fea21e..3f0424f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3151,6 +3151,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectInsertElt(ResVReg, ResType, I); case Intrinsic::spv_gep: return selectGEP(ResVReg, ResType, I); + case Intrinsic::spv_bitcast: { + Register OpReg = I.getOperand(2).getReg(); + SPIRVType *OpType = + OpReg.isValid() ? 
GR.getSPIRVTypeForVReg(OpReg) : nullptr; + if (!GR.isBitcastCompatible(ResType, OpType)) + report_fatal_error("incompatible result and operand types in a bitcast"); + return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast); + } case Intrinsic::spv_unref_global: case Intrinsic::spv_init_global: { MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg()); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 6e444c9..65dffc7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); Value *AssignValue = NewLoad; if (TargetType->getElementType() != SourceType->getElementType()) { + const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout(); + [[maybe_unused]] TypeSize TargetTypeSize = + DL.getTypeSizeInBits(TargetType); + [[maybe_unused]] TypeSize SourceTypeSize = + DL.getTypeSizeInBits(SourceType); + assert(TargetTypeSize == SourceTypeSize); AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, {TargetType, SourceType}, {NewLoad}); buildAssignType(B, TargetType, AssignValue); + return AssignValue; } + assert(TargetType->getNumElements() < SourceType->getNumElements()); SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index f7cdfcb..db036a5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -613,8 +613,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, << FinalFlags << "\n"; MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI); MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); - OrigFlagsOp = - MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags)); + OrigFlagsOp = MachineOperand::CreateImm(FinalFlags); return; // Merge done, so we found a duplicate; don't add it to MAI.MS } } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 2d19f6de..44b6c66 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -81,7 +81,7 @@ private: void initAvailableCapabilitiesForVulkan(const SPIRVSubtarget &ST); public: - RequirementHandler() {} + RequirementHandler() = default; void clear() { MinimalCaps.clear(); AllCaps.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index db6f2d6..d538009 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, .addUse(OpReg); } -// We do instruction selections early instead of calling MIB.buildBitcast() -// generating the general op code G_BITCAST. 
When MachineVerifier validates -// G_BITCAST we see a check of a kind: if Source Type is equal to Destination -// Type then report error "bitcast must change the type". This doesn't take into -// account the notion of a typed pointer that is important for SPIR-V where a -// user may and should use bitcast between pointers with different pointee types -// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast). -// It's important for correct lowering in SPIR-V, because interpretation of the -// data type is not left to instructions that utilize the pointer, but encoded -// by the pointer declaration, and the SPIRV target can and must handle the -// declaration and use of pointers that specify the type of data they point to. -// It's not feasible to improve validation of G_BITCAST using just information -// provided by low level types of source and destination. Therefore we don't -// produce G_BITCAST as the general op code with semantics different from -// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only -// difference would be that CombinerHelper couldn't transform known patterns -// around G_BUILD_VECTOR. See discussion -// in https://github.com/llvm/llvm-project/pull/110270 for even more context. -static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error. +// The verifier checks if the source and destination LLTs of a G_BITCAST are +// different, but this check is too strict for SPIR-V's typed pointers, which +// may have the same LLT but different SPIRVType (e.g. pointers to different +// pointee types). By lowering to OpBitcast here, we bypass the verifier's +// check. See discussion in https://github.com/llvm/llvm-project/pull/110270 +// for more context. +// +// We also handle the llvm.spv.bitcast intrinsic here. If the source and +// destination SPIR-V types are the same, we lower it to a COPY to enable +// further optimizations like copy propagation. 
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { SmallVector<MachineInstr *, 16> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg); + assert( + DstType && + "Expected destination SPIR-V type to have been assigned already."); + SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg); + assert(SrcType && + "Expected source SPIR-V type to have been assigned already."); + if (DstType == SrcType) { + MIB.setInsertPt(*MI.getParent(), MI); + MIB.buildCopy(DstReg, SrcReg); + ToErase.push_back(&MI); + continue; + } + } + if (MI.getOpcode() != TargetOpcode::G_BITCAST) continue; + MIB.setInsertPt(*MI.getParent(), MI); buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); @@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && - !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); ToErase.push_back(&MI); - if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); - continue; - } Register Def = MI.getOperand(0).getReg(); Register Source = MI.getOperand(2).getReg(); Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0); @@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { removeImplicitFallthroughs(MF, MIB); insertSpirvDecorations(MF, GR, MIB); insertInlineAsm(MF, GR, ST, MIB); - selectOpBitcasts(MF, GR, MIB); + lowerBitcasts(MF, GR, MIB); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 7dd0b95..5ba0356 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -69,7 +69,7 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { } // Pin SPIRVTargetObjectFile's vtables to this file. -SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {} +SPIRVTargetObjectFile::~SPIRVTargetObjectFile() = default; SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h index 9d0adbb..87ec256 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { /// This implementation is used for SystemZ ELF targets. class SystemZELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SystemZELFTargetObjectFile() {} + SystemZELFTargetObjectFile() = default; /// Describe a TLS variable address within debug info. 
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h index 7845cdf..1bfc61f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h @@ -76,7 +76,7 @@ public: BlockSet.insert(MBB); } ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; } - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -96,7 +96,7 @@ public: void addSubException(std::unique_ptr<WebAssemblyException> E) { SubExceptions.push_back(std::move(E)); } - using iterator = typename decltype(SubExceptions)::const_iterator; + using iterator = decltype(SubExceptions)::const_iterator; iterator begin() const { return SubExceptions.begin(); } iterator end() const { return SubExceptions.end(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 27f7e1a..5a1779c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -81,7 +81,7 @@ WebAssemblyFrameLowering::getLocalForStackObject(MachineFunction &MF, // Abuse object size to record number of WebAssembly locals allocated to // this object. MFI.setObjectSize(FrameIndex, ValueVTs.size()); - return static_cast<unsigned>(Local); + return Local; } /// We need a base pointer in the case of having items on the stack that diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index ff4d6469..ee575e3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -207,8 +207,7 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> { template <> struct CustomMappingTraits<BBNumberMap> { static void inputOne(IO &YamlIO, StringRef Key, BBNumberMap &SrcToUnwindDest) { - YamlIO.mapRequired(Key.str().c_str(), - SrcToUnwindDest[std::atoi(Key.str().c_str())]); + YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]); } static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h index e92bf17..96b8a4e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h @@ -35,7 +35,7 @@ public: virtual MachineBasicBlock *getHeader() const = 0; virtual bool contains(const MachineBasicBlock *MBB) const = 0; virtual unsigned getNumBlocks() const = 0; - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; virtual iterator_range<block_iterator> blocks() const = 0; virtual bool isLoop() const = 0; }; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 127ee67..bac3692 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1121,7 +1121,7 @@ private: void 
setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt, + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}, bool MatchingInlineAsm = false) { MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) { @@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, // Report back its kind, or IOK_INVALID if does not evaluated as a known one unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { return StringSwitch<unsigned>(Name) - .Cases("TYPE","type",IOK_TYPE) - .Cases("SIZE","size",IOK_SIZE) - .Cases("LENGTH","length",IOK_LENGTH) - .Default(IOK_INVALID); + .Cases({"TYPE", "type"}, IOK_TYPE) + .Cases({"SIZE", "size"}, IOK_SIZE) + .Cases({"LENGTH", "length"}, IOK_LENGTH) + .Default(IOK_INVALID); } /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator @@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) .Case("type", MOK_TYPE) - .Cases("size", "sizeof", MOK_SIZEOF) - .Cases("length", "lengthof", MOK_LENGTHOF) + .Cases({"size", "sizeof"}, MOK_SIZEOF) + .Cases({"length", "lengthof"}, MOK_LENGTHOF) .Default(MOK_INVALID); } @@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) - .Cases("BYTE", "byte", 8) - .Cases("WORD", "word", 16) - .Cases("DWORD", "dword", 32) - .Cases("FLOAT", "float", 32) - .Cases("LONG", "long", 32) - .Cases("FWORD", "fword", 48) - .Cases("DOUBLE", "double", 64) - .Cases("QWORD", "qword", 64) - .Cases("MMWORD","mmword", 64) - .Cases("XWORD", "xword", 80) - .Cases("TBYTE", "tbyte", 80) - .Cases("XMMWORD", "xmmword", 128) - .Cases("YMMWORD", "ymmword", 256) - .Cases("ZMMWORD", "zmmword", 512) - .Default(0); + .Cases({"BYTE", "byte"}, 8) + .Cases({"WORD", "word"}, 16) + .Cases({"DWORD", "dword"}, 32) + .Cases({"FLOAT", "float"}, 32) + .Cases({"LONG", "long"}, 32) + .Cases({"FWORD", "fword"}, 48) + .Cases({"DOUBLE", "double"}, 64) + .Cases({"QWORD", "qword"}, 64) + .Cases({"MMWORD", "mmword"}, 64) + .Cases({"XWORD", "xword"}, 80) + .Cases({"TBYTE", "tbyte"}, 80) + .Cases({"XMMWORD", "xmmword"}, 128) + .Cases({"YMMWORD", "ymmword"}, 256) + .Cases({"ZMMWORD", "zmmword"}, 512) + .Default(0); if (Size) { if (SizeStr) *SizeStr = getTok().getString(); @@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { // otherwise the EFLAGS Condition Code enumerator. 
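// The .Cases churn throughout this file is a mechanical migration from
// the variadic StringSwitch::Cases form
//   .Cases("e", "z", X86::COND_E)
// to the overload taking an initializer list of string literals
//   .Cases({"e", "z"}, X86::COND_E)
// Matching behavior is unchanged; only the API spelling differs.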
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { return StringSwitch<X86::CondCode>(CC) - .Case("o", X86::COND_O) // Overflow - .Case("no", X86::COND_NO) // No Overflow - .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal - .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below - .Cases("e", "z", X86::COND_E) // Equal/Zero - .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero - .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above - .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal - .Case("s", X86::COND_S) // Sign - .Case("ns", X86::COND_NS) // No Sign - .Cases("p", "pe", X86::COND_P) // Parity/Parity Even - .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd - .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal - .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less - .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater - .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal + .Case("o", X86::COND_O) // Overflow + .Case("no", X86::COND_NO) // No Overflow + .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal + .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below + .Cases({"e", "z"}, X86::COND_E) // Equal/Zero + .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero + .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above + .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal + .Case("s", X86::COND_S) // Sign + .Case("ns", X86::COND_NS) // No Sign + .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even + .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd + .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal + .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less + .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater + .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal .Default(X86::COND_INVALID); } @@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. if (ForcedDataPrefix == X86::Is32Bit) @@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // Find one unsized memory operand, if present. 
X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 89ac53e..a922725 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } - bool isTILEPair() const { - return Kind == Register && - X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); - } - - void addTILEPairOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - MCRegister Reg = getReg(); - switch (Reg.id()) { - default: - llvm_unreachable("Invalid tile register!"); - case X86::TMM0: - case X86::TMM1: - Reg = X86::TMM0_TMM1; - break; - case X86::TMM2: - case X86::TMM3: - Reg = X86::TMM2_TMM3; - break; - case X86::TMM4: - case X86::TMM5: - Reg = X86::TMM4_TMM5; - break; - case X86::TMM6: - case X86::TMM7: - Reg = X86::TMM6_TMM7; - break; - } - Inst.addOperand(MCOperand::createReg(Reg)); - } - void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 4927b45..7d2b5eb 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ - case TYPE_TMM_PAIR: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: - case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index dc9af2c..b0aa70b 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,12 +535,6 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) -#define REGS_TMM_PAIRS \ - ENTRY(TMM0_TMM1) \ - ENTRY(TMM2_TMM3) \ - ENTRY(TMM4_TMM5) \ - ENTRY(TMM6_TMM7) - #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -565,7 +559,6 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ - REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 1c5f166..759d95e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask pair register name"); } - -void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - switch (MI->getOperand(OpNo).getReg()) { - case X86::TMM0_TMM1: - printRegName(OS, X86::TMM0); - return; - case X86::TMM2_TMM3: - printRegName(OS, X86::TMM2); - return; - case X86::TMM4_TMM5: - printRegName(OS, X86::TMM4); - return; - case X86::TMM6_TMM7: - printRegName(OS, X86::TMM6); - return; - } 
- llvm_unreachable("Unknown mask pair register name"); -} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 2c9467c..cb55f2f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -40,7 +40,6 @@ protected: const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a1fd366..9e291a6 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", "Support AMX-MOVRS instructions", [FeatureAMXTILE]>; -def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", - "Support AMX amx-transpose instructions", - [FeatureAMXTILE]>; def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", "HasAMXAVX512", "true", "Support AMX-AVX512 instructions", @@ -1177,8 +1174,7 @@ def ProcessorFeatures { FeatureAMXMOVRS, FeatureAMXAVX512, FeatureAMXFP8, - FeatureAMXTF32, - FeatureAMXTRANSPOSE]; + FeatureAMXTF32]; list<SubtargetFeature> DMRFeatures = !listconcat(GNRDFeatures, DMRAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 4a9b824..e3c44c0 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } - // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding - // AMX instruction to support it. So, split it to 2 load instructions: - // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> - // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + - // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" - case X86::PTILEPAIRLOAD: { - int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(0).getReg(); - bool DstIsDead = MBBI->getOperand(0).isDead(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(1 + i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(1 + i)); - } - - // Make sure the first stride reg used in first tileload is alive. - MachineOperand &Stride = - MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. 
- MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no - // corresponding AMX instruction to support it. So, split it too: - // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> - // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + - // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" - case X86::PTILEPAIRSTORE: { - int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); - bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(i)); - } - MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); - MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); - - // Make sure the first stride reg used in first tilestore is alive. - MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - for (unsigned i = 3; i > 0; --i) - MI.removeOperand(i); - unsigned Opc; - switch (Opcode) { - case X86::PT2RPNTLVWZ0V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - default: - llvm_unreachable("Impossible Opcode!"); - } - MI.setDesc(TII->get(Opc)); - return true; - } - case X86::PTTRANSPOSEDV: - case X86::PTCONJTFP16V: { - for (int i = 2; i > 0; --i) - MI.removeOperand(i); - MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? 
X86::TTRANSPOSED - : X86::TCONJTFP16)); - return true; - } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: @@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + // clang-format off case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; - case X86::PTTDPBF16PSV: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PSV: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PSV: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PSV: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PSV: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PSV: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PSV: - Opc = X86::TTMMULTF32PS; - break; - case X86::PTDPBF8PSV: - Opc = X86::TDPBF8PS; - break; - case X86::PTDPBHF8PSV: - Opc = X86::TDPBHF8PS; - break; - case X86::PTDPHBF8PSV: - Opc = X86::TDPHBF8PS; - break; - case X86::PTDPHF8PSV: - Opc = X86::TDPHF8PS; - break; - + case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break; + case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break; + // clang-format on default: llvm_unreachable("Unexpected Opcode"); } diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 787b71d..06f729a 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -267,24 +267,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } -static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; +static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual() && + (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) { + return true; } - return 0; -} -static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { - return getTileDefNum(MRI, VirtReg) > 0; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + + return false; } static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { @@ -296,7 +288,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { if (!MO.isReg()) return false; - return getTileDefNum(MRI, MO.getReg()) > 0; + return isTileRegister(MRI, MO.getReg()); } static ShapeT 
getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -636,19 +628,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } - unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); - if (TileDefNum > 1) { - for (unsigned I = 1; I < TileDefNum; I++) { - MachineOperand *ColxMO = &MI.getOperand(2 + I); - MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg()); - if (ColxMI->getParent() == &MBB) { - if (!LastShapeMI) - LastShapeMI = ColxMI; - else if (dominates(MBB, LastShapeMI, ColxMI)) - LastShapeMI = ColxMI; - } - } - } + // If there is user live out of the tilecfg, spill it and reload in // before the user. Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 11d331b..d86ae36 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return 0; + return false; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { @@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // register is not rewritten yet. if (Reg.isVirtual()) { if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return 1; - if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) - return 2; + return true; } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; + return true; } - return 0; + return false; } static unsigned getTMMIndex(Register Reg) { if (Reg >= X86::TMM0 && Reg <= X86::TMM7) return Reg - X86::TMM0; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return (Reg - X86::TMM0_TMM1) * 2; llvm_unreachable("Invalid Tmm Reg!"); } @@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - unsigned DefNum = getNumDefTiles(MRI, MI); - if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); - for (unsigned I = 0; I < DefNum; I++) { - MachineOperand &Col = MI.getOperand(2 + I); - ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); - } + MachineOperand &Col = MI.getOperand(2); + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. 
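The two X86 fast tile-config hunks above drop the tile-pair path: with AMX-TRANSPOSE removed, every tile-defining pseudo yields exactly one TMM register and one (row, col) shape, so the DefNum loop collapses to a single push. A minimal standalone sketch of the simplified reverse walk follows; Shape, Instr and collectShapes are illustrative stand-ins, since the real pass works on MachineInstr and ShapeT.

#include <cstdint>
#include <utility>
#include <vector>

struct Shape { uint16_t Rows, Cols; };
struct Instr { bool DefinesTile; unsigned TmmIdx; Shape S; };

// Mirrors the simplified configBasicBlock loop: walk the block in reverse
// and record exactly one (tmm index, shape) pair per tile definition.
std::vector<std::pair<unsigned, Shape>>
collectShapes(const std::vector<Instr> &Block) {
  std::vector<std::pair<unsigned, Shape>> ShapeInfos;
  for (auto It = Block.rbegin(); It != Block.rend(); ++It)
    if (It->DefinesTile)
      ShapeInfos.emplace_back(It->TmmIdx, It->S);
  return ShapeInfos;
}

int main() {
  std::vector<Instr> BB = {{true, 0, {16, 64}}, {false, 0, {0, 0}},
                           {true, 1, {16, 32}}};
  return collectShapes(BB).size() == 2 ? 0 : 1;
}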
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4393f6e..d4418c8 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -337,23 +337,8 @@ namespace { // lowering but before ISEL. bool isAMXSDNode(SDNode *N) const { // Check if N is AMX SDNode: - // 1. check specific opcode since these carry MVT::Untyped instead of - // x86amx_type; - // 2. check result type; - // 3. check operand type; - switch (N->getOpcode()) { - default: - break; - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: - return true; - } + // 1. check result type; + // 2. check operand type; for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { if (N->getValueType(Idx) == MVT::x86amx) return true; @@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } - case Intrinsic::x86_t2rpntlvwz0rs: - case Intrinsic::x86_t2rpntlvwz0rst1: - case Intrinsic::x86_t2rpntlvwz1rs: - case Intrinsic::x86_t2rpntlvwz1rst1: - if (!Subtarget->hasAMXMOVRS()) - break; - [[fallthrough]]; - case Intrinsic::x86_t2rpntlvwz0: - case Intrinsic::x86_t2rpntlvwz0t1: - case Intrinsic::x86_t2rpntlvwz1: - case Intrinsic::x86_t2rpntlvwz1t1: { - if (!Subtarget->hasAMXTRANSPOSE()) - break; - auto *MFI = - CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); - MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0: - Opc = X86::PT2RPNTLVWZ0; - break; - case Intrinsic::x86_t2rpntlvwz0t1: - Opc = X86::PT2RPNTLVWZ0T1; - break; - case Intrinsic::x86_t2rpntlvwz1: - Opc = X86::PT2RPNTLVWZ1; - break; - case Intrinsic::x86_t2rpntlvwz1t1: - Opc = X86::PT2RPNTLVWZ1T1; - break; - case Intrinsic::x86_t2rpntlvwz0rs: - Opc = X86::PT2RPNTLVWZ0RS; - break; - case Intrinsic::x86_t2rpntlvwz0rst1: - Opc = X86::PT2RPNTLVWZ0RST1; - break; - case Intrinsic::x86_t2rpntlvwz1rs: - Opc = X86::PT2RPNTLVWZ1RS; - break; - case Intrinsic::x86_t2rpntlvwz1rst1: - Opc = X86::PT2RPNTLVWZ1RST1; - break; - } - // FIXME: Match displacement and scale. - unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 624cff2..2970cf4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22861,6 +22861,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); + // Don't do this if we're not supposed to use the FPU. 
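// (Note on the hoist: the soft-float / NoImplicitFloat bail-out used to
// sit inside the SSE/AVX size check further down; moving it first lets
// the combine exit before the zero-comparison analysis whenever FP
// codegen is disallowed.)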
+ bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (Subtarget.useSoftFloat() || NoImplicitFloatOps) + return SDValue(); + // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may @@ -22883,13 +22890,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. - bool NoImplicitFloatOps = - DAG.getMachineFunction().getFunction().hasFnAttribute( - Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX()) || - (OpSize == 512 && Subtarget.useAVX512Regs()))) { + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened @@ -27946,67 +27949,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } - case Intrinsic::x86_t2rpntlvwz0rs_internal: - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - case Intrinsic::x86_t2rpntlvwz1rs_internal: - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: { - auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); - X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned IntNo = Op.getConstantOperandVal(1); - unsigned Opc = 0; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0_internal: - Opc = X86::PT2RPNTLVWZ0V; - break; - case Intrinsic::x86_t2rpntlvwz0t1_internal: - Opc = X86::PT2RPNTLVWZ0T1V; - break; - case Intrinsic::x86_t2rpntlvwz1_internal: - Opc = X86::PT2RPNTLVWZ1V; - break; - case Intrinsic::x86_t2rpntlvwz1t1_internal: - Opc = X86::PT2RPNTLVWZ1T1V; - break; - case Intrinsic::x86_t2rpntlvwz0rs_internal: - Opc = X86::PT2RPNTLVWZ0RSV; - break; - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - Opc = X86::PT2RPNTLVWZ0RST1V; - break; - case Intrinsic::x86_t2rpntlvwz1rs_internal: - Opc = X86::PT2RPNTLVWZ1RSV; - break; - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - Opc = X86::PT2RPNTLVWZ1RST1V; - break; - } - - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); - - SDValue Ops[] = {Op.getOperand(2), // Row - Op.getOperand(3), // Col0 - Op.getOperand(4), // Col1 - Op.getOperand(5), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - Op.getOperand(6), // Index - DAG.getTargetConstant(0, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment - Op.getOperand(0)}; // Chain - - MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); - SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, - SDValue(Res, 0)); - SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, - SDValue(Res, 0)); - return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); - } case Intrinsic::x86_atomic_bts_rm: 
case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -37745,10 +37687,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; - auto TMMImmToTMMPair = [](unsigned Imm) { - assert(Imm < 8 && "Illegal tmm pair index."); - return X86::TMM0_TMM1 + Imm / 2; - }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); @@ -38129,53 +38067,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBHF8PS: case X86::PTDPHBF8PS: case X86::PTDPHF8PS: - case X86::PTTDPBF16PS: - case X86::PTTDPFP16PS: - case X86::PTTCMMIMFP16PS: - case X86::PTTCMMRLFP16PS: - case X86::PTCONJTCMMIMFP16PS: - case X86::PTMMULTF32PS: - case X86::PTTMMULTF32PS: { + case X86::PTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); + // clang-format off case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; - case X86::PTCMMIMFP16PS: - Opc = X86::TCMMIMFP16PS; - break; - case X86::PTCMMRLFP16PS: - Opc = X86::TCMMRLFP16PS; - break; + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; - case X86::PTTDPBF16PS: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PS: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PS: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PS: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PS: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PS: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PS: - Opc = X86::TTMMULTF32PS; - break; + case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break; + // clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); @@ -38246,70 +38156,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PT2RPNTLVWZ0: - case X86::PT2RPNTLVWZ0T1: - case X86::PT2RPNTLVWZ1: - case X86::PT2RPNTLVWZ1T1: - case X86::PT2RPNTLVWZ0RS: - case X86::PT2RPNTLVWZ0RST1: - case X86::PT2RPNTLVWZ1RS: - case X86::PT2RPNTLVWZ1RST1: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; -#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); - case X86::PT2RPNTLVWZ0: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - } -#undef GET_EGPR_IF_ENABLED - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); - - MIB.add(MI.getOperand(1)); // base - MIB.add(MI.getOperand(2)); // scale - MIB.add(MI.getOperand(3)); // index - MIB.add(MI.getOperand(4)); // displacement - MIB.add(MI.getOperand(5)); // segment - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PTTRANSPOSED: - case X86::PTCONJTFP16: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED - : X86::TCONJTFP16; - - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } case X86::PTCVTROWPS2BF16Hrri: case X86::PTCVTROWPS2BF16Lrri: case X86::PTCVTROWPS2PHHrri: @@ -48778,10 +48624,9 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == X86ISD::PCMPEQ && ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) { - SDLoc DL(EFLAGS); CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE); - SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0)); - return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X); + SDValue X = DAG.getBitcast(OpVT, DAG.getFreeze(BC0.getOperand(0))); + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, X, X); } } } @@ -48837,7 +48682,7 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, MVT FloatSVT = MVT::getFloatingPointVT(EltBits); MVT FloatVT = MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits); - Res = DAG.getBitcast(FloatVT, Res); + Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res)); return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res); } else if (EltBits == 16) { MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8; @@ -48856,8 +48701,30 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } // TESTZ(X,-1) == TESTZ(X,X) - if (ISD::isBuildVectorAllOnes(Op1.getNode())) + if (ISD::isBuildVectorAllOnes(Op1.getNode())) { + Op0 = DAG.getFreeze(Op0); return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + } + + // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets. 
+  if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) {
+    KnownBits KnownOp1 = DAG.computeKnownBits(Op1);
+    assert(KnownOp1.getBitWidth() == 64 &&
+           "Illegal PTEST vector element width");
+    if (KnownOp1.isConstant()) {
+      const APInt &Mask = KnownOp1.getConstant();
+      if (Mask.isSignMask()) {
+        MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64);
+        Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+        return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+      }
+      if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) {
+        MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32);
+        Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
+        return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
+      }
+    }
+  }

   // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
   // TODO: Add COND_NE handling?
@@ -53480,6 +53347,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Look for a RMW operation that only touches one bit of a larger than legal
+// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
+                              SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+
+  // Only handle normal stores whose chain is a matching normal load.
+  auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
+  if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
+      !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+      Ld->getBasePtr() != St->getBasePtr() ||
+      Ld->getOffset() != St->getOffset())
+    return SDValue();
+
+  SDValue LoadVal(Ld, 0);
+  SDValue StoredVal = St->getValue();
+  EVT VT = StoredVal.getValueType();
+
+  // Only narrow larger than legal scalar integers.
+  if (!VT.isScalarInteger() ||
+      VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
+    return SDValue();
+
+  // BTR: X & ~(1 << ShAmt)
+  // BTS: X | (1 << ShAmt)
+  // BTC: X ^ (1 << ShAmt)
+  SDValue ShAmt;
+  if (!StoredVal.hasOneUse() ||
+      !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+                                  m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
+        sd_match(StoredVal,
+                 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+        sd_match(StoredVal,
+                 m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+    return SDValue();
+
+  // Ensure the shift amount is in bounds.
+  KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+  if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
+    return SDValue();
+
+  // Split the shift into an alignment shift that moves the active i32 block to
+  // the bottom bits for truncation and a modulo shift that can act on the i32.
+  EVT AmtVT = ShAmt.getValueType();
+  SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+                                 DAG.getSignedConstant(-32LL, DL, AmtVT));
+  SDValue ModuloAmt =
+      DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+
+  // Compute the byte offset for the i32 block that is changed by the RMW.
+  // combineTruncate will adjust the load for us in a similar way.
+  EVT PtrVT = St->getBasePtr().getValueType();
+  SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
+  SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+                                   DAG.getShiftAmountConstant(3, PtrVT, DL));
+  SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
+                                            SDNodeFlags::NoUnsignedWrap);
+
+  // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = + DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + + SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53706,6 +53647,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54493,6 +54437,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { + using namespace SDPatternMatch; if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); @@ -54502,42 +54447,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); - if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) + if (!SSatVal) return SDValue(); - // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs - // of multiplies from even/odd elements. - SDValue N0 = SSatVal.getOperand(0); - SDValue N1 = SSatVal.getOperand(1); - - if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - + // See if this is a signed saturation of an ADD, adding pairs of multiplies + // from even/odd elements, from zero_extend/sign_extend operands. + // // TODO: Handle constant vectors and use knownbits/computenumsignbits? - // Canonicalize zero_extend to LHS. - if (N01.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N00, N01); - if (N11.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N10, N11); - - // Ensure we have a zero_extend and a sign_extend. - if (N00.getOpcode() != ISD::ZERO_EXTEND || - N01.getOpcode() != ISD::SIGN_EXTEND || - N10.getOpcode() != ISD::ZERO_EXTEND || - N11.getOpcode() != ISD::SIGN_EXTEND) + SDValue N00, N01, N10, N11; + if (!sd_match(SSatVal, + m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))), + m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11)))))) return SDValue(); - // Peek through the extends. - N00 = N00.getOperand(0); - N01 = N01.getOperand(0); - N10 = N10.getOperand(0); - N11 = N11.getOperand(0); - // Ensure the extend is from vXi8. if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || @@ -54660,8 +54582,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -54669,9 +54592,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. + // Check the shift amount doesn't depend on the original load. if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - - VT.getSizeInBits())) { + VT.getSizeInBits()) && + !Ld->isPredecessorOf(ShAmt.getNode())) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = @@ -54682,8 +54607,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewLoad = DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -56459,6 +56383,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56517,6 +56442,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
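The PTESTZ(X,SIGNMASK) -> VTESTPS/PD fold added above is sound because TESTP inspects only the sign bit of each floating-point lane, so ANDing with a per-lane sign mask is zero exactly when TESTP's zero-flag condition holds. A minimal sketch of the mask classification, assuming only LLVM's APInt API; signMaskEltBits is an illustrative helper, not code from this patch.

#include "llvm/ADT/APInt.h"
#include <cassert>

// Classify a 64-bit PTEST mask element the way the fold above does: return
// the float element width TESTP should use (64 for a v2f64-style sign mask,
// 32 for a v4f32-style one), or 0 if the constant is not a sign mask.
static unsigned signMaskEltBits(const llvm::APInt &Mask) {
  assert(Mask.getBitWidth() == 64 && "PTEST operates on i64 elements");
  if (Mask.isSignMask()) // 0x8000000000000000
    return 64;
  if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) // 0x80000000 repeated
    return 32;
  return 0;
}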
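narrowBitOpRMW and the combineSetCC bit-test change above both split a bit index Idx into an alignment part (Idx & -32), which selects the i32 block containing the bit, and a modulo part (Idx & 31), the bit's position inside that block. A self-contained demonstration of the identity in plain C++; testBitWide is a hypothetical helper, and uint64_t stands in for a wider-than-legal integer type.

#include <cassert>
#include <cstdint>

// Testing bit Idx of a wide value X equals shifting the aligned i32 block to
// the bottom (Idx & -32), truncating to 32 bits, and testing bit (Idx & 31).
static bool testBitWide(uint64_t X, unsigned Idx) {
  unsigned AlignAmt = Idx & ~31u; // same as Idx & -32
  unsigned ModuloAmt = Idx & 31u; // bit position within the i32 block
  uint32_t Block = static_cast<uint32_t>(X >> AlignAmt);
  return (Block & (1u << ModuloAmt)) != 0;
}

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (unsigned Idx = 0; Idx < 64; ++Idx)
    assert(testBitWide(X, Idx) == (((X >> Idx) & 1) != 0));
  return 0;
}

The same split drives the store side: because AlignAmt is a multiple of 8, narrowBitOpRMW can turn it into a byte offset (AlignAmt >> 3) and rewrite the wide BTC/BTR/BTS as a 32-bit load/op/store at that address.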
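The detectPMADDUBSW rewrite above replaces some thirty lines of manual opcode and operand checks with a single declarative sd_match pattern. The SDPatternMatch combinators mirror the IR-level llvm::PatternMatch API; the sketch below shows the same pattern shape in IR form. matchMulAddOfExts is a hypothetical name, and unlike the commutative DAG matcher this IR form expects the zero-extend on the left-hand side of each multiply, the canonical position the old code established by swapping operands.

#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// One declarative pattern instead of hand-written getOpcode()/getOperand()
// walks: an add of two multiplies, each multiplying a zero-extended value by
// a sign-extended one. On success, A..D bind to the pre-extension values.
static bool matchMulAddOfExts(Value *V, Value *&A, Value *&B, Value *&C,
                              Value *&D) {
  return match(V, m_Add(m_Mul(m_ZExt(m_Value(A)), m_SExt(m_Value(B))),
                        m_Mul(m_ZExt(m_Value(C)), m_SExt(m_Value(D)))));
}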
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 69a5115..522782a 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in { } } -let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { - let mayStore = 1 in - def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; - let mayLoad = 1 in - def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; -} - -multiclass T2RPNTLVW_Base<bits<8> op1, bits<8> op2, string rs, string suffix> { - def Z0#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PS; - def Z0#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PS; - def Z1#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PD; - def Z1#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PD; -} - -let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - -let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; - -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; - let isPseudo = true in { - def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - - def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src), - [(set TILE: $dst, - (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, - TILE:$src))]>; - - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_ttransposed timm:$dst, timm:$src)]>; - } - } -} // HasAMXTILE, HasAMXTRANSPOSE - -let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - 
(ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - let Constraints = "$src4 = $dst" in - def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - let Constraints = "$src4 = $dst" in - def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8,PS; - } - def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD; - - let Constraints = "$src4 = $dst" in { - def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - } - def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3), - [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>; - - let usesCustomInserter = 1 in { - def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - 
[(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>; - } -} - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let isPseudo = true in { - def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - } -} // HasAMXMOVRS, HasAMXTRANSPOSE - multiclass TILELOADDRS_Base<string suffix> { def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; @@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in { } } // SchedRW = [WriteSystem] } // HasAMXTF32 - -let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8, PS; - } - let Constraints = "$src4 = $dst" in { - def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, - TILE:$src4, TILE:$src5, TILE:$src6), - [(set TILE:$dst, - (int_x86_ttmmultf32ps_internal GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6))]>; - } - let usesCustomInserter = 1 in { - def PTTMMULTF32PS : PseudoI<(outs), - (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttmmultf32ps timm:$src1, timm:$src2, - timm:$src3)]>; - } - } // SchedRW = [WriteSystem] -} // HasAMXTF32, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5c23f91..6b2a7a4 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED - case 2048: - assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && - "Unknown 2048-byte regclass"); - assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); - return Load ? 
X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE; } } @@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILESTORED: case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: - case X86::PTILEPAIRLOAD: - case X86::PTILEPAIRSTORE: return true; } } @@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, default: llvm_unreachable("Unexpected special opcode!"); case X86::TILESTORED: - case X86::TILESTORED_EVEX: - case X86::PTILEPAIRSTORE: { + case X86::TILESTORED_EVEX: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, break; } case X86::TILELOADD: - case X86::TILELOADD_EVEX: - case X86::PTILEPAIRLOAD: { + case X86::TILELOADD_EVEX: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 5207eca..6ba07f7 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> { def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> { let ParserMatchClass = VK16PairAsmOperand; } - -let RenderMethod = "addTILEPairOperands" in - def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; } - -def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> { - let ParserMatchClass = TILEPairAsmOperand; -} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05..98104a6f 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,7 +183,6 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; -def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 090060e..3b96e70 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> { static constexpr MachineInstr *const ArgNodeSentinel = nullptr; using GraphT = ImmutableGraph<MachineInstr *, int>; - using Node = typename GraphT::Node; - using Edge = typename GraphT::Edge; - using size_type = typename GraphT::size_type; + using Node = GraphT::Node; + using Edge = GraphT::Edge; + using size_type = GraphT::size_type; MachineGadgetGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges, size_type NodesSize, size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) @@ -191,10 +191,10 @@ template <> struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { using GraphType = MachineGadgetGraph; using Traits = llvm::GraphTraits<GraphType *>; - using NodeRef = typename Traits::NodeRef; - using EdgeRef = typename Traits::EdgeRef; - using ChildIteratorType = 
typename Traits::ChildIteratorType; - using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + using NodeRef = Traits::NodeRef; + using EdgeRef = Traits::EdgeRef; + using ChildIteratorType = Traits::ChildIteratorType; + using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType; DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} @@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { } // end namespace llvm -constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; -constexpr int MachineGadgetGraph::GadgetEdgeSentinel; - char X86LoadValueInjectionLoadHardeningPass::ID = 0; void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( @@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( L.computePhiInfo(); GraphBuilder Builder; - using GraphIter = typename GraphBuilder::BuilderNodeRef; + using GraphIter = GraphBuilder::BuilderNodeRef; DenseMap<MachineInstr *, GraphIter> NodeMap; int FenceCount = 0, GadgetCount = 0; auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 8ffd454..2fc5d38 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value())); } -// Some instructions may return more than one tiles. -// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal -static unsigned getNumDefTiles(IntrinsicInst *II) { - Type *Ty = II->getType(); - if (Ty->isX86_AMXTy()) - return 1; - - unsigned Num = 0; - for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) { - Type *STy = Ty->getContainedType(i); - if (STy->isX86_AMXTy()) - Num++; - } - return Num; -} - static bool isAMXIntrinsic(Value *I) { auto *II = dyn_cast<IntrinsicInst>(I); if (!II) @@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) { return false; // Check if return type or parameter is x86_amx. If it is x86_amx // the intrinsic must be x86 amx intrinsics. - if (getNumDefTiles(II) > 0) + if (II->getType()->isX86_AMXTy()) return true; for (Value *V : II->args()) { if (V->getType()->isX86_AMXTy()) @@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { llvm_unreachable("No terminator in the entry block!"); } -class ShapeCalculator { -private: - const TargetMachine *TM = nullptr; - - // In AMX intrinsics we let Shape = {Row, Col}, but the - // RealCol = Col / ElementSize. We may use the RealCol - // as a new Row for other new created AMX intrinsics. 
- std::map<Value *, Value *> Col2Row, Row2Col; - -public: - ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {} - std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo); - std::pair<Value *, Value *> getShape(PHINode *Phi); - Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); - Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity); -}; - -Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Col2Row.find(V); It != Col2Row.end()) - return It->second; +static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) { IRBuilder<> Builder(II); Value *RealRow = nullptr; if (isa<ConstantInt>(V)) @@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, getFirstNonAllocaInTheEntryBlock(*II->getFunction())); RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity)); } - Col2Row[V] = RealRow; return RealRow; } -Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Row2Col.find(V); It != Row2Col.end()) - return It->second; - IRBuilder<> Builder(II); - Value *RealCol = nullptr; - if (isa<ConstantInt>(V)) - RealCol = - Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity); - else if (isa<Instruction>(V)) { - Builder.SetInsertPoint(cast<Instruction>(V)); - RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity)); - cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V)); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. - IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity)); - } - Row2Col[V] = RealCol; - return RealCol; -} - // TODO: Refine the row and col-in-bytes of tile to row and col of matrix. 
-std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, - unsigned OpNo) { - (void)TM; +std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: @@ -271,13 +204,6 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, } break; } - case Intrinsic::x86_ttransposed_internal: - case Intrinsic::x86_tconjtfp16_internal: { - assert((OpNo == 2) && "Illegal Operand Number."); - Row = getRowFromCol(II, II->getArgOperand(1), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - } case Intrinsic::x86_tcvtrowd2ps_internal: case Intrinsic::x86_tcvtrowps2bf16h_internal: case Intrinsic::x86_tcvtrowps2bf16l_internal: @@ -289,34 +215,12 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, Col = II->getArgOperand(1); break; } - case Intrinsic::x86_ttdpbf16ps_internal: - case Intrinsic::x86_ttdpfp16ps_internal: - case Intrinsic::x86_ttcmmimfp16ps_internal: - case Intrinsic::x86_ttcmmrlfp16ps_internal: - case Intrinsic::x86_tconjtcmmimfp16ps_internal: - case Intrinsic::x86_ttmmultf32ps_internal: { - switch (OpNo) { - case 3: - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - case 4: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - case 5: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = II->getArgOperand(1); - break; - } - break; - } } return std::make_pair(Row, Col); } -std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { +static std::pair<Value *, Value *> getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = U.getOperandNo(); User *V = U.getUser(); @@ -349,15 +253,14 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; - ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map<Value *, Value *> Col2Row, Row2Col; + std::map<Value *, Value *> Col2Row; public: - X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} + X86LowerAMXType(Function &F) : Func(F) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast<IntrinsicInst>(U.getUser()); - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. Value *Stride = Builder.getInt64(64); @@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); @@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = dyn_cast<IntrinsicInst>(TileDef); - unsigned Idx = 0; - // Extract tile from multiple tiles' def. - if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) { - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); - } + auto *II = cast<IntrinsicInst>(TileDef); assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; - unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0); II = cast<IntrinsicInst>(PhiOp); - } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) { - // Extract tile from multiple tiles' def. - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); } else { II = cast<IntrinsicInst>(V); } - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); Instruction *UserI = cast<Instruction>(U.getUser()); IRBuilder<> Builder(UserI); @@ -848,12 +738,10 @@ namespace { class X86LowerAMXCast { Function &Func; - ShapeCalculator *SC; std::unique_ptr<DominatorTree> DT; public: - X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) - : Func(F), SC(ShapeC), DT(nullptr) {} + X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineTilezero(IntrinsicInst *Cast); @@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(OldPN); + std::tie(Row, Col) = getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col)) @@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } -static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx, - bool IsRow) { - if (!isAMXIntrinsic(Inst)) - return nullptr; - - auto *II = cast<IntrinsicInst>(Inst); - if (IsRow) - return II->getOperand(0); - - assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!"); - return II->getOperand(ShapeIdx + 1); -} - // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { if (!Tile->hasOneUse()) return false; - // We don't fetch shape from tilestore, we only get shape from tiledef, - // so we can set the max tile shape to tilestore for special cases. + auto *II = cast<IntrinsicInst>(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); - Value *Row = nullptr; - Value *Col = nullptr; - - if (isAMXIntrinsic(Tile)) { - auto *II = cast<IntrinsicInst>(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Row = II->getOperand(0); - Col = II->getOperand(1); - } else { - // Now we supported multi-tiles value in structure, so we may get tile - // from extracting multi-tiles structure. - // For example: - // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1, - // i16 %2, i16 %3, i8* %4, i64 %5) - // %7 = extractvalue { x86_amx, x86_amx } %6, 0 - // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7) - // store <256 x i32> %8, <256 x i32>* %0, align 1024 - // - // TODO: Currently we only handle extractvalue case, enhance me for other - // cases if possible. - auto *II = cast<ExtractValueInst>(Tile); - assert(II && "We meet unhandle source in fetching tile value!"); - unsigned ShapeIdx = II->getIndices()[0]; - Value *Tiles = II->getOperand(0); - Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true); - Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false); - } - assert(Row && Col && "Shape got failed!"); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { // shape information through def-use chain. if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(LD); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) { if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Cast); Value *NewInst = @@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array<Value *, 4> Args = { Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())}; Value *NewInst = @@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM, return false; bool C = false; - ShapeCalculator SC(TM); - X86LowerAMXCast LAC(F, &SC); + X86LowerAMXCast LAC(F); C |= LAC.combineAMXcast(TLI); // There might be remaining AMXcast after combineAMXcast and they should be // handled elegantly. C |= LAC.transformAllAMXCast(); - X86LowerAMXType LAT(F, &SC); + X86LowerAMXType LAT(F); C |= LAT.visit(); // Prepare for fast register allocation at O0. diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 2a1c499..8a1d00d 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass { if (!MO.isReg() || !MO.getReg().isVirtual()) return false; - unsigned Shapes = 0; - if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) - Shapes = 1; - if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID) - Shapes = 2; - if (!Shapes) + if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID) return false; - collectShapeInfo(MI, Shapes); + collectShapeInfo(MI); return true; } @@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass { } /// Collect the shape def information for later use. - void collectShapeInfo(MachineInstr &MI, unsigned Shapes); + void collectShapeInfo(MachineInstr &MI); /// Try to hoist shapes definded below AMX instructions. bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) { @@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Pre-configure", false, false) -void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { +void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) { MIRef MIR(MI, MBB); auto &Refs = ShapeBBs[MBB]; @@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { Refs.insert(I, MIR); }; - // All shapes have same row in multi-tile operand. - SmallVector<Register, 8> WorkList; - for (unsigned I = 1; I < Shapes + 2; ++I) - WorkList.push_back(MI.getOperand(I).getReg()); + SmallVector<Register, 8> WorkList( + {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()}); while (!WorkList.empty()) { Register R = WorkList.pop_back_val(); MachineInstr *DefMI = MRI->getVRegDef(R); @@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second) continue; - // This happens when column = 0 in multi-tile operand. 
- if (DefMI->getOpcode() == X86::COPY) { - MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg()); - if (MI && MI->isMoveImmediate()) - continue; - } - if (DefMI->isPHI()) { for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2) if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB())) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 76979e3..72f3813 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } - // Reserve low half pair registers in case they are used by RA aggressively. - Reserved.set(X86::TMM0_TMM1); - Reserved.set(X86::TMM2_TMM3); - assert(checkAllSuperRegsMarked(Reserved, {X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::SIH, X86::DIH, X86::BPH, X86::SPH})); @@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const { // and try to return the minimum number of registers supported by the target. static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) && (X86::K6_K7 + 1 == X86::TMMCFG) && - (X86::TMM6_TMM7 + 1 == X86::R16) && + (X86::TMM7 + 1 == X86::R16) && (X86::R31WH + 1 == X86::NUM_TARGET_REGS), "Register number may be incorrect"); @@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, } bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { - return RC->getID() == X86::TILERegClassID || - RC->getID() == X86::TILEPAIRRegClassID; + return RC->getID() == X86::TILERegClassID; } void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { @@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: - case X86::PTTRANSPOSEDV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: - case X86::PTCONJTFP16V: case X86::PTILELOADDRSV: case X86::PTILELOADDRST1V: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, VRM->assignVirt2Shape(VirtReg, Shape); return Shape; } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - MachineOperand &MO1 = MI->getOperand(1); - MachineOperand &MO2 = MI->getOperand(2); - MachineOperand &MO3 = MI->getOperand(3); - ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI); - VRM->assignVirt2Shape(VirtReg, Shape); - return Shape; - } - } -} - -static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) { - unsigned PhysShapeNum = PhysShape.getShapeNum(); - unsigned VirtShapeNum = VirtShape.getShapeNum(); - - if (PhysShapeNum < VirtShapeNum) - return false; - - if (PhysShapeNum == VirtShapeNum) { - if (PhysShapeNum == 1) - return PhysShape == VirtShape; - - for (unsigned I = 0; I < PhysShapeNum; I++) { - ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); - ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I)); - if (VShape != PShape) - return false; - } - return true; - } - - // Hint subreg of mult-tile reg to single tile reg. 
- if (VirtShapeNum == 1) { - for (unsigned I = 0; I < PhysShapeNum; I++) { - ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); - if (VirtShape == PShape) - return true; - } } - - // Note: Currently we have no requirement for case of - // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum) - return false; } bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, @@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, if (!VRM) return BaseImplRetVal; - if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) { + if (ID != X86::TILERegClassID) { if (DisableRegAllocNDDHints || !ST.hasNDD() || !TRI.isGeneralPurposeRegisterClass(&RC)) return BaseImplRetVal; @@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, return; } ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI); - if (canHintShape(PhysShape, VirtShape)) + if (PhysShape == VirtShape) Hints.push_back(PhysReg); }; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 99b7910..692e42a 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -30,8 +30,6 @@ let Namespace = "X86" in { def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; - def sub_t0 : SubRegIndex<8192>; - def sub_t1 : SubRegIndex<8192, 8192>; } //===----------------------------------------------------------------------===// @@ -432,10 +430,6 @@ def TMM4: X86Reg<"tmm4", 4>; def TMM5: X86Reg<"tmm5", 5>; def TMM6: X86Reg<"tmm6", 6>; def TMM7: X86Reg<"tmm7", 7>; -// TMM register pairs -def TPAIRS : RegisterTuples<[sub_t0, sub_t1], - [(add TMM0, TMM2, TMM4, TMM6), - (add TMM1, TMM3, TMM5, TMM7)]>; } // Floating point stack registers. These don't map one-to-one to the FP @@ -862,9 +856,6 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -// Need check alignment 3rd operand size=1024*2*8 -let isAllocatable = 1 in -def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;} //===----------------------------------------------------------------------===// // Register categories. 
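The X86TargetTransformInfo hunks just below change areTypesABICompatible from const ArrayRef<Type *> & to a by-value ArrayRef<Type *>. ArrayRef is a non-owning pointer-plus-length view, and LLVM convention passes it by value; the const reference only added an indirection without saving anything. A minimal sketch of the preferred shape; allNonNull is an illustrative function, not part of the patch.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Type.h"

// ArrayRef is two words (pointer + length): taking it by value copies as
// cheaply as a reference would and drops the extra level of indirection.
static bool allNonNull(llvm::ArrayRef<llvm::Type *> Types) {
  for (llvm::Type *T : Types)
    if (!T)
      return false;
  return true;
}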
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 3d8d0a23..0b1430e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6562,7 +6562,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, bool X86TTIImpl::areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 133b366..de5e1c2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -296,7 +296,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Type) const override; + ArrayRef<Type *> Type) const override; uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override { return ST->getMaxInlineSizeThreshold(); diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 17a44dd..09ef8fb 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) -unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; - } - return 0; -} - -static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM, - Register VirtReg, - SmallVector<ShapeT, 8> &Phys2Shapes) { - unsigned Num = getAMXRegNum(MRI, VirtReg); - MCRegister PhysReg = VRM.getPhys(VirtReg); - if (!PhysReg) - return; - - if (Num == 1) { - unsigned Index = PhysReg - X86::TMM0; - if (!Phys2Shapes[Index].isValid()) { - ShapeT Shape = VRM.getShape(VirtReg); - Phys2Shapes[Index] = std::move(Shape); - return; - } - } - // Split tile pair shape info to 2 single tile shape info. e.g: - // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes. - if (Num == 2) { - unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2; - unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1; - - ShapeT Shape = VRM.getShape(VirtReg); - assert(Shape.getShapeNum() == 2 && "Unexpected shape number!"); - - if (!Phys2Shapes[Index0].isValid()) { - ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI); - Phys2Shapes[Index0] = std::move(Shape0); - } - - if (!Phys2Shapes[Index1].isValid()) { - ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI); - Phys2Shapes[Index1] = std::move(Shape1); - } - } -} - -static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) { - return getAMXRegNum(MRI, Reg) > 0; -} - bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); // Early exit in the common case of non-AMX code. 
@@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { return false; const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const X86RegisterInfo *TRI = ST.getRegisterInfo(); const TargetInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); @@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { assert(ConstMI && "Cannot find an insertion point"); unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs(); - SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT()); + SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VirtReg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(VirtReg)) continue; - if (!isAMXRegClass(&MRI, VirtReg)) + if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg))) + continue; + MCRegister PhysReg = VRM.getPhys(VirtReg); + if (!PhysReg) continue; - collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes); + unsigned Index = PhysReg - X86::TMM0; + if (!Phys2Virt[Index]) + Phys2Virt[Index] = VirtReg; } // Fill in the shape of each tile physical register. for (unsigned I = 0; I < AMXRegNum; ++I) { - ShapeT Shape = Phys2Shapes[I]; - if (!Shape.isValid()) + if (!Phys2Virt[I]) continue; DebugLoc DL; bool IsRow = true; MachineInstr *NewMI = nullptr; + ShapeT Shape = VRM.getShape(Phys2Virt[I]); for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) { // Here is the data format for the tile config. // 0 palette @@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { "Cannot initialize with different shapes"); continue; } - if (DefMI.getOperand(1).isImm()) { - Imm = DefMI.getOperand(1).getImm(); - } else { - assert(DefMI.getOpcode() == X86::MOV32r0 && - "The opcode is assumed to be MOV32r0 if the operand is not " - "immediate."); - Imm = 0; - } + Imm = DefMI.getOperand(1).getImm(); NewMI = addFrameReference( BuildMI(MF.front(), ++ConstMI->getIterator(), DL, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 0849fc7..c164762 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -2192,7 +2192,6 @@ StringMap<bool> sys::getHostCPUFeatures() { bool HasLeaf1E = MaxLevel >= 0x1e && !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; - Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp index d510445..f74d670 100644 --- a/llvm/lib/TargetParser/PPCTargetParser.cpp +++ b/llvm/lib/TargetParser/PPCTargetParser.cpp @@ -48,9 +48,9 @@ StringRef normalizeCPUName(StringRef CPUName) { // accepting it. Clang has always ignored it and passed the // generic CPU ID to the back end. 
return StringSwitch<StringRef>(CPUName) - .Cases("common", "405", "generic") - .Cases("ppc440", "440fp", "440") - .Cases("630", "power3", "pwr3") + .Cases({"common", "405"}, "generic") + .Cases({"ppc440", "440fp"}, "440") + .Cases({"630", "power3"}, "pwr3") .Case("G3", "g3") .Case("G4", "g4") .Case("G4+", "g4+") @@ -69,7 +69,7 @@ StringRef normalizeCPUName(StringRef CPUName) { .Case("power9", "pwr9") .Case("power10", "pwr10") .Case("power11", "pwr11") - .Cases("powerpc", "powerpc32", "ppc") + .Cases({"powerpc", "powerpc32"}, "ppc") .Case("powerpc64", "ppc64") .Case("powerpc64le", "ppc64le") .Default(CPUName); diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index d765d9c..d735923 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -208,7 +208,7 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) { return Ret; } -static std::string computePowerDataLayout(const Triple &T) { +static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) { bool is64Bit = T.isPPC64(); std::string Ret; @@ -228,7 +228,8 @@ static std::string computePowerDataLayout(const Triple &T) { // If the target ABI uses function descriptors, then the alignment of function // pointers depends on the alignment used to emit the descriptor. Otherwise, // function pointers are aligned to 32 bits because the instructions must be. - if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) { + if ((T.getArch() == Triple::ppc64 && + (!T.isPPC64ELFv2ABI() && ABIName != "elfv2"))) { Ret += "-Fi64"; } else if (T.isOSAIX()) { Ret += is64Bit ? "-Fi64" : "-Fi32"; @@ -573,7 +574,7 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { case Triple::ppcle: case Triple::ppc64: case Triple::ppc64le: - return computePowerDataLayout(*this); + return computePowerDataLayout(*this, ABIName); case Triple::r600: case Triple::amdgcn: return computeAMDDataLayout(*this); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index b13c795..37e8ad9 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -143,7 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids = FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS | - FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE; + FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32; // Intel Atom processors. // Bonnell has feature parity with Core2 and adds MOVBE. 
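For reference, the PPC hunk above migrates StringSwitch from the variadic Cases("a", "b", result) form to the overload taking an initializer list of literals. A usage sketch assuming the updated llvm/ADT/StringSwitch.h; normalizeExample is an illustrative helper whose cases are taken from the hunk:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// All strings in a braced list map to the same normalized name, which keeps
// each alias group visually distinct from its result value.
llvm::StringRef normalizeExample(llvm::StringRef Name) {
  return llvm::StringSwitch<llvm::StringRef>(Name)
      .Cases({"common", "405"}, "generic")
      .Cases({"powerpc", "powerpc32"}, "ppc")
      .Case("G3", "g3")
      .Default(Name);
}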
@@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; -constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = FeatureAMX_TILE | FeatureAVX10_2; diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp index cda07e8..f55bc9c 100644 --- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp +++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp @@ -32,7 +32,7 @@ using namespace llvm::MachO; using namespace llvm::MachO::DylibReader; using TripleVec = std::vector<Triple>; -static typename TripleVec::iterator emplace(TripleVec &Container, Triple &&T) { +static TripleVec::iterator emplace(TripleVec &Container, Triple &&T) { auto I = partition_point(Container, [=](const Triple &CT) { return std::forward_as_tuple(CT.getArch(), CT.getOS(), CT.getEnvironment()) < diff --git a/llvm/lib/TextAPI/RecordVisitor.cpp b/llvm/lib/TextAPI/RecordVisitor.cpp index d333b33..24971a7 100644 --- a/llvm/lib/TextAPI/RecordVisitor.cpp +++ b/llvm/lib/TextAPI/RecordVisitor.cpp @@ -15,7 +15,7 @@ using namespace llvm; using namespace llvm::MachO; -RecordVisitor::~RecordVisitor() {} +RecordVisitor::~RecordVisitor() = default; void RecordVisitor::visitObjCInterface(const ObjCInterfaceRecord &) {} void RecordVisitor::visitObjCCategory(const ObjCCategoryRecord &) {} diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index e05fe28..1e549f1 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -77,7 +77,7 @@ public: : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind), Builder(OrigF.getContext()), TTI(TTI) {} - virtual ~BaseCloner() {} + virtual ~BaseCloner() = default; /// Create a clone for a continuation lowering. 
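The `= default` destructor cleanups in this commit (RecordVisitor above, BaseCloner here) are not purely cosmetic for non-polymorphic types: a user-provided empty body `{}` disables trivial destructibility, while `= default` preserves it. For virtual destructors such as BaseCloner's the change is idiomatic only. A small self-contained check:

#include <type_traits>

struct Implicit { int X; };                          // implicitly trivial
struct Defaulted { int X; ~Defaulted() = default; }; // still trivial
struct UserProvided { int X; ~UserProvided() {} };   // no longer trivial

static_assert(std::is_trivially_destructible_v<Implicit>);
static_assert(std::is_trivially_destructible_v<Defaulted>);
static_assert(!std::is_trivially_destructible_v<UserProvided>);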
static Function *createClone(Function &OrigF, const Twine &Suffix, diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..5ed47ae 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final return true; RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); @@ -10701,7 +10701,7 @@ struct AAInterFnReachabilityFunction auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this); RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..d35ae47 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1034,11 +1034,11 @@ private: } // namespace template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..2a87a0f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3ddf182..cbaff29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I, return nullptr; } +/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X) +/// select(X <s 0, -X, 0) | smax(X, 0) --> abs(X) +static Value 
*FoldOrOfSelectSmaxToAbs(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X; + Value *Sel; + if (match(&I, + m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) { + auto NegX = m_Neg(m_Specific(X)); + if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X), + m_ZeroInt()), + m_ZeroInt(), NegX)) || + match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X), + m_ZeroInt()), + NegX, m_ZeroInt()))) + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X, + Builder.getFalse()); + } + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da..9572f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7795cce..b5548d4 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -69,14 +69,6 @@ namespace llvm { // Command line option to enable vtable value profiling. Defined in // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling= extern cl::opt<bool> EnableVTableValueProfiling; -// TODO: Remove -debug-info-correlate in next LLVM release, in favor of -// -profile-correlate=debug-info. -cl::opt<bool> DebugInfoCorrelate( - "debug-info-correlate", - cl::desc("Use debug info to correlate profiles. 
(Deprecated, use " - "-profile-correlate=debug-info)"), - cl::init(false)); - LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate( "profile-correlate", cl::desc("Use debug info or binary file to correlate profiles."), @@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { // in lightweight mode. We need to move the value profile pointer to the // Counter struct to get this working. assert( - !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE && + ProfileCorrelate == InstrProfCorrelator::NONE && "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); @@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) { } void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) { - assert(!DebugInfoCorrelate && + assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO && "Value profiling is not supported with lightweight instrumentation"); if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) return; @@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, // Use internal rather than private linkage so the counter variable shows up // in the symbol table when using debug info for correlation. - if ((DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) && + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO && TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage) Linkage = GlobalValue::InternalLinkage; @@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts); PD.RegionCounters = CounterPtr; - if (DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { LLVMContext &Ctx = M.getContext(); Function *Fn = Inc->getParent()->getParent(); if (auto *SP = Fn->getSubprogram()) { @@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // When debug information is correlated to profile data, a data variable // is not needed. 
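Returning to the InstCombine hunk earlier in this diff: FoldOrOfSelectSmaxToAbs rests on a bitwise identity that is easy to validate exhaustively at a narrow width. A standalone check over i8, using wrapping negation to match abs with its int-min-is-poison flag set to false (the Builder.getFalse() argument):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = static_cast<int8_t>(V);
    // Two's-complement wrapping negation, so -(-128) stays -128.
    int8_t NegX = static_cast<int8_t>(0 - static_cast<uint8_t>(X));
    int8_t Smax = X > 0 ? X : 0;   // smax(X, 0)
    int8_t Sel = X > 0 ? 0 : NegX; // select(X >s 0, 0, -X)
    int8_t Abs = X < 0 ? NegX : X; // abs(X), with INT_MIN wrapping
    assert(static_cast<int8_t>(Sel | Smax) == Abs);
  }
  return 0;
}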
- if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) return; GlobalVariable *NamePtr = Inc->getName(); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index 2f256df..b72d41a 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector<uint64_t> StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector<ContextTotalSize> ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set<const AllocationInfo *> &AllocInfoSet, std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. + DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. + auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. 
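A minimal sketch of the deduplication loop above, with hypothetical types (AllocSummary, insertOrKeepLarger) standing in for the profile structures: entries sharing a full stack id collapse to one, preferring the larger total size and, on a size tie, a context that is not cold:

#include <cstdint>
#include <unordered_map>

struct AllocSummary {
  uint64_t TotalSize;
  bool IsNotCold;
};

void insertOrKeepLarger(std::unordered_map<uint64_t, AllocSummary> &Map,
                        uint64_t FullStackId, const AllocSummary &New) {
  auto [It, Inserted] = Map.try_emplace(FullStackId, New);
  if (Inserted)
    return;
  // Mirror the hunk's tie-breaking: keep the current entry when it is
  // larger, or when sizes match and the new context is not NotCold.
  const AllocSummary &Cur = It->second;
  if (Cur.TotalSize > New.TotalSize ||
      (Cur.TotalSize == New.TotalSize && !New.IsNotCold))
    return;
  It->second = New;
}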
if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e09..a2fad02 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 71736cf..af53fa0 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M, ProfileVersion |= VARIANT_MASK_INSTR_ENTRY; if (PGOInstrumentLoopEntries) ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES; - if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) ProfileVersion |= VARIANT_MASK_DBG_CORRELATE; if (PGOFunctionEntryCoverage) ProfileVersion |= diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 78d4a57e..87eba5f 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -58,6 +58,18 @@ static cl::opt<bool> cl::desc("Writes always set the type"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClOutlineInstrumentation( + "tysan-outline-instrumentation", + cl::desc("Uses function calls for all TySan instrumentation, reducing " + "ELF size"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClVerifyOutlinedInstrumentation( + "tysan-verify-outlined-instrumentation", + cl::desc("Check types twice with both inlined instrumentation and " + "function calls. This verifies that they behave the same."), + cl::Hidden, cl::init(false)); + STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); namespace { @@ -105,12 +117,16 @@ private: Regex AnonNameRegex; Type *IntptrTy; uint64_t PtrShift; - IntegerType *OrdTy; + IntegerType *OrdTy, *U64Ty; /// Callbacks to run-time library are computed in initializeCallbacks. FunctionCallee TysanCheck; FunctionCallee TysanCtorFunction; + FunctionCallee TysanIntrumentMemInst; + FunctionCallee TysanInstrumentWithShadowUpdate; + FunctionCallee TysanSetShadowType; + /// Callback to set types for globals.
Function *TysanGlobalsSetTypeFunction; }; @@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M) void TypeSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); OrdTy = IRB.getInt32Ty(); + U64Ty = IRB.getInt64Ty(); + Type *BoolType = IRB.getInt1Ty(); AttributeList Attr; Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); @@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) { TysanCtorFunction = M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); + + TysanIntrumentMemInst = M.getOrInsertFunction( + "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer of data to write + U64Ty, // Size of the data in bytes + BoolType // Do we need to call memmove + ); + + TysanInstrumentWithShadowUpdate = M.getOrInsertFunction( + "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read + IRB.getPtrTy(), // Pointer to type descriptor + BoolType, // Do we need to type check this + U64Ty, // Size of data we access in bytes + OrdTy // Flags + ); + + TysanSetShadowType = M.getOrInsertFunction( + "__tysan_set_shadow_type", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer to the new type descriptor + U64Ty // Size of data we access in bytes + ); } void TypeSanitizer::instrumentGlobals(Module &M) { @@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + if (ClOutlineInstrumentation) { + if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) { + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked. If the type + // doesn't match, then we call the runtime type check (which may yet + // determine that the mismatch is okay). + + Constant *Flags = + ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1)); + + IRB.CreateCall(TysanInstrumentWithShadowUpdate, + {Ptr, TD, + SanitizeFunction ? IRB.getTrue() : IRB.getFalse(), + IRB.getInt64(AccessSize), Flags}); + } else if (ForceSetType || IsWrite) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. + IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)}); + } + + return true; + } + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); @@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } - if (!ShadowBase) - ShadowBase = getShadowBase(*F); - if (!AppMemMask) - AppMemMask = getAppMemMask(*F); + if (ClOutlineInstrumentation) { + if (!Src) + Src = ConstantPointerNull::get(IRB.getPtrTy()); - Value *ShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); - - if (!Src) { - IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), - Align(1ull << PtrShift)); + IRB.CreateCall( + TysanIntrumentMemInst, + {Dest, Src, Size, NeedsMemMove ? 
IRB.getTrue() : IRB.getFalse()}); return true; - } - - Value *SrcShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); - - if (NeedsMemMove) { - IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); } else { - IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), + IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } } return true; @@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M, for (Function &F : M) { const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); TySan.sanitizeFunction(F, TLI); + if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) { + // Outlined instrumentation is a new option, and so this exists to + // verify there is no difference in behaviour between the options. + // If the outlined instrumentation triggers a verification failure + // when the original inlined instrumentation does not, or vice versa, + // then there is a discrepancy which should be investigated. + ClOutlineInstrumentation = false; + TySan.sanitizeFunction(F, TLI); + ClOutlineInstrumentation = true; + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d5..a577f51 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics.
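The TypeSanitizer hunks above call three new runtime entry points. Their C-level shapes can be read off the IR types handed to getOrInsertFunction (ptr, ptr, i64, i1, i32); the declarations below are inferred from this diff rather than copied from the tysan runtime, so treat them as a sketch:

#include <cstdint>

extern "C" {
// Shadow handling for memset/memcpy/memmove (TysanIntrumentMemInst above).
void __tysan_instrument_mem_inst(void *Dest, void *Src, uint64_t Size,
                                 bool NeedsMemMove);
// Check and/or set the type of an access; Flags packs the IsRead/IsWrite
// bits (TysanInstrumentWithShadowUpdate above).
void __tysan_instrument_with_shadow_update(void *Ptr, void *TypeDesc,
                                           bool SanitizeFunction,
                                           uint64_t AccessSize, int32_t Flags);
// Unconditionally set the shadow type (TysanSetShadowType above).
void __tysan_set_shadow_type(void *Ptr, void *TypeDesc, uint64_t AccessSize);
}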
- if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f832..d564e32 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ public: class GVNSink { public: - GVNSink() {} + GVNSink() = default; bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7ebcc21..4ba4ba3 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -162,8 +162,6 @@ class IndVarSimplify { const SCEV *ExitCount, PHINode *IndVar, SCEVExpander &Rewriter); - bool sinkUnusedInvariants(Loop *L); - public: IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, const DataLayout &DL, TargetLibraryInfo *TLI, @@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, return true; } -//===----------------------------------------------------------------------===// -// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. -//===----------------------------------------------------------------------===// - -/// If there's a single exit block, sink any loop-invariant values that -/// were defined in the preheader but not used inside the loop into the -/// exit block to reduce register pressure in the loop. -bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { - BasicBlock *ExitBlock = L->getExitBlock(); - if (!ExitBlock) return false; - - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) return false; - - bool MadeAnyChanges = false; - for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { - - // Skip BB Terminator. - if (Preheader->getTerminator() == &I) - continue; - - // New instructions were inserted at the end of the preheader. - if (isa<PHINode>(I)) - break; - - // Don't move instructions which might have side effects, since the side - // effects need to complete before instructions inside the loop. Also don't - // move instructions which might read memory, since the loop may modify - // memory. Note that it's okay if the instruction might have undefined - // behavior: LoopSimplify guarantees that the preheader dominates the exit - // block. - if (I.mayHaveSideEffects() || I.mayReadFromMemory()) - continue; - - // Skip debug or pseudo instructions. - if (I.isDebugOrPseudoInst()) - continue; - - // Skip eh pad instructions. - if (I.isEHPad()) - continue; - - // Don't sink alloca: we never want to sink static alloca's out of the - // entry block, and correctly sinking dynamic alloca's requires - // checks for stacksave/stackrestore intrinsics. - // FIXME: Refactor this check somehow? - if (isa<AllocaInst>(&I)) - continue; - - // Determine if there is a use in or before the loop (direct or - // otherwise). 
- bool UsedInLoop = false; - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - BasicBlock *UseBB = User->getParent(); - if (PHINode *P = dyn_cast<PHINode>(User)) { - unsigned i = - PHINode::getIncomingValueNumForOperand(U.getOperandNo()); - UseBB = P->getIncomingBlock(i); - } - if (UseBB == Preheader || L->contains(UseBB)) { - UsedInLoop = true; - break; - } - } - - // If there is, the def must remain in the preheader. - if (UsedInLoop) - continue; - - // Otherwise, sink it to the exit block. - I.moveBefore(ExitBlock->getFirstInsertionPt()); - SE->forgetValue(&I); - MadeAnyChanges = true; - } - - return MadeAnyChanges; -} - static void replaceExitCond(BranchInst *BI, Value *NewCond, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { auto *OldCond = BI->getCondition(); @@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) { // The Rewriter may not be used from this point on. - // Loop-invariant instructions in the preheader that aren't used in the - // loop may be sunk below the loop to reduce register pressure. - Changed |= sinkUnusedInvariants(L); - // rewriteFirstIterationLoopExitValues does not rely on the computation of // trip count and therefore can further simplify exit values in addition to // rewriteLoopExitValues. diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index b2c526b..d13b990 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock( static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); -static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, - ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, ScalarEvolution *SE); +static void moveInstructionBefore( + Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator); + +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE); static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref<void(Instruction *)> Fn); @@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE); + + // sink pre-header defs that are unused in-loop into the unique exit to reduce + // pressure. 
+ Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU, + SE, DT, Flags, ORE); + Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, @@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, - ScalarEvolution *SE) { + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest->getParent()); I.moveBefore(*Dest->getParent(), Dest); if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU.getMemorySSA()->getMemoryAccess(&I))) - MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), - MemorySSA::BeforeTerminator); + MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point); if (SE) SE->forgetBlockAndLoopDispositions(&I); } +// If there's a single exit block, sink any loop-invariant values that were +// defined in the preheader but not used inside the loop into the exit block +// to reduce register pressure in the loop. +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) { + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) + return false; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + bool MadeAnyChanges = false; + + for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + + // Skip terminator. + if (Preheader->getTerminator() == &I) + continue; + + // New instructions were inserted at the end of the preheader. + if (isa<PHINode>(I)) + break; + + // Don't move instructions which might have side effects, since the side + // effects need to complete before instructions inside the loop. Note that + // it's okay if the instruction might have undefined behavior: LoopSimplify + // guarantees that the preheader dominates the exit block. + if (I.mayHaveSideEffects()) + continue; + + if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr)) + continue; + + // Determine if there is a use in or before the loop (direct or + // otherwise). + bool UsedInLoopOrPreheader = false; + for (Use &U : I.uses()) { + auto *UserI = cast<Instruction>(U.getUser()); + BasicBlock *UseBB = UserI->getParent(); + if (auto *PN = dyn_cast<PHINode>(UserI)) { + UseBB = PN->getIncomingBlock(U); + } + if (UseBB == Preheader || L->contains(UseBB)) { + UsedInLoopOrPreheader = true; + break; + } + } + if (UsedInLoopOrPreheader) + continue; + + moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo, + MSSAU, SE, MemorySSA::Beginning); + MadeAnyChanges = true; + } + + return MadeAnyChanges; +} + static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies, diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1a279b6..001215a 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1318,6 +1318,11 @@ public: /// the loop, in which case some special-case heuristics may be used. 
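The sinkUnusedInvariantsFromPreheaderToExit helper above (the cleanup formerly performed by IndVarSimplify, now run in LICM's sinking phase) targets source patterns like the following, where a preheader computation is consumed only after the loop; moving it to the exit block shortens its live range across the body. A conceptual before/after in plain C++ (before and after are illustrative functions, not the pass's output):

// Before: T is live across the whole loop although the body never uses it.
int before(const int *A, int N, int X, int Y) {
  int T = X * Y; // loop-invariant, defined in the "preheader"
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[I];
  return Sum + T; // only use is after the loop
}

// After (conceptually): the multiply sits in the exit block, freeing a
// register inside the loop.
int after(const int *A, int N, int X, int Y) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[I];
  int T = X * Y;
  return Sum + T;
}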
bool AllFixupsOutsideLoop = true; + /// This records whether all of the fixups using this LSRUse are unconditional + /// within the loop, meaning they will be executed on every path to the loop + /// latch. This includes fixups before early exits. + bool AllFixupsUnconditional = true; + /// RigidFormula is set to true to guarantee that this use will be associated /// with a single formula--the one that initially matched. Some SCEV /// expressions cannot be expanded. This allows LSR to consider the registers @@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) || TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) { const SCEV *Start; - const SCEVConstant *Step; - if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) + const APInt *Step; + if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) { // If the step size matches the base offset, we could use pre-indexed // addressing. - if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() && - Step->getAPInt() == F.BaseOffset.getFixedValue()) || - ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) && - SE->isLoopInvariant(Start, L))) + bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && + F.BaseOffset.isFixed() && + *Step == F.BaseOffset.getFixedValue(); + bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && + !isa<SCEVConstant>(Start) && + SE->isLoopInvariant(Start, L); + // We can only pre or post index when the load/store is unconditional. + if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional) LoopCost = 0; + } } + // If the loop counts down to zero and we'll be using a hardware loop then // the addrec will be combined into the hardware loop instruction. if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() && @@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const { if (AllFixupsOutsideLoop) OS << ", all-fixups-outside-loop"; + if (AllFixupsUnconditional) + OS << ", all-fixups-unconditional"; + if (WidestFixupType) OS << ", widest fixup type: " << *WidestFixupType; } @@ -2213,6 +2227,7 @@ class LSRInstance { void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void CountRegisters(const Formula &F, size_t LUIdx); bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); + bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const; void CollectLoopInvariantFixupsAndFormulae(); @@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = TmpPostIncLoops; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); // Create SCEV as Formula for calculating baseline cost if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) { @@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } +/// Test whether this fixup will be executed each time the corresponding IV +/// increment instruction is executed. +bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const { + // If the fixup block dominates the IV increment block then there is no path + // through the loop to the increment that doesn't pass through the fixup. + return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent()); +} + /// Check for other uses of loop-invariant values which we're tracking. 
These /// other uses will pin these values in registers, making them less profitable /// for elimination. @@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.OperandValToReplace = U; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); if (!LU.WidestFixupType || SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional; // Transfer the fixups of LU to LUThatHas. for (LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e81..7e70ba2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) { } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return true; + if (isa<SelectInst>(I)) + return true; + if (I->isBinaryOp()) return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) { } } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { + assert(isShapePreserving(I) && + "Can't retrieve shaped operands for an instruction that does not " + "preserve shape information"); + auto Ops = I->operands(); + return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} + /// Return the ShapeInfo for the result of \p I, if it can be determined. static std::optional<ShapeInfo> computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I, return OpShape->second; } - if (isUniformShape(I) || isa<SelectInst>(I)) { - auto Ops = I->operands(); - auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; + if (isShapePreserving(I)) { + auto ShapedOps = getShapedOperandsForInst(I); // Find the first operand that has a known shape and use that. for (auto &Op : ShapedOps) { auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ public: case Intrinsic::matrix_column_major_store: return true; default: - return isUniformShape(II); + break; } - return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || - isa<SelectInst>(V); + return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V); } /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ public: } else if (isa<StoreInst>(V)) { // Nothing to do. We forward-propagated to this so we would just // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V) || isa<SelectInst>(V)) { - auto Ops = cast<Instruction>(V)->operands(); - auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops; + } else if (isShapePreserving(V)) { + auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V)); // Propagate to all operands.
ShapeInfo Shape = ShapeMap[V]; for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e043d07..08be5df 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, bool SrcNotDom = false; auto CaptureTrackingWithModRef = - [&](Instruction *AI, - function_ref<bool(Instruction *)> ModRefCallback) -> bool { + [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback, + bool &AddressCaptured) -> bool { SmallVector<Instruction *, 8> Worklist; Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); @@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, if (!Visited.insert(&U).second) continue; UseCaptureInfo CI = DetermineUseCaptureKind(U, AI); - if (capturesAnything(CI.UseCC)) + if (capturesAnyProvenance(CI.UseCC)) return false; + AddressCaptured |= capturesAddress(CI.UseCC); if (UI->mayReadOrWriteMemory()) { if (UI->isLifetimeStartOrEnd()) { @@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) + bool DestAddressCaptured = false; + if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback, + DestAddressCaptured)) return false; // Bailout if Dest may have any ModRef before Store. if (!ReachabilityWorklist.empty() && @@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) + bool SrcAddressCaptured = false; + if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback, + SrcAddressCaptured)) + return false; + + // If both the source and destination address are captured, the fact that they + // are no longer two separate allocations may be observed. + if (DestAddressCaptured && SrcAddressCaptured) return false; // We can do the transformation. 
First, move the SrcAlloca to the start of the diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 5af6c96..bb6c879 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -81,6 +81,7 @@ STATISTIC( STATISTIC(NumInvariantConditionsInjected, "Number of invariant conditions injected and unswitched"); +namespace llvm { static cl::opt<bool> EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, cl::desc("Forcibly enables non-trivial loop unswitching rather than " @@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions( static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", - cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " - "unswitch on them to eliminate branches that are " - "not-taken 1/<this option> times or less."), + cl::Hidden, + cl::desc("Only try to inject loop invariant conditions and " + "unswitch on them to eliminate branches that are " + "not-taken 1/<this option> times or less."), cl::init(16)); +static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile", + cl::Hidden, cl::init(true)); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // namespace llvm + AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key; namespace { struct CompareDesc { @@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L, llvm_unreachable("Basic blocks should never be empty!"); } -/// Copy a set of loop invariant values \p ToDuplicate and insert them at the +/// Copy a set of loop invariant values \p Invariants and insert them at the /// end of \p BB and conditionally branch on the copied condition. We only /// branch on a single value. +/// We attempt to estimate the profile of the resulting conditional branch from +/// \p ComputeProfFrom, which is the original conditional branch we're +/// unswitching. +/// When \p Direction is true, the \p Invariants form a disjunction, and the +/// branch conditioned on it exits the loop on the "true" case. When \p +/// Direction is false, the \p Invariants form a conjunction and the branch +/// exits on the "false" case. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, - const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { + const Instruction *I, AssumptionCache *AC, const DominatorTree &DT, + const BranchInst &ComputeProfFrom) { + + SmallVector<uint32_t> BranchWeights; + bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes && + extractBranchWeights(ComputeProfFrom, BranchWeights); + // If Direction is true, that means we had a disjunction and that the "true" + // case exits. The probability of the disjunction of the subset of terms is at + // most as high as the original one. So, if the probability is higher than the + // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5, + // but if it's lower, we will use the original probability. + // Conversely, if Direction is false, that means we had a conjunction, and the + // probability of exiting is captured in the second branch weight. That + // probability is a disjunction (of the negation of the original terms). The + // same reasoning applies as above. 
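In symbols, for the true-direction case: if the original branch exits with probability p = w_exit / (w_exit + w_stay), then for any subset of the disjuncts, Pr[c_i1 or ... or c_ik] <= Pr[c_1 or ... or c_n] = p, so the original weights give a sound upper bound on the unswitched branch's exit probability. The code therefore reuses them only when p <= 1/2, the estimate it would otherwise assume, and falls back to explicitly-unknown weights when p exceeds that. The conjunction case is symmetric with the second weight.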
+ // Issue #165649: should we expect BFI to conserve, and use that to calculate + // the branch weights? + if (HasBranchWeights && + static_cast<double>(BranchWeights[Direction ? 0 : 1]) / + static_cast<double>(sum_of(BranchWeights)) > + 0.5) + HasBranchWeights = false; + IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); @@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch( Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) : IRB.CreateAnd(FrozenInvariants); - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + auto *BR = IRB.CreateCondBr( + Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, + HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) + : nullptr); + if (!HasBranchWeights) + setExplicitlyUnknownBranchWeightsIfProfiled( + *BR, *BR->getParent()->getParent(), DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, " condition!"); buildPartialUnswitchConditionalBranch( *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, - FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI); } // Update the dominator tree with the added edge. @@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants( else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, - FreezeLoopUnswitchCond, BI, &AC, DT); + FreezeLoopUnswitchCond, BI, &AC, DT, *BI); } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f..5f6f66a 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, BaseSuccIterator, - typename std::iterator_traits<BaseSuccIterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { SmallDenseSet<RegionNode *> *Nodes; public: diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 9829d4d..11db0ec 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); } +/// Helper function to update the cycle or loop information after inserting a +/// new block between a callbr instruction and one of its target blocks. Adds +/// the new block to the innermost cycle or loop that the callbr instruction and +/// the original target block share. 
+/// \p LCI cycle or loop information to update +/// \p CallBrBlock block containing the callbr instruction +/// \p CallBrTarget new target block of the callbr instruction +/// \p Succ original target block of the callbr instruction +template <typename TI, typename T> +static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock, + BasicBlock *CallBrTarget, BasicBlock *Succ) { + static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>, + "type must be CycleInfo or LoopInfo"); + if (!LCI) + return false; + + T *LC; + if constexpr (std::is_same_v<TI, CycleInfo>) + LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ); + else + LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ); + if (!LC) + return false; + + if constexpr (std::is_same_v<TI, CycleInfo>) + LCI->addBlockToCycle(CallBrTarget, LC); + else + LC->addBasicBlockToLoop(CallBrTarget, *LCI); + + return true; +} + +BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, + unsigned SuccIdx, DomTreeUpdater *DTU, + CycleInfo *CI, LoopInfo *LI, + bool *UpdatedLI) { + CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator()); + assert(CallBr && "expected callbr terminator"); + assert(SuccIdx < CallBr->getNumSuccessors() && + Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index"); + + // Create a new block between callbr and the specified successor. + // splitBlockBefore cannot be re-used here since it cannot split if the split + // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot + // handle that). But we don't need to rewire every part of a potential PHI + // node. We only care about the edge between CallBrBlock and the original + // successor. + BasicBlock *CallBrTarget = + BasicBlock::Create(CallBrBlock->getContext(), + CallBrBlock->getName() + ".target." + Succ->getName(), + CallBrBlock->getParent()); + // Rewire control flow from the new target block to the original successor. + Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget); + // Rewire control flow from callbr to the new target block. + CallBr->setSuccessor(SuccIdx, CallBrTarget); + // Jump from the new target block to the original successor. 
+ BranchInst::Create(Succ, CallBrTarget); + + bool Updated = + updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ); + if (UpdatedLI) + *UpdatedLI = Updated; + updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ); + if (DTU) { + DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}}); + if (DTU->getDomTree().dominates(CallBrBlock, Succ)) + DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ}, + {DominatorTree::Insert, CallBrTarget, Succ}}); + } + + return CallBrTarget; +} + void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { if (auto *II = dyn_cast<InvokeInst>(TI)) II->setUnwindDest(Succ); diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 0046a00..287a177 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ControlFlowUtils.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" @@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize( for (auto [BB, Succ0, Succ1] : Branches) { #ifndef NDEBUG - assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); + assert( + (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) && + "Duplicate entry for incoming block."); #endif if (Succ0) Outgoing.insert(Succ0); diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 45e1d12..804af22 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -79,6 +79,53 @@ // Limitation: The pass cannot handle switch statements and indirect // branches. Both must be lowered to plain branches first. // +// CallBr support: CallBr is handled as a more general branch instruction which +// can have multiple successors. The pass redirects the edges to intermediate +// target blocks that unconditionally branch to the original callbr target +// blocks. This allows the control flow hub to know to which of the original +// target blocks to jump to. +// Example input CFG: +// Entry (callbr) +// / \ +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// becomes: +// Entry (callbr) +// / \ +// v v +// target.H target.B +// | | +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// Note +// OUTPUT CFG: Converted to a natural loop with a new header N. +// +// Entry (callbr) +// / \ +// v v +// target.H target.B +// \ / +// \ / +// v v +// N <---. +// / \ \ +// / \ | +// v v / +// H --> B --' +// | +// v +// Exit +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FixIrreducible.h" @@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, return false; LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); ControlFlowHub CHub; SetVector<BasicBlock *> Predecessors; @@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - // Exactly one of the two successors is the header. - BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? 
Header : nullptr; - BasicBlock *Succ1 = Succ0 ? nullptr : Header; - if (!Succ0) - assert(Branch->getSuccessor(1) == Header); - assert(Succ0 || Succ1); - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) { + // Exactly one of the two successors is the header. + BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; + BasicBlock *Succ1 = Succ0 ? nullptr : Header; + assert(Succ0 || Branch->getSuccessor(1) == Header); + assert(Succ0 || Succ1); + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (Succ != Header) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added internal branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect external incoming edges. This includes the edges on the header. @@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = C.contains(Succ0) ? Succ0 : nullptr; - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = C.contains(Succ0) ? Succ0 : nullptr; + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? 
" " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (!C.contains(Succ)) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added external branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect all the backedges through a "hub" consisting of a series @@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, SetVector<BasicBlock *> Entries; Entries.insert(C.entry_rbegin(), C.entry_rend()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); CHub.finalize(&DTU, GuardBlocks, "irr"); #if defined(EXPENSIVE_CHECKS) assert(DT.verify(DominatorTree::VerificationLevel::Full)); @@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT, LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: " << F.getName() << "\n"); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - bool Changed = false; for (Cycle *TopCycle : CI.toplevel_cycles()) { for (Cycle *C : depth_first(TopCycle)) { diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736a..94dfd3a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional<unsigned> OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. 
- setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + } else { + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Handle prologue loops. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 6312831..1e8f6cc 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cmath> using namespace llvm; @@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? 
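+/// Concretely, with p = \p OriginalLoopProb and m the number of further
+/// iterations that would execute, the code below computes
+///   P(m >= 1 | m <= N) = (p - p^(N+1)) / (1 - p^(N+1));
+/// e.g., p = 1/2 and N = 3 give (1/2 - 1/16) / (1 - 1/16) = 7/15.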
+static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. /// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. -static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? 
"epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. + double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -460,25 +508,13 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); - MDNode *LoopID = NewLoop->getLoopID(); - // Only add loop metadata if the loop is not going to be completely - // unrolled. - if (UnrollRemainder) - return NewLoop; - - std::optional<MDNode *> NewLoopID = makeFollowupLoopID( - LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (NewLoopID) { - NewLoop->setLoopID(*NewLoopID); - - // Do not setLoopAlreadyUnrolled if loop attributes have been defined - // explicitly. - return NewLoop; - } + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); // Add unroll disable metadata to disable future unrolling for this loop. - NewLoop->setLoopAlreadyUnrolled(); + if (!UnrollRemainder) + NewLoop->setLoopAlreadyUnrolled(); return NewLoop; } @@ -603,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? 
dbgs() << "Using epilog remainder.\n" @@ -823,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba822..6e60b94 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,54 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. 
- LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b03fb62..cbc604e 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -80,6 +80,7 @@ #include <algorithm> #include <cassert> #include <climits> +#include <cmath> #include <cstddef> #include <cstdint> #include <iterator> @@ -5955,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector<uint64_t, 8> Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { @@ -5977,14 +5978,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. 
-  for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) {
+  for (auto &PHI : make_early_inc_range(Dest->phis())) {
     unsigned PreviousEdges = Cases->size();
     if (Dest == SI->getDefaultDest())
       ++PreviousEdges;
     for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
-      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+      PHI.removeIncomingValue(SI->getParent());
   }
-  for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+  for (auto &PHI : make_early_inc_range(OtherDest->phis())) {
     unsigned PreviousEdges = OtherCases->size();
     if (OtherDest == SI->getDefaultDest())
       ++PreviousEdges;
@@ -5993,7 +5994,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
     if (NewBI->isUnconditional())
       ++E;
     for (unsigned I = 0; I != E; ++I)
-      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+      PHI.removeIncomingValue(SI->getParent());
   }
 
   // Clean up the default block - it may have phis or other instructions before
@@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
   auto *DefaultCaseBB = SI->getDefaultDest();
   BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
   auto It = OrigBB->getTerminator()->getIterator();
+  SmallVector<uint32_t> Weights;
+  auto HasWeights =
+      !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights);
   auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+  if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) {
+    // IsPow2 covers a subset of the cases in which we'd go to the default
+    // label. The other part is the powers of 2 that don't appear in the case
+    // statement. We don't know the distribution of the values coming in, so
+    // the safest is to split the original probability of going to `default`
+    // 50-50.
+    uint64_t OrigDenominator = sum_of(map_range(
+        Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+    SmallVector<uint64_t> NewWeights(2);
+    NewWeights[1] = Weights[0] / 2;
+    NewWeights[0] = OrigDenominator - NewWeights[1];
+    setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
+
+    // For the original switch, we reduce the weight of the default by the
+    // amount by which the previous branch contributes to getting to default,
+    // and then make sure the remaining weights have the same relative ratio
+    // wrt each other.
+    uint64_t CasesDenominator = OrigDenominator - Weights[0];
+    Weights[0] /= 2;
+    for (auto &W : drop_begin(Weights))
+      W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
+
+    setBranchWeights(*SI, Weights, /*IsExpected=*/false);
+  }
   // BI is handling the default case for SI, and so should share its DebugLoc.
   BI->setDebugLoc(SI->getDebugLoc());
   It->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 9f338db..94c5c170 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -12,7 +12,11 @@
 //
 // Limitation: This assumes that all terminators in the CFG are direct branches
 //             (the "br" instruction). The presence of any other control flow
-//             such as indirectbr, switch or callbr will cause an assert.
+//             such as indirectbr or switch will cause an assert.
+//             The callbr terminator is supported by creating intermediate
+//             target blocks that unconditionally branch to the original target
+//             blocks. These intermediate target blocks can then be redirected
+//             through the ControlFlowHub as usual. 
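+//             For example, a callbr in %bb that exits the loop to %e is
+//             rewritten as %bb -> %bb.target.e -> %e, and only the new
+//             intermediate block %bb.target.e is registered with the hub.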
//
//===----------------------------------------------------------------------===//
 
@@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
   SmallVector<BasicBlock *, 8> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
 
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;
   // Redirect exiting edges through a control flow hub.
   ControlFlowHub CHub;
-  for (auto *BB : ExitingBlocks) {
-    auto *Branch = cast<BranchInst>(BB->getTerminator());
-    BasicBlock *Succ0 = Branch->getSuccessor(0);
-    Succ0 = L->contains(Succ0) ? nullptr : Succ0;
-
-    BasicBlock *Succ1 =
-        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
-    Succ1 = L->contains(Succ1) ? nullptr : Succ1;
-    CHub.addBranch(BB, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {"
-                      << (Succ0 ? Succ0->getName() : "<none>") << ", "
-                      << (Succ1 ? Succ1->getName() : "<none>") << "}\n");
+
+  for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
+    BasicBlock *BB = ExitingBlocks[I];
+    if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) {
+      BasicBlock *Succ0 = Branch->getSuccessor(0);
+      Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+
+      BasicBlock *Succ1 =
+          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      Succ1 = L->contains(Succ1) ? nullptr : Succ1;
+      CHub.addBranch(BB, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added exiting branch: " << printBasicBlock(BB)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+                        << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) {
+        BasicBlock *Succ = CallBr->getSuccessor(J);
+        if (L->contains(Succ))
+          continue;
+        bool UpdatedLI = false;
+        BasicBlock *NewSucc =
+            SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+        // Even if CallBr and Succ do not have a common parent loop, we need to
+        // add the new target block to the parent loop of the current loop.
+        if (!UpdatedLI)
+          CallBrTargetBlocksToFix.push_back(NewSucc);
+        // ExitingBlocks is later used to restore SSA, so we need to make sure
+        // that the blocks used for phi nodes in the guard blocks match the
+        // predecessors of the guard blocks, which, in the case of callbr, are
+        // the new intermediate target blocks instead of the callbr blocks
+        // themselves.
+        ExitingBlocks[I] = NewSucc;
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added exiting branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   SmallVector<BasicBlock *, 8> GuardBlocks;
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   BasicBlock *LoopExitBlock;
   bool ChangedCFG;
   std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
@@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
 
   // The guard blocks were created outside the loop, so they need to become
   // members of the parent loop.
-  if (auto ParentLoop = L->getParentLoop()) {
+  // Same goes for the callbr target blocks. Although we try to add them to the
+  // smallest common parent loop of the callbr block and the corresponding
+  // original target block, there might be no such loop, in which case the
+  // newly created callbr target blocks are not part of any loop. For nested
+  // loops, this might result in a loop with multiple entry points.
+  if (auto *ParentLoop = L->getParentLoop()) {
     for (auto *G : GuardBlocks) {
       ParentLoop->addBasicBlockToLoop(G, LI);
     }
+    for (auto *C : CallBrTargetBlocksToFix) {
+      ParentLoop->addBasicBlockToLoop(C, LI);
+    }
     ParentLoop->verifyLoop();
   }
 
@@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   return runImpl(LI, DT);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003..04b0562 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,7 +167,7 @@ public:
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, Flags, DL, Name));
+        new VPInstruction(Opcode, Operands, Flags, {}, DL, Name));
   }
 
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
@@ -184,7 +184,7 @@ public:
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
+        new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
   }
 
   VPInstruction *createNot(VPValue *Operand,
@@ -205,7 +205,7 @@ public:
 
     return tryInsertInstruction(new VPInstruction(
         Instruction::BinaryOps::Or, {LHS, RHS},
-        VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
+        VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name));
   }
 
   VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS,
@@ -221,7 +221,7 @@ public:
                          std::optional<FastMathFlags> FMFs = std::nullopt) {
     auto *Select = FMFs ? 
new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, DL, Name) + *FMFs, {}, DL, Name) : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, Name); return tryInsertInstruction(Select); @@ -235,7 +235,7 @@ public: assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name)); } /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A @@ -246,7 +246,7 @@ public: assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name)); } VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -254,7 +254,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -262,7 +262,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( - VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name)); } VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, @@ -270,7 +270,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, @@ -280,8 +280,7 @@ public: VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 @@ -304,9 +303,11 @@ public: } VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy, DebugLoc DL) { + Type *ResultTy, DebugLoc DL, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL)); + new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f7968ab..e5c3f17 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in 
VPlan, but not in the legacy
  // cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+                          *CM.PSE.getSE(), OrigLoop);
  VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
  assert(VectorRegion && "Expected to have a vector region!");
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
         UpdateMemOpUserCost(cast<LoadInst>(I));
       } else if (const auto *Group = getInterleavedAccessGroup(I)) {
-        // Scalarize an interleave group of address loads.
-        for (unsigned I = 0; I < Group->getFactor(); ++I) {
-          if (Instruction *Member = Group->getMember(I)) {
-            setWideningDecision(
-                Member, VF, CM_Scalarize,
-                (VF.getKnownMinValue() *
-                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+        // Scalarize all members of this interleaved group when any member
+        // is used as an address. The address-used load skips scalarization
+        // overhead; the other members include it.
+        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+          if (Instruction *Member = Group->getMember(Idx)) {
+            InstructionCost Cost =
+                AddrDefs.contains(Member)
+                    ? (VF.getKnownMinValue() *
+                       getMemoryInstructionCost(Member,
+                                                ElementCount::getFixed(1)))
+                    : getMemInstScalarizationCost(Member, VF);
+            setWideningDecision(Member, VF, CM_Scalarize, Cost);
             UpdateMemOpUserCost(cast<LoadInst>(Member));
           }
         }
@@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
+                        OrigLoop);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
   VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
-                        *CM.PSE.getSE());
+                        *CM.PSE.getSE(), OrigLoop);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
   // legacy cost model doesn't properly model costs for such loops. 
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -7745,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); @@ -7799,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } case Instruction::ExtractValue: { SmallVector<VPValue *> NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; @@ -8172,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); + VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0); BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, @@ -8335,11 +8339,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( &R) || (isa<VPInstruction>(&R) && !UnderlyingValue)) continue; - - // FIXME: VPlan0, which models a copy of the original scalar loop, should - // not use VPWidenPHIRecipe to model the phis. - assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) && - UnderlyingValue && "unsupported recipe"); + assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe"); // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. @@ -8440,7 +8440,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8640,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && CurrentLinkI->getOpcode() == Instruction::Sub) { Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8854,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. 
-    PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
-                            OrigLoop->getHeader()->getContext())));
+    PhiR->setOperand(0, Plan->getFalse());
     continue;
   }
 
@@ -8877,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       unsigned ScaleFactor =
          RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
              .value_or(1);
-      Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
-      auto *ScaleFactorVPV =
-          Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+      auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
       VPValue *StartV = PHBuilder.createNaryOp(
           VPInstruction::ReductionStartVector,
           {PhiR->getStartValue(), Iden, ScaleFactorVPV},
@@ -9910,7 +9907,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool ForceVectorization =
       Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                        CM.CostKind, *CM.PSE.getSE());
+                        CM.CostKind, *CM.PSE.getSE(), L);
   if (!ForceVectorization &&
       !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                    LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b55a3b..bf3f52c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   if (isa<PHINode>(S.getMainOp()) ||
       isVectorLikeInstWithConstOps(S.getMainOp()))
     return nullptr;
+  // If the parent node is non-schedulable, the current node is copyable, and
+  // any of the parent's instructions are used outside several basic blocks or
+  // in a bin-op node, cancel scheduling: it may cause wrong def-use deps in
+  // the analysis, leading to a crash.
+  // Non-scheduled nodes may not have an associated ScheduleData model, which
+  // may lead to a skipped dep analysis.
+  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+      EI.UserTE->doesNotNeedToSchedule() &&
+      EI.UserTE->getOpcode() != Instruction::PHI &&
+      any_of(EI.UserTE->Scalars, [](Value *V) {
+        auto *I = dyn_cast<Instruction>(V);
+        if (!I || I->hasOneUser())
+          return false;
+        for (User *U : I->users()) {
+          auto *UI = cast<Instruction>(U);
+          if (isa<BinaryOperator>(UI))
+            return true;
+        }
+        return false;
+      }))
+    return std::nullopt;
   bool HasCopyables = S.areInstructionsWithCopyableElements();
   if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
@@ -22134,6 +22155,27 @@ bool BoUpSLP::collectValuesToDemote(
         {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
          VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
 
+  if (E.isAltShuffle()) {
+    // Combining these opcodes may lead to incorrect analysis; skip for now.
+    auto IsDangerousOpcode = [](unsigned Opcode) {
+      switch (Opcode) {
+      case Instruction::Shl:
+      case Instruction::AShr:
+      case Instruction::LShr:
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return true;
+      default:
+        break;
+      }
+      return false;
+    };
+    if (IsDangerousOpcode(E.getAltOpcode()))
+      return FinalAnalysis();
+  }
+
   switch (E.getOpcode()) {
 
     // We can always demote truncations and extensions. 
Since truncations can diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd..d354933 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. - static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd21..5534da9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80b..2de6921 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058..cfe1f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata { SmallVector<std::pair<unsigned, MDNode *>> Metadata; public: - VPIRMetadata() {} + VPIRMetadata() = default; /// Adds metatadata that can be preserved from the original instruction /// \p I. @@ -950,12 +950,9 @@ public: VPIRMetadata(Instruction &I, LoopVersioning *LVer); /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + VPIRMetadata(const VPIRMetadata &Other) = default; - VPIRMetadata &operator=(const VPIRMetadata &Other) { - Metadata = Other.Metadata; - return *this; - } + VPIRMetadata &operator=(const VPIRMetadata &Other) = default; /// Add all metadata to \p I. 
void applyMetadata(Instruction &I) const;
@@ -1107,14 +1104,14 @@ public:
         VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
-                const Twine &Name = "");
+                const VPIRFlags &Flags, const VPIRMetadata &MD = {},
+                DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "");
 
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
 
   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
-    auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name);
+    auto *New = new VPInstruction(Opcode, operands(), *this, *this,
+                                  getDebugLoc(), Name);
     if (getUnderlyingValue())
       New->setUnderlyingValue(getUnderlyingInstr());
     return New;
@@ -1196,7 +1193,14 @@ public:
   VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
                         Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL,
                         const Twine &Name = "")
-      : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {}
+      : VPInstruction(Opcode, Operands, Flags, {}, DL, Name),
+        ResultTy(ResultTy) {}
+
+  VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                        Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags,
+                        const VPIRMetadata &Metadata, const Twine &Name = "")
+      : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name),
+        ResultTy(ResultTy) {}
 
   static inline bool classof(const VPRecipeBase *R) {
     // VPInstructionWithType are VPInstructions with specific opcodes requiring
@@ -1221,10 +1225,9 @@ public:
   }
 
   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
     auto *New =
-        new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this,
-                                  getDebugLoc(), getName());
+        new VPInstructionWithType(getOpcode(), operands(), getResultType(),
+                                  *this, getDebugLoc(), getName());
     New->setUnderlyingValue(getUnderlyingValue());
     return New;
   }
@@ -3206,6 +3209,9 @@ protected:
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
         Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+           "Reversed access without VPVectorEndPointerRecipe address?");
   }
 
 public:
@@ -3977,7 +3983,7 @@ class VPIRBasicBlock : public VPBasicBlock {
         IRBB(IRBB) {}
 
 public:
-  ~VPIRBasicBlock() override {}
+  ~VPIRBasicBlock() override = default;
 
   static inline bool classof(const VPBlockBase *V) {
     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
@@ -4029,7 +4035,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
         IsReplicator(IsReplicator) {}
 
 public:
-  ~VPRegionBlock() override {}
+  ~VPRegionBlock() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -4109,6 +4115,12 @@ public:
   const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
     return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
   }
+
+  /// Return the type of the canonical IV for loop regions.
+  Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); }
+  const Type *getCanonicalIVType() const {
+    return getCanonicalIV()->getScalarType();
+  }
 };
 
 inline VPRegionBlock *VPRecipeBase::getRegion() {
@@ -4387,15 +4399,25 @@ public:
   }
 
   /// Return a VPValue wrapping i1 true. 
- VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. + VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3..1a66d20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. - VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9..965426f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext { SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c6..b57c448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ private: // Check for recipes that do not have opcodes. 
if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. 
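A usage sketch for the new matchers (hypothetical caller code, not part of this
change): m_MaskedLoad composes with m_VecEndPtr and the existing m_VPValue
binders, so a reverse consecutive masked load can be recognized and its address
and mask captured, assuming R is a VPRecipeBase reached during a VPlan
traversal:

  VPValue *Addr, *Mask;
  if (match(&R, m_MaskedLoad(m_VecEndPtr(m_VPValue(Addr), m_VPValue()),
                             m_VPValue(Mask)))) {
    // R is a masked VPWidenLoadRecipe whose address is computed by a
    // VPVectorEndPointerRecipe, i.e. a reverse consecutive masked load.
  }
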
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a63c80..1ee405a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPVectorEndPointerSC:
     return false;
-  case VPInstructionSC:
-    return mayWriteToMemory();
+  case VPInstructionSC: {
+    auto *VPI = cast<VPInstruction>(this);
+    return mayWriteToMemory() ||
+           VPI->getOpcode() == VPInstruction::BranchOnCount ||
+           VPI->getOpcode() == VPInstruction::BranchOnCond;
+  }
   case VPWidenCallSC: {
     Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
     return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
@@ -490,10 +494,10 @@ template class VPUnrollPartAccessor<3>;
 }
 
 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                             const VPIRFlags &Flags, DebugLoc DL,
-                             const Twine &Name)
+                             const VPIRFlags &Flags, const VPIRMetadata &MD,
+                             DebugLoc DL, const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
   assert((getNumOperandsForOpcode(Opcode) == -1u ||
@@ -1241,6 +1245,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::Select:
   case Instruction::PHI:
   case VPInstruction::AnyOf:
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchOnCount:
   case VPInstruction::Broadcast:
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
@@ -2372,9 +2378,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
-         getScalarType() == CanIV->getScalarType();
+         getScalarType() == getRegion()->getCanonicalIVType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3167,26 +3172,30 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost, and nullptr otherwise. Computing SCEVs for VPValues is
+/// incomplete and may return SCEVCouldNotCompute in cases where the legacy
+/// cost model can compute SCEVs; in those cases we fall back to the legacy
+/// cost model.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
   auto *PtrR = Ptr->getDefiningRecipe();
   if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
                   cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
                       Instruction::GetElementPtr) ||
                  isa<VPWidenGEPRecipe>(PtrR) ||
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
-    return false;
+    return nullptr;
 
   // We are looking for a GEP where all indices are either loop invariant or
   // inductions. 
for (VPValue *Opd : drop_begin(PtrR->operands())) { if (!Opd->isDefinedOutsideLoopRegions() && !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) - return false; + return nullptr; } - return true; + return vputils::getSCEVExprForVPValue(Ptr, SE, L); } /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3363,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3382,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36c..44972c68 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp { /// Width of the widest combined bundle in bits. unsigned WidestBundleBits = 0; - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; // Input operand bundles for the current multi node. Each multi node operand // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4d98014..9d9bb14 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,7 +151,27 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, + VPSingleDefRecipe *Candidate) { + // We only know how to duplicate VPReplicateRecipes and + // VPScalarIVStepsRecipes for now. + if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return false; + + if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || + Candidate->mayReadOrWriteMemory()) + return false; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return false; + + return true; + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. 
SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; @@ -159,51 +179,37 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) + for (VPValue *Op : Recipe.operands()) { if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); + if (IsValidSinkCandidate(VPBB, Def)) + WorkList.insert({VPBB, Def}); + } } } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // or all users outside of SinkTo must have only their first lane used. In + // the latter case, we need to duplicate SinkCandidate. 
+ auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->onlyFirstLaneUsed(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -230,7 +236,8 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : SinkCandidate->operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + if (IsValidSinkCandidate(SinkTo, Def)) + WorkList.insert({SinkTo, Def}); Changed = true; } return Changed; @@ -699,8 +706,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -820,7 +826,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -836,7 +842,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +888,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1057,13 +1063,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1073,7 +1075,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. 
- if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1092,12 +1094,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1160,7 +1162,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1188,8 +1190,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1218,8 +1220,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. - if (!Cmp->getDebugLoc() && R.getDebugLoc()) - Cmp->setDebugLoc(R.getDebugLoc()); + if (!Cmp->getDebugLoc() && Def->getDebugLoc()) + Cmp->setDebugLoc(Def->getDebugLoc()); } } } @@ -1245,7 +1247,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A), m_VPValue(X), m_VPValue())) && match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) && - TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) { + TypeInfo.inferScalarType(Def)->isIntegerTy(1)) { Def->setOperand(1, Def->getOperand(0)); Def->setOperand(0, Y); return; @@ -1253,35 +1255,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) { if (Phi->getOperand(0) == Phi->getOperand(1)) - Def->replaceAllUsesWith(Phi->getOperand(0)); + Phi->replaceAllUsesWith(Phi->getOperand(0)); return; } // Look through ExtractLastElement (BuildVector ....). - if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), - m_ExtractLastLanePerPart(m_BuildVector())))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()), + m_ExtractLastLanePerPart(m_BuildVector())))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } // Look through ExtractPenultimateElement (BuildVector ....). 
- if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_BuildVector()))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( + m_BuildVector()))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 2)); return; } uint64_t Idx; - if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); return; } + if (match(Def, m_BuildVector()) && all_equal(Def->operands())) { + Def->replaceAllUsesWith( + Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0))); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -1298,7 +1306,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1306,7 +1314,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by their start pointer. - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1361,9 +1369,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -1419,6 +1427,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { true /*IsSingleScalar*/); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); + if (isDeadRecipe(*RepOrWidenR)) + RepOrWidenR->eraseFromParent(); } } } @@ -1572,9 +1582,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, continue; // Update IV operands and comparison bound to use new narrower type. - auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( @@ -1693,8 +1703,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). 
- VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -2400,8 +2409,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2501,7 +2510,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2515,90 +2524,102 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match a specific mask \p In, or a logical-and of \p In and another mask. +/// On a successful match, binds \p Out to the remaining mask, or to nullptr +/// when the matched value is exactly \p In. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); +} + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transformed. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask. - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
- auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. - // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. 
- return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. 
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2658,7 +2679,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2692,7 +2713,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; @@ -2773,7 +2794,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. @@ -2788,8 +2809,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2902,9 +2922,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2928,8 +2947,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2944,7 +2962,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa<SExtInst>(U) ? 
StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3123,8 +3141,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3865,8 +3882,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3991,9 +4007,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4015,8 +4030,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with tail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = Builder.createSelect(IsZero, Step, R); } @@ -4054,7 +4069,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4174,7 +4189,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) + if (isa<VPCanonicalIVPHIRecipe>(&R)) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && @@ -4334,17 +4349,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( -
Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a74..d6a0028 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,10 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + VPValue *getConstantInt(unsigned Part) { + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); } else { assert(isa<VPActiveLaneMaskPHIRecipe>(R) && "unexpected header phi recipe not needing unrolled part"); @@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R)) Copy->setOperand(0, R.getOperand(0)); @@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7..c6380d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. 
Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. - if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. + auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } @@ -75,7 +70,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +82,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe()) .Case<VPExpandSCEVRecipe>( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) + return SE.getCouldNotCompute(); + + return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) || + !Step->isOne()) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa<SCEVCouldNotCompute>(Base)) + return SE.getCouldNotCompute(); + + SmallVector<const SCEV *> IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa<SCEVCouldNotCompute>(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const 
VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413..c21a0e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. -const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used. |
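As a usage note for the extended getSCEVExprForVPValue API: the function signals failure with SCEVCouldNotCompute rather than a null pointer, so callers typically test for that and keep their conservative path. A hedged caller-side sketch (the wrapper name below is hypothetical; it mirrors the nullptr convention of getAddressAccessSCEV in VPlanRecipes.cpp above):

// Illustrative sketch: return a SCEV usable for address-cost queries, or
// nullptr when no SCEV could be computed for Ptr within loop L.
static const SCEV *tryGetPtrSCEV(const VPValue *Ptr, ScalarEvolution &SE,
                                 const Loop *L) {
  const SCEV *Expr = vputils::getSCEVExprForVPValue(Ptr, SE, L);
  // Recipes without a handled case (e.g. most VPInstructions) yield
  // SCEVCouldNotCompute via the Default case above.
  return isa<SCEVCouldNotCompute>(Expr) ? nullptr : Expr;
}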
