Diffstat (limited to 'llvm/lib')
148 files changed, 3177 insertions, 1606 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index dd98b62..c14cb9e 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1485,6 +1485,9 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, switch (Opcode) { default: llvm_unreachable("Missing case"); + case Instruction::PtrToAddr: + // TODO: Add some of the ptrtoint folds here as well. + break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 329bd35..761c566 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -32,6 +33,11 @@ using namespace llvm; #define DL_NAME "delinearize" #define DEBUG_TYPE DL_NAME +static cl::opt<bool> UseFixedSizeArrayHeuristic( + "delinearize-use-fixed-size-array-heuristic", cl::init(false), cl::Hidden, + cl::desc("When printing analysis, use the heuristic for fixed-size arrays " + "if the default delinearizetion fails.")); + // Return true when S contains at least an undef value. static inline bool containsUndefs(const SCEV *S) { return SCEVExprContains(S, [](const SCEV *S) { @@ -480,6 +486,184 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr, }); } +static std::optional<APInt> tryIntoAPInt(const SCEV *S) { + if (const auto *Const = dyn_cast<SCEVConstant>(S)) + return Const->getAPInt(); + return std::nullopt; +} + +/// Collects the absolute values of constant steps for all induction variables. +/// Returns true if we can prove that all step recurrences are constants and \p +/// Expr is divisible by \p ElementSize. Each step recurrence is stored in \p +/// Steps after divided by \p ElementSize. +static bool collectConstantAbsSteps(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Steps, + uint64_t ElementSize) { + // End of recursion. The constant value also must be a multiple of + // ElementSize. + if (const auto *Const = dyn_cast<SCEVConstant>(Expr)) { + const uint64_t Mod = Const->getAPInt().urem(ElementSize); + return Mod == 0; + } + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Expr); + if (!AR || !AR->isAffine()) + return false; + + const SCEV *Step = AR->getStepRecurrence(SE); + std::optional<APInt> StepAPInt = tryIntoAPInt(Step); + if (!StepAPInt) + return false; + + APInt Q; + uint64_t R; + APInt::udivrem(StepAPInt->abs(), ElementSize, Q, R); + if (R != 0) + return false; + + // Bail out when the step is too large. + std::optional<uint64_t> StepVal = Q.tryZExtValue(); + if (!StepVal) + return false; + + Steps.push_back(*StepVal); + return collectConstantAbsSteps(SE, AR->getStart(), Steps, ElementSize); +} + +bool llvm::findFixedSizeArrayDimensions(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<uint64_t> &Sizes, + const SCEV *ElementSize) { + if (!ElementSize) + return false; + + std::optional<APInt> ElementSizeAPInt = tryIntoAPInt(ElementSize); + if (!ElementSizeAPInt || *ElementSizeAPInt == 0) + return false; + + std::optional<uint64_t> ElementSizeConst = ElementSizeAPInt->tryZExtValue(); + + // Early exit when ElementSize is not a positive constant. 
+ if (!ElementSizeConst) + return false; + + if (!collectConstantAbsSteps(SE, Expr, Sizes, *ElementSizeConst) || + Sizes.empty()) { + Sizes.clear(); + return false; + } + + // At this point, Sizes contains the absolute step recurrences for all + // induction variables. Each step recurrence must be a multiple of the size of + // the array element. Assuming that the each value represents the size of an + // array for each dimension, attempts to restore the length of each dimension + // by dividing the step recurrence by the next smaller value. For example, if + // we have the following AddRec SCEV: + // + // AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) + // + // Then Sizes will become [256, 32, 1] after sorted. We don't know the size of + // the outermost dimension, the next dimension will be computed as 256 / 32 = + // 8, and the last dimension will be computed as 32 / 1 = 32. Thus it results + // in like Arr[UnknownSize][8][32] with elements of size 8 bytes, where Arr is + // a base pointer. + // + // TODO: Catch more cases, e.g., when a step recurrence is not divisible by + // the next smaller one, like A[i][3*j]. + llvm::sort(Sizes.rbegin(), Sizes.rend()); + Sizes.erase(llvm::unique(Sizes), Sizes.end()); + + // The last element in Sizes should be ElementSize. At this point, all values + // in Sizes are assumed to be divided by ElementSize, so replace it with 1. + assert(Sizes.back() != 0 && "Unexpected zero size in Sizes."); + Sizes.back() = 1; + + for (unsigned I = 0; I + 1 < Sizes.size(); I++) { + uint64_t PrevSize = Sizes[I + 1]; + if (Sizes[I] % PrevSize) { + Sizes.clear(); + return false; + } + Sizes[I] /= PrevSize; + } + + // Finally, the last element in Sizes should be ElementSize. + Sizes.back() = *ElementSizeConst; + return true; +} + +/// Splits the SCEV into two vectors of SCEVs representing the subscripts and +/// sizes of an array access, assuming that the array is a fixed size array. +/// +/// E.g., if we have the code like as follows: +/// +/// double A[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// +/// The access function will be represented as an AddRec SCEV like: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,256}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// Then findFixedSizeArrayDimensions infers the size of each dimension of the +/// array based on the fact that the value of the step recurrence is a multiple +/// of the size of the corresponding array element. In the above example, it +/// results in the following: +/// +/// CHECK: ArrayDecl[UnknownSize][8][32] with elements of 8 bytes. +/// +/// Finally each subscript will be computed as follows: +/// +/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] +/// +/// Note that this function doesn't check the range of possible values for each +/// subscript, so the caller should perform additional boundary checks if +/// necessary. +/// +/// Also note that this function doesn't guarantee that the original array size +/// is restored "correctly". For example, in the following case: +/// +/// double A[42][4][64]; +/// double B[42][8][32]; +/// for i +/// for j +/// for k +/// use A[i][j][k] +/// use B[i][2*j][k] +/// +/// The access function for both accesses will be the same: +/// +/// AddRec: {{{0,+,2048}<%for.i>,+,512}<%for.j>,+,8}<%for.k> (ElementSize=8) +/// +/// The array sizes for both A and B will be computed as +/// ArrayDecl[UnknownSize][4][64], which matches for A, but not for B. 
+/// +/// TODO: At the moment, this function can handle only simple cases. For +/// example, we cannot handle a case where a step recurrence is not divisible +/// by the next smaller step recurrence, e.g., A[i][3*j]. +bool llvm::delinearizeFixedSizeArray(ScalarEvolution &SE, const SCEV *Expr, + SmallVectorImpl<const SCEV *> &Subscripts, + SmallVectorImpl<const SCEV *> &Sizes, + const SCEV *ElementSize) { + + // First step: find the fixed array size. + SmallVector<uint64_t, 4> ConstSizes; + if (!findFixedSizeArrayDimensions(SE, Expr, ConstSizes, ElementSize)) { + Sizes.clear(); + return false; + } + + // Convert the constant size to SCEV. + for (uint64_t Size : ConstSizes) + Sizes.push_back(SE.getConstant(Expr->getType(), Size)); + + // Second step: compute the access functions for each subscript. + computeAccessFunctions(SE, Expr, Subscripts, Sizes); + + return !Subscripts.empty(); +} + bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts, @@ -586,9 +770,21 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI, O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; + + auto IsDelinearizationFailed = [&]() { + return Subscripts.size() == 0 || Sizes.size() == 0 || + Subscripts.size() != Sizes.size(); + }; + delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst)); - if (Subscripts.size() == 0 || Sizes.size() == 0 || - Subscripts.size() != Sizes.size()) { + if (UseFixedSizeArrayHeuristic && IsDelinearizationFailed()) { + Subscripts.clear(); + Sizes.clear(); + delinearizeFixedSizeArray(*SE, AccessFn, Subscripts, Sizes, + SE->getElementSize(&Inst)); + } + + if (IsDelinearizationFailed()) { O << "failed to delinearize\n"; continue; } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f1473b2..835e270 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { - for (inst_iterator DstI = SrcI, DstE = inst_end(F); - DstI != DstE; ++DstI) { + for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE; + ++DstI) { if (DstI->mayReadOrWriteMemory()) { OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n"; OS << " da analyze - "; @@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) - OS << "normalized - "; + OS << "normalized - "; D->dump(OS); for (unsigned Level = 1; Level <= D->getLevels(); Level++) { if (D->isSplitable(Level)) { @@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { - dumpExampleDependence(OS, info.get(), - getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + dumpExampleDependence( + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); } PreservedAnalyses @@ -249,33 +249,26 @@ bool Dependence::isInput() const { return Src->mayReadFromMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an output dependence. 
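Before the DependenceAnalysis.cpp cleanups continue below, a minimal standalone sketch of the dimension-recovery arithmetic that the new findFixedSizeArrayDimensions performs on the example documented above (absolute steps {2048, 256, 8}, element size 8). This is illustrative only and not part of the patch: the helper name recoverDims and the use of std::vector/uint64_t stand in for the SmallVectorImpl/SCEV plumbing of the real code.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Divide every absolute step by the element size, sort descending,
    // deduplicate, then divide each entry by the next smaller one.
    // {2048, 256, 8} / 8 -> {256, 32, 1} -> {8, 32, ElementSize}, i.e.
    // Arr[UnknownSize][8][32] with 8-byte elements.
    static bool recoverDims(std::vector<uint64_t> Steps, uint64_t ElementSize,
                            std::vector<uint64_t> &Sizes) {
      for (uint64_t &S : Steps) {
        if (ElementSize == 0 || S % ElementSize != 0)
          return false;             // every step must be a multiple of the element size
        S /= ElementSize;
      }
      std::sort(Steps.rbegin(), Steps.rend());                       // descending
      Steps.erase(std::unique(Steps.begin(), Steps.end()), Steps.end());
      if (Steps.empty() || Steps.back() == 0)
        return false;
      Steps.back() = 1;             // smallest entry acts as the unit
      for (size_t I = 0; I + 1 < Steps.size(); ++I) {
        if (Steps[I] % Steps[I + 1] != 0)
          return false;             // e.g. A[i][3*j] is not handled yet
        Steps[I] /= Steps[I + 1];   // {256, 32, 1} -> {8, 32, 1}
      }
      Steps.back() = ElementSize;   // report the element size last, as the patch does
      Sizes = Steps;
      return true;
    }

As the printDelinearization change above shows, this fallback only runs in the analysis printer when the new -delinearize-use-fixed-size-array-heuristic flag is set and the default delinearization fails.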
bool Dependence::isOutput() const { return Src->mayWriteToMemory() && Dst->mayWriteToMemory(); } - // Returns true if this is an flow (aka true) dependence. bool Dependence::isFlow() const { return Src->mayWriteToMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an anti dependence. bool Dependence::isAnti() const { return Src->mayReadFromMemory() && Dst->mayWriteToMemory(); } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. // Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level) const { - return false; -} - +bool Dependence::isScalar(unsigned level) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) { DV[Level - 1].Direction = RevDirection; // Reverse the dependence distance as well. if (DV[Level - 1].Distance != nullptr) - DV[Level - 1].Distance = - SE->getNegativeSCEV(DV[Level - 1].Distance); + DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance); } LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n"; @@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const { return DV[Level - 1].Direction; } - // Returns the distance (or NULL) associated with a particular level. const SCEV *FullDependence::getDistance(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Distance; } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. @@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const { return DV[Level - 1].Scalar; } - // Returns true if peeling the first iteration from this loop // will break this dependence. bool FullDependence::isPeelFirst(unsigned Level) const { @@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const { return DV[Level - 1].PeelFirst; } - // Returns true if peeling the last iteration from this loop // will break this dependence. bool FullDependence::isPeelLast(unsigned Level) const { @@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const { return DV[Level - 1].PeelLast; } - // Returns true if splitting this loop will break the dependence. bool FullDependence::isSplitable(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Splitable; } - //===----------------------------------------------------------------------===// // DependenceInfo::Constraint methods @@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const { return A; } - // If constraint is a point <X, Y>, returns Y. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getY() const { @@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const { return B; } - // If constraint is a line AX + BY = C, returns A. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getA() const { @@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const { return A; } - // If constraint is a line AX + BY = C, returns B. // Otherwise assert. 
const SCEV *DependenceInfo::Constraint::getB() const { @@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const { return B; } - // If constraint is a line AX + BY = C, returns C. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getC() const { @@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const { return C; } - // If constraint is a distance, returns D. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getD() const { @@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const { return SE->getNegativeSCEV(C); } - // Returns the loop associated with this constraint. const Loop *DependenceInfo::Constraint::getAssociatedLoop() const { assert((Kind == Distance || Kind == Line || Kind == Point) && @@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const { else if (isPoint()) OS << " Point is <" << *getX() << ", " << *getY() << ">\n"; else if (isDistance()) - OS << " Distance is " << *getD() << - " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n"; + OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB() + << "*Y = " << *getC() << ")\n"; else if (isLine()) - OS << " Line is " << *getA() << "*X + " << - *getB() << "*Y = " << *getC() << "\n"; + OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC() + << "\n"; else llvm_unreachable("unknown constraint type in Constraint::dump"); } #endif - // Updates X with the intersection // of the Constraints X and Y. Returns true if X has changed. // Corresponds to Figure 4 from the paper @@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB()); const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB()); const SCEVConstant *C1A2_C2A1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); const SCEVConstant *C1B2_C2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); const SCEVConstant *A1B2_A2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); const SCEVConstant *A2B1_A1B2 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); - if (!C1B2_C2B1 || !C1A2_C2A1 || - !A1B2_A2B1 || !A2B1_A1B2) + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); + if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; APInt Xtop = C1B2_C2B1->getAPInt(); APInt Xbot = A1B2_A2B1->getAPInt(); @@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { ++DeltaSuccesses; return true; } - if (const SCEVConstant *CUB = - collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { + if (const SCEVConstant *CUB = collectConstantUpperBound( + X->getAssociatedLoop(), Prod1->getType())) { const APInt &UpperBound = CUB->getAPInt(); LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { @@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return true; } } - X->setPoint(SE->getConstant(Xq), - SE->getConstant(Yq), + X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq), X->getAssociatedLoop()); ++DeltaSuccesses; return true; @@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return false; } - 
//===----------------------------------------------------------------------===// // DependenceInfo methods @@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const { // tbaa, non-overlapping regions etc), then it is known there is no dependecy. // Otherwise the underlying objects are checked to see if they point to // different identifiable objects. -static AliasResult underlyingObjectsAlias(AAResults *AA, - const DataLayout &DL, +static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL, const MemoryLocation &LocA, const MemoryLocation &LocB) { // Check the original locations (minus size) for noalias, which can happen for @@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA, // Returns true if the load or store can be analyzed. Atomic and volatile // operations have properties which this analysis does not understand. -static -bool isLoadOrStore(const Instruction *I) { +static bool isLoadOrStore(const Instruction *I) { if (const LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->isUnordered(); else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) { return false; } - // Examines the loop nesting of the Src and Dst // instructions and establishes their shared loops. Sets the variables // CommonLevels, SrcLevels, and MaxLevels. @@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src, MaxLevels -= CommonLevels; } - // Given one of the loops containing the source, return // its level index in our numbering scheme. unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const { return SrcLoop->getLoopDepth(); } - // Given one of the loops containing the destination, // return its level index in our numbering scheme. unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { @@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { return D; } - // Returns true if Expression is loop invariant in LoopNest. bool DependenceInfo::isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const { @@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression, return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop()); } - - // Finds the set of loops from the LoopNest that // have a level <= CommonLevels and are referred to by the SCEV Expression. void DependenceInfo::collectCommonLoops(const SCEV *Expression, @@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() > widestWidthSeen) { @@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { } } - assert(widestWidthSeen > 0); // Now extend each pair to the widest seen. 
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() < widestWidthSeen) @@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest, return checkSubscript(Dst, LoopNest, Loops, false); } - // Examines the subscript pair (the Src and Dst SCEVs) // and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. // Collects the associated loops in a set. @@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, return Subscript::ZIV; if (N == 1) return Subscript::SIV; - if (N == 2 && (SrcLoops.count() == 0 || - DstLoops.count() == 0 || + if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 || (SrcLoops.count() == 1 && DstLoops.count() == 1))) return Subscript::RDIV; return Subscript::MIV; } - // A wrapper around SCEV::isKnownPredicate. // Looks for cases where we're interested in comparing for equality. // If both X and Y have been identically sign or zero extended, @@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, // involving symbolics. bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const { - if (Pred == CmpInst::ICMP_EQ || - Pred == CmpInst::ICMP_NE) { - if ((isa<SCEVSignExtendExpr>(X) && - isa<SCEVSignExtendExpr>(Y)) || - (isa<SCEVZeroExtendExpr>(X) && - isa<SCEVZeroExtendExpr>(Y))) { + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) || + (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) { const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X); const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y); const SCEV *Xop = CX->getOperand(); @@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, } } -/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1)) +/// Compare to see if S is less than Size, using +/// +/// isKnownNegative(S - Size) +/// /// with some extra checking if S is an AddRec and we can prove less-than using /// the loop bounds. 
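A small worked example (illustrative numbers, not from the patch) of the AddRec reasoning in the rewritten isKnownLessThan that follows: for a non-wrapping affine AddRec, the extreme value over the loop is taken at either the start or the last iteration, so comparing both against Size is enough even when the sign of the step is unknown.

    S    = {0,+,4}<%L>   (affine, nsw)
    BTC  = backedge-taken count of %L = 9, so End = 0 + 4*9 = 36
    Size = 40

    Diff0 = Start - Size = 0  - 40 = -40   (known negative)
    Diff1 = End   - Size = 36 - 40 = -4    (known negative)

Here Step = 4 is non-negative, so the maximum of S is End and the Diff1 check alone proves S < Size on every iteration; when the sign of Step is unknown, requiring both Diff0 and Diff1 to be negative still covers both possible extremes.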
bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { @@ -1126,21 +1090,34 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { Size = SE->getTruncateOrZeroExtend(Size, MaxType); // Special check for addrecs using BE taken count - const SCEV *Bound = SE->getMinusSCEV(S, Size); - if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Bound)) { - if (AddRec->isAffine()) { + if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) + if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) { const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop()); - if (!isa<SCEVCouldNotCompute>(BECount)) { - const SCEV *Limit = AddRec->evaluateAtIteration(BECount, *SE); - if (SE->isKnownNegative(Limit)) - return true; - } + const SCEV *Start = AddRec->getStart(); + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE); + const SCEV *Diff0 = SE->getMinusSCEV(Start, Size); + const SCEV *Diff1 = SE->getMinusSCEV(End, Size); + + // If the value of Step is non-negative and the AddRec is non-wrap, it + // reaches its maximum at the last iteration. So it's enouth to check + // whether End - Size is negative. + if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1)) + return true; + + // If the value of Step is non-positive and the AddRec is non-wrap, the + // initial value is its maximum. + if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0)) + return true; + + // Even if we don't know the sign of Step, either Start or End must be + // the maximum value of the AddRec since it is non-wrap. + if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1)) + return true; } - } // Check using normal isKnownNegative - const SCEV *LimitedBound = - SE->getMinusSCEV(S, SE->getSMaxExpr(Size, SE->getOne(Size->getType()))); + const SCEV *LimitedBound = SE->getMinusSCEV(S, Size); return SE->isKnownNegative(LimitedBound); } @@ -1178,7 +1155,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const { return nullptr; } - // Calls collectUpperBound(), then attempts to cast it to SCEVConstant. // If the cast fails, returns NULL. const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, @@ -1188,7 +1164,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, return nullptr; } - // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1218,7 +1193,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst, return false; // possibly dependent } - // strongSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.1 // @@ -1270,9 +1244,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound); LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); const SCEV *AbsDelta = - SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); + SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); const SCEV *AbsCoeff = - SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); + SE->isKnownNonNegative(Coeff) ? 
Coeff : SE->getNegativeSCEV(Coeff); const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { // Distance greater than trip count - no dependence @@ -1286,7 +1260,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); - APInt Distance = ConstDelta; // these need to be initialized + APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n"); @@ -1307,29 +1281,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, else Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else if (Delta->isZero()) { + } else if (Delta->isZero()) { // since 0/X == 0 Result.DV[Level].Distance = Delta; NewConstraint.setDistance(Delta, CurLoop); Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else { + } else { if (Coeff->isOne()) { LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n"); Result.DV[Level].Distance = Delta; // since X/1 == X NewConstraint.setDistance(Delta, CurLoop); - } - else { + } else { Result.Consistent = false; - NewConstraint.setLine(Coeff, - SE->getNegativeSCEV(Coeff), + NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff), SE->getNegativeSCEV(Delta), CurLoop); } // maybe we can get a useful direction - bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); + bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta); bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta); bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff); @@ -1353,7 +1323,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, return false; } - // weakCrossingSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1447,8 +1416,8 @@ bool DependenceInfo::weakCrossingSIVtest( if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) { LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n"); const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2); - const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), - ConstantTwo); + const SCEV *ML = + SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo); LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n"); if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) { // Delta too big, no dependence @@ -1498,7 +1467,6 @@ bool DependenceInfo::weakCrossingSIVtest( return false; } - // Kirch's algorithm, from // // Optimizing Supercompilers for Supercomputers @@ -1519,9 +1487,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM, APInt R = G0; APInt::sdivrem(G0, G1, Q, R); while (R != 0) { + // clang-format off APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2; APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2; G0 = G1; G1 = R; + // clang-format on APInt::sdivrem(G0, G1, Q, R); } G = G1; @@ -1543,8 +1513,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q; else return Q - 1; @@ -1556,8 +1525,7 @@ static APInt ceilingOfQuotient(const APInt &A, const 
APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q + 1; else return Q; @@ -1733,17 +1701,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return Result.DV[Level].Direction == Dependence::DVEntry::NONE; } - // Return true if the divisor evenly divides the dividend. -static -bool isRemainderZero(const SCEVConstant *Dividend, - const SCEVConstant *Divisor) { +static bool isRemainderZero(const SCEVConstant *Dividend, + const SCEVConstant *Divisor) { const APInt &ConstDividend = Dividend->getAPInt(); const APInt &ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } - // weakZeroSrcSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1807,11 +1772,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1853,7 +1818,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, return false; } - // weakZeroDstSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1916,11 +1880,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1962,7 +1926,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, return false; } - // exactRDIVtest - Tests the RDIV subscript pair for dependence. 
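Before the RDIV machinery continues below, a side note on the floorOfQuotient/ceilingOfQuotient helpers reformatted above: signed division in C++ truncates toward zero, while the exact SIV/RDIV tests need true floor and ceiling when tightening bounds such as [TL, TU]. A standalone sketch with plain 64-bit integers (the real code uses APInt; floorDiv/ceilDiv are illustrative names, B assumed nonzero):

    #include <cstdint>

    // floor(A/B): round toward negative infinity.
    static int64_t floorDiv(int64_t A, int64_t B) {
      int64_t Q = A / B, R = A % B;
      return (R != 0 && (R < 0) != (B < 0)) ? Q - 1 : Q;
    }

    // ceil(A/B): round toward positive infinity.
    static int64_t ceilDiv(int64_t A, int64_t B) {
      int64_t Q = A / B, R = A % B;
      return (R != 0 && (R < 0) == (B < 0)) ? Q + 1 : Q;
    }

    // Example: A = -7, B = 2.
    //   -7 / 2          == -3   (truncation)
    //   floorDiv(-7, 2) == -4
    //   ceilDiv(-7, 2)  == -3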
// Things of the form [c1 + a*i] and [c2 + b*j], // where i and j are induction variable, c1 and c2 are loop invariant, @@ -2084,7 +2047,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return TL.sgt(TU); } - // symbolicRDIVtest - // In Section 4.5 of the Practical Dependence Testing paper,the authors // introduce a special case of Banerjee's Inequalities (also called the @@ -2167,8 +2129,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 >= 0 && a2 <= 0 if (N1 && N2) { // make sure that c2 - c1 <= a1*N1 - a2*N2 @@ -2187,8 +2148,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A1)) { + } else if (SE->isKnownNonPositive(A1)) { if (SE->isKnownNonNegative(A2)) { // a1 <= 0 && a2 >= 0 if (N1 && N2) { @@ -2207,8 +2167,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, ++SymbolicRDIVindependence; return true; } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 <= 0 && a2 <= 0 if (N1) { // make sure that a1*N1 <= c2 - c1 @@ -2233,7 +2192,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return false; } - // testSIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i] // where i is an induction variable, c1 and c2 are loop invariant, and a1 and @@ -2260,17 +2218,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, Level = mapSrcLoop(CurLoop); bool disproven; if (SrcCoeff == DstCoeff) - disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint); + disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint); else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff)) disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint, SplitIter); else disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint); - return disproven || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop); + return disproven || gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, + CurLoop); } if (SrcAddRec) { const SCEV *SrcConst = SrcAddRec->getStart(); @@ -2278,9 +2236,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *DstConst = Dst; const Loop *CurLoop = SrcAddRec->getLoop(); Level = mapSrcLoop(CurLoop); - return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } if (DstAddRec) { const SCEV *DstConst = DstAddRec->getStart(); @@ -2288,15 +2246,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *SrcConst = Src; const Loop *CurLoop = DstAddRec->getLoop(); Level = mapDstLoop(CurLoop); - return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, - CurLoop, Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } llvm_unreachable("SIV test expected 
at least one AddRec"); return false; } - // testRDIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j] // where i and j are induction variables, c1 and c2 are loop invariant, @@ -2333,46 +2290,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst, DstConst = DstAddRec->getStart(); DstCoeff = DstAddRec->getStepRecurrence(*SE); DstLoop = DstAddRec->getLoop(); - } - else if (SrcAddRec) { + } else if (SrcAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { SrcConst = tmpAddRec->getStart(); SrcCoeff = tmpAddRec->getStepRecurrence(*SE); SrcLoop = tmpAddRec->getLoop(); DstConst = Dst; DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE)); DstLoop = SrcAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else if (DstAddRec) { + } else if (DstAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { DstConst = tmpAddRec->getStart(); DstCoeff = tmpAddRec->getStepRecurrence(*SE); DstLoop = tmpAddRec->getLoop(); SrcConst = Src; SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE)); SrcLoop = DstAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else + } else llvm_unreachable("RDIV expected at least one AddRec"); - return exactRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop, + return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop, Result) || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop); + gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, + DstLoop); } - // Tests the single-subscript MIV pair (Src and Dst) for dependence. // Return true if dependence disproved. // Can sometimes refine direction vectors. @@ -2383,7 +2331,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n"); Result.Consistent = false; return gcdMIVtest(Src, Dst, Result) || - banerjeeMIVtest(Src, Dst, Loops, Result); + banerjeeMIVtest(Src, Dst, Loops, Result); } // Given a product, e.g., 10*X*Y, returns the first constant operand, @@ -2428,7 +2376,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. const SCEV *Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2446,7 +2394,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. Coefficients = Dst; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. 
@@ -2468,16 +2416,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, if (isa<SCEVConstant>(Operand)) { assert(!Constant && "Surprised to find multiple constants"); Constant = cast<SCEVConstant>(Operand); - } - else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { + } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. std::optional<APInt> ConstOp = getConstantPart(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); - } - else + } else return false; } } @@ -2512,7 +2458,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool Improved = false; Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { Coefficients = AddRec->getStart(); const Loop *CurLoop = AddRec->getLoop(); RunningGCD = ExtraGCD; @@ -2578,7 +2524,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, return false; } - //===----------------------------------------------------------------------===// // banerjeeMIVtest - // Use Banerjee's Inequalities to test an MIV subscript pair. @@ -2652,8 +2597,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) { // Explore the direction vector hierarchy. unsigned DepthExpanded = 0; - unsigned NewDeps = exploreDirections(1, A, B, Bound, - Loops, DepthExpanded, Delta); + unsigned NewDeps = + exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta); if (NewDeps > 0) { bool Improved = false; for (unsigned K = 1; K <= CommonLevels; ++K) { @@ -2670,23 +2615,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, } if (Improved) ++BanerjeeSuccesses; - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - delete [] Bound; - delete [] A; - delete [] B; + delete[] Bound; + delete[] A; + delete[] B; return Disproved; } - // Hierarchically expands the direction vector // search space, combining the directions of discovered dependences // in the DirSet field of Bound. Returns the number of distinct @@ -2788,27 +2730,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A, // test bounds for <, *, *, ... if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // Test bounds for =, *, *, ... if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // test bounds for >, *, *, ... if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); Bound[Level].Direction = Dependence::DVEntry::ALL; return NewDeps; - } - else - return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta); + } else + return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); } - // Returns true iff the current bounds are plausible. 
bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, const SCEV *Delta) const { @@ -2822,7 +2763,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, return true; } - // Computes the upper and lower bounds for level K // using the * direction. Records them in Bound. // Wolfe gives the equations @@ -2840,17 +2780,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, // and the upper bound is always >= 0. void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::ALL] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::ALL] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), - Bound[K].Iterations); - Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), - Bound[K].Iterations); - } - else { + Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations); + Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations); + } else { // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = @@ -2861,7 +2800,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the = direction. Records them in Bound. // Wolfe gives the equations @@ -2879,18 +2817,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, // and the upper bound is always >= 0. void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::EQ] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::EQ] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); const SCEV *NegativePart = getNegativePart(Delta); Bound[K].Lower[Dependence::DVEntry::EQ] = - SE->getMulExpr(NegativePart, Bound[K].Iterations); + SE->getMulExpr(NegativePart, Bound[K].Iterations); const SCEV *PositivePart = getPositivePart(Delta); Bound[K].Upper[Dependence::DVEntry::EQ] = - SE->getMulExpr(PositivePart, Bound[K].Iterations); - } - else { + SE->getMulExpr(PositivePart, Bound[K].Iterations); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); @@ -2903,7 +2842,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the < direction. Records them in Bound. 
// Wolfe gives the equations @@ -2919,35 +2857,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::LT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::LT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); + SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); Bound[K].Upper[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); - } - else { + SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); } } - // Computes the upper and lower bounds for level K // using the > direction. Records them in Bound. // Wolfe gives the equations @@ -2963,45 +2901,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::GT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::GT] = + nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); + SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); Bound[K].Upper[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); - } - else { + SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. - const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + const SCEV *NegPart = + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff; - const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + const SCEV *PosPart = + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff; } } - // X^+ = max(X, 0) const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const { return SE->getSMaxExpr(X, SE->getZero(X->getType())); } - // X^- = min(X, 0) const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const { return SE->getSMinExpr(X, SE->getZero(X->getType())); } - // Walks through the subscript, // collecting each coefficient, the associated loop bounds, // and recording its positive and negative parts for later use. @@ -3046,7 +2984,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, return CI; } - // Looks through all the bounds info and // computes the lower bound given the current direction settings // at each level. If the lower bound for any level is -inf, @@ -3062,7 +2999,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const { return Sum; } - // Looks through all the bounds info and // computes the upper bound given the current direction settings // at each level. If the upper bound at any level is +inf, @@ -3078,7 +3014,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const { return Sum; } - //===----------------------------------------------------------------------===// // Constraint manipulation for Delta test. @@ -3098,7 +3033,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr, return findCoefficient(AddRec->getStart(), TargetLoop); } - // Given a linear SCEV, // return the SCEV given by zeroing out the coefficient // corresponding to the specified loop. @@ -3112,12 +3046,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr, if (AddRec->getLoop() == TargetLoop) return AddRec->getStart(); return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop), - AddRec->getStepRecurrence(*SE), - AddRec->getLoop(), + AddRec->getStepRecurrence(*SE), AddRec->getLoop(), AddRec->getNoWrapFlags()); } - // Given a linear SCEV Expr, // return the SCEV given by adding some Value to the // coefficient corresponding to the specified TargetLoop. 
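Stepping back from the formatting-only changes in the bounds helpers above (findBoundsALL/EQ/LT/GT together with the X^+ = max(X, 0) and X^- = min(X, 0) helpers), a small worked instance of the *-direction bounds they compute, with illustrative coefficients not taken from the patch:

    a = 2 (source coefficient), b = 3 (destination coefficient), U = 10 iterations
    a^+ = 2, a^- = 0, b^+ = 3, b^- = 0

    LB(*) = (a^- - b^+) * U = (0 - 3) * 10 = -30
    UB(*) = (a^+ - b^-) * U = (2 - 0) * 10 =  20

A dependence at this level is only plausible if the constant difference Delta lies in [-30, 20]; otherwise testBounds rejects the direction, which is how exploreDirections prunes the direction-vector hierarchy.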
@@ -3128,17 +3060,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, const SCEV *Value) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) // create a new addRec - return SE->getAddRecExpr(Expr, - Value, - TargetLoop, + return SE->getAddRecExpr(Expr, Value, TargetLoop, SCEV::FlagAnyWrap); // Worst case, with no info. if (AddRec->getLoop() == TargetLoop) { const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value); if (Sum->isZero()) return AddRec->getStart(); - return SE->getAddRecExpr(AddRec->getStart(), - Sum, - AddRec->getLoop(), + return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(), AddRec->getNoWrapFlags()); } if (SE->isLoopInvariant(AddRec, TargetLoop)) @@ -3149,7 +3077,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, AddRec->getNoWrapFlags()); } - // Review the constraints, looking for opportunities // to simplify a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3178,7 +3105,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst, return Result; } - // Attempt to propagate a distance // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3204,7 +3130,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a line // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3224,22 +3149,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, if (A->isZero()) { const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Bconst || !Cconst) return false; + if (!Bconst || !Cconst) + return false; APInt Beta = Bconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); - // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Dst = zeroCoefficient(Dst, CurLoop); if (!findCoefficient(Src, CurLoop)->isZero()) Consistent = false; - } - else if (B->isZero()) { + } else if (B->isZero()) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3249,11 +3174,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Src = zeroCoefficient(Src, CurLoop); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { + } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3264,8 +3189,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Dst = addToCoefficient(Dst, CurLoop, A_K); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else { + } else { // paper is incorrect here, or perhaps just misleading const SCEV *A_K = 
findCoefficient(Src, CurLoop); Src = SE->getMulExpr(Src, A); @@ -3281,7 +3205,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a point // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3302,7 +3225,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst, return true; } - // Update direction vector entry based on the current constraint. void DependenceInfo::updateDirection(Dependence::DVEntry &Level, const Constraint &CurConstraint) const { @@ -3322,34 +3244,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level, if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else if (CurConstraint.isLine()) { + } else if (CurConstraint.isLine()) { Level.Scalar = false; Level.Distance = nullptr; // direction should be accurate - } - else if (CurConstraint.isPoint()) { + } else if (CurConstraint.isPoint()) { Level.Scalar = false; Level.Distance = nullptr; unsigned NewDirection = Dependence::DVEntry::NONE; - if (!isKnownPredicate(CmpInst::ICMP_NE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(), CurConstraint.getX())) // if X may be = Y NewDirection |= Dependence::DVEntry::EQ; - if (!isKnownPredicate(CmpInst::ICMP_SLE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be > X NewDirection |= Dependence::DVEntry::LT; - if (!isKnownPredicate(CmpInst::ICMP_SGE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be < X NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else + } else llvm_unreachable("constraint has unexpected kind"); } @@ -3425,7 +3341,7 @@ bool DependenceInfo::tryDelinearizeFixedSize( dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); assert(SrcBase && DstBase && SrcBase == DstBase && "expected src and dst scev unknowns to be equal"); - }); + }); SmallVector<int, 4> SrcSizes; SmallVector<int, 4> DstSizes; @@ -3737,9 +3653,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); LLVM_DEBUG(dbgs() << " subscript " << P << "\n"); @@ -3814,18 +3729,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later ++NonlinearSubscriptPairs; - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; } else if (Pair[SI].Classification == Subscript::ZIV) { // always separable Separable.set(SI); - } - else { + } else { // SIV, RDIV, or MIV, so check for coupled group bool Done = true; for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) { @@ -3843,8 +3755,7 @@ 
DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Group.count() == 1) { Separable.set(SI); ++SeparableSubscriptPairs; - } - else { + } else { Coupled.set(SI); ++CoupledSubscriptPairs; } @@ -3950,10 +3861,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Constraints, Result.Consistent)) { LLVM_DEBUG(dbgs() << "\t Changed\n"); ++DeltaPropagations; - Pair[SJ].Classification = - classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()), - Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()), - Pair[SJ].Loops); + Pair[SJ].Classification = classifyPair( + Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst, + LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops); switch (Pair[SJ].Classification) { case Subscript::ZIV: LLVM_DEBUG(dbgs() << "ZIV\n"); @@ -3995,8 +3905,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, LLVM_DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) return nullptr; - } - else + } else llvm_unreachable("expected only MIV subscripts at this point"); } @@ -4052,8 +3961,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, break; } } - } - else { + } else { // On the other hand, if all directions are equal and there's no // loop-independent dependence possible, then no dependence exists. bool AllEqual = true; @@ -4158,9 +4066,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); } @@ -4172,15 +4079,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, for (unsigned SI = 0; SI < Pairs; ++SI) { if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; - } - else if (Pair[SI].Classification == Subscript::ZIV) + } else if (Pair[SI].Classification == Subscript::ZIV) Separable.set(SI); else { // SIV, RDIV, or MIV, so check for coupled group @@ -4214,8 +4118,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, case Subscript::SIV: { unsigned Level; const SCEV *SplitIter = nullptr; - (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level, - Result, NewConstraint, SplitIter); + (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, + SplitIter); if (Level == SplitLevel) { assert(SplitIter != nullptr); return SplitIter; diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 2b0f212..67c2cfa 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -150,6 +150,10 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, switch (II->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + Loc = MemoryLocation::getForArgument(II, 0, TLI); + // These intrinsics don't really 
modify the memory, but returning Mod + // will allow them to be handled conservatively. + return ModRefInfo::Mod; case Intrinsic::invariant_start: Loc = MemoryLocation::getForArgument(II, 1, TLI); // These intrinsics don't really modify the memory, but returning Mod @@ -441,11 +445,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( Intrinsic::ID ID = II->getIntrinsicID(); switch (ID) { case Intrinsic::lifetime_start: { - // FIXME: This only considers queries directly on the invariant-tagged - // pointer, not on query pointers that are indexed off of them. It'd - // be nice to handle that at some point (the right approach is to use - // GetPointerBaseWithConstantOffset). - MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1)); + MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(0)); if (BatchAA.isMustAlias(ArgLoc, MemLoc)) return MemDepResult::getDef(II); continue; diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 28a2640..72b643c 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -191,7 +191,7 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - assert(ArgIdx == 1 && "Invalid argument index"); + assert(ArgIdx == 0 && "Invalid argument index"); auto *AI = dyn_cast<AllocaInst>(Arg); if (!AI) // lifetime of poison value. diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index abe4985..1e20fca 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -70,7 +70,7 @@ void StackLifetime::collectMarkers() { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); if (!II || !II->isLifetimeStartOrEnd()) continue; - const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI) continue; auto It = AllocaNumbering.find(AI); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1e70228..b0e4b00 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9147,7 +9147,8 @@ static bool matchTwoInputRecurrence(const PHINode *PN, InstTy *&Inst, return false; for (unsigned I = 0; I != 2; ++I) { - if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I))) { + if (auto *Operation = dyn_cast<InstTy>(PN->getIncomingValue(I)); + Operation && Operation->getNumOperands() >= 2) { Value *LHS = Operation->getOperand(0); Value *RHS = Operation->getOperand(1); if (LHS != PN && RHS != PN) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index b3b4c37..425ea31 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case 
Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 520c6a0..3d5bd61 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -928,6 +928,7 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(fptoui, FPToUI); INSTKEYWORD(fptosi, FPToSI); INSTKEYWORD(inttoptr, IntToPtr); + INSTKEYWORD(ptrtoaddr, PtrToAddr); INSTKEYWORD(ptrtoint, PtrToInt); INSTKEYWORD(bitcast, BitCast); INSTKEYWORD(addrspacecast, AddrSpaceCast); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 13bef1f..1bc2906 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4273,6 +4273,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { case lltok::kw_bitcast: case lltok::kw_addrspacecast: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: { unsigned Opc = Lex.getUIntVal(); Type *DestTy = nullptr; @@ -7310,6 +7311,7 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_fptoui: case lltok::kw_fptosi: case lltok::kw_inttoptr: + case lltok::kw_ptrtoaddr: case lltok::kw_ptrtoint: return parseCast(Inst, PFS, KeywordVal); case lltok::kw_fptrunc: diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 36d10d0..eb83945 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,6 +60,17 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } +static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { + {"SRV", llvm::dxil::ResourceClass::SRV}, + {"UAV", llvm::dxil::ResourceClass::UAV}, + {"CBV", llvm::dxil::ResourceClass::CBuffer}, + {"Sampler", llvm::dxil::ResourceClass::Sampler}, +}; + +ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() { + return ArrayRef(ResourceClassNames); +} + static const EnumEntry<RootFlags> RootFlagNames[] = { #define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum}, #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 290d873..22a0d0f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1283,6 +1283,7 @@ static int getDecodedCastOpcode(unsigned Val) { case bitc::CAST_SITOFP : return Instruction::SIToFP; case bitc::CAST_FPTRUNC : return Instruction::FPTrunc; case bitc::CAST_FPEXT : return Instruction::FPExt; + case bitc::CAST_PTRTOADDR: return Instruction::PtrToAddr; case bitc::CAST_PTRTOINT: return Instruction::PtrToInt; case bitc::CAST_INTTOPTR: return Instruction::IntToPtr; case bitc::CAST_BITCAST : return Instruction::BitCast; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05680fa..a3f8254 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -647,6 +647,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { case Instruction::SIToFP : return bitc::CAST_SITOFP; case Instruction::FPTrunc : return bitc::CAST_FPTRUNC; case 
Instruction::FPExt : return bitc::CAST_FPEXT; + case Instruction::PtrToAddr: return bitc::CAST_PTRTOADDR; case Instruction::PtrToInt: return bitc::CAST_PTRTOINT; case Instruction::IntToPtr: return bitc::CAST_INTTOPTR; case Instruction::BitCast : return bitc::CAST_BITCAST; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c72b6e8..23a3543 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3657,6 +3657,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV, break; // Error } + case Instruction::PtrToAddr: case Instruction::PtrToInt: { const DataLayout &DL = getDataLayout(); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3f3d5dc9..278dd65 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1915,7 +1915,6 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // TODO: the "order" argument type is "int", not int32. So // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints. - ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size); assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO"); Constant *OrderingVal = ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering)); @@ -2012,7 +2011,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (CASExpected) { AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); AllocaCASExpected->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeStart(AllocaCASExpected); Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment); Args.push_back(AllocaCASExpected); } @@ -2026,7 +2025,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( } else { AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); AllocaValue->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaValue, SizeVal64); + Builder.CreateLifetimeStart(AllocaValue); Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment); Args.push_back(AllocaValue); } @@ -2036,7 +2035,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (!CASExpected && HasResult && !UseSizedLibcall) { AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); AllocaResult->setAlignment(AllocaAlignment); - Builder.CreateLifetimeStart(AllocaResult, SizeVal64); + Builder.CreateLifetimeStart(AllocaResult); Args.push_back(AllocaResult); } @@ -2069,7 +2068,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( // And then, extract the results... 
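The lifetime-marker rewrites in this AtomicExpand hunk reflect the intrinsic losing its explicit i64 size operand: the marked region is now implied by the alloca, so IRBuilder takes only the pointer. A minimal sketch of the one-argument form, with illustrative names only (not code from this patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Bracket a temporary alloca with the new size-less lifetime markers.
static void emitScopedLifetime(IRBuilder<> &Builder, AllocaInst *Tmp) {
  Builder.CreateLifetimeStart(Tmp); // was CreateLifetimeStart(Tmp, SizeVal64)
  // ... initialize Tmp and pass it to the libcall here ...
  Builder.CreateLifetimeEnd(Tmp);   // was CreateLifetimeEnd(Tmp, SizeVal64)
}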
if (ValueOperand && !UseSizedLibcall) - Builder.CreateLifetimeEnd(AllocaValue, SizeVal64); + Builder.CreateLifetimeEnd(AllocaValue); if (CASExpected) { // The final result from the CAS is {load of 'expected' alloca, bool result @@ -2078,7 +2077,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( Value *V = PoisonValue::get(FinalResultTy); Value *ExpectedOut = Builder.CreateAlignedLoad( CASExpected->getType(), AllocaCASExpected, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64); + Builder.CreateLifetimeEnd(AllocaCASExpected); V = Builder.CreateInsertValue(V, ExpectedOut, 0); V = Builder.CreateInsertValue(V, Result, 1); I->replaceAllUsesWith(V); @@ -2089,7 +2088,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); - Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); + Builder.CreateLifetimeEnd(AllocaResult); } I->replaceAllUsesWith(V); } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 9ba1782..0f3ec8b 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -132,9 +132,10 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, unsigned i = 0; unsigned NumFixedArgs = CB.getFunctionType()->getNumParams(); for (const auto &Arg : CB.args()) { - ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i), - i < NumFixedArgs}; + ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i)}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB); + if (i >= NumFixedArgs) + OrigArg.Flags[0].setVarArg(); // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. @@ -301,7 +302,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, // double] -> double). SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), OrigArg.OrigArgIndex, OrigArg.Flags[0], - OrigArg.IsFixed, OrigArg.OrigValue); + OrigArg.OrigValue); return; } @@ -313,7 +314,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex, - OrigArg.Flags[0], OrigArg.IsFixed); + OrigArg.Flags[0]); if (NeedsRegBlock) SplitArgs.back().Flags[0].setInConsecutiveRegs(); } diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index bbfae57..d30dfa7 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2209,7 +2209,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, unsigned Op = ID == Intrinsic::lifetime_start ? 
TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END; - const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(1)); + const AllocaInst *AI = dyn_cast<AllocaInst>(CI.getArgOperand(0)); if (!AI || !AI->isStaticAlloca()) return true; diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index ca51b67..5f37890 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -1001,7 +1001,7 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) { - int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int PDiff = (int)PNew - CriticalPSets[CritIdx].getUnitInc(); if (PDiff > 0) { Delta.CriticalMax = PressureChange(i); Delta.CriticalMax.setUnitInc(PDiff); @@ -1191,7 +1191,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, ++CritIdx; if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { - int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); + int CritInc = (int)MNew - CriticalPSets[CritIdx].getUnitInc(); if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7341914..17703f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12843,22 +12843,21 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) { SDLoc DL(HG); EVT MemVT = HG->getMemoryVT(); + EVT DataVT = Index.getValueType(); MachineMemOperand *MMO = HG->getMemOperand(); ISD::MemIndexType IndexType = HG->getIndexType(); if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; - SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, - HG->getScale(), HG->getIntID()}; - if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL)) + if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) || + refineIndexType(Index, IndexType, DataVT, DAG)) { + SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index, + HG->getScale(), HG->getIntID()}; return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, MMO, IndexType); + } - EVT DataVT = Index.getValueType(); - if (refineIndexType(Index, IndexType, DataVT, DAG)) - return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops, - MMO, IndexType); return SDValue(); } @@ -16343,6 +16342,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { DAG, DL); } break; + case ISD::ABDU: + case ISD::ABDS: + // (trunc (abdu/abds a, b)) → (abdu/abds (trunc a), (trunc b)) + if (!LegalOperations || N0.hasOneUse()) { + EVT SrcVT = N0.getValueType(); + EVT TruncVT = VT; + unsigned SrcBits = SrcVT.getScalarSizeInBits(); + unsigned TruncBits = TruncVT.getScalarSizeInBits(); + unsigned NeededBits = SrcBits - TruncBits; + + SDValue A = N0.getOperand(0); + SDValue B = N0.getOperand(1); + bool CanFold = false; + + if (N0.getOpcode() == ISD::ABDU) { + KnownBits KnownA = DAG.computeKnownBits(A); + KnownBits KnownB = DAG.computeKnownBits(B); + CanFold = KnownA.countMinLeadingZeros() >= NeededBits && + KnownB.countMinLeadingZeros() >= NeededBits; + } else { + unsigned SignBitsA = DAG.ComputeNumSignBits(A); + unsigned SignBitsB = DAG.ComputeNumSignBits(B); + CanFold = SignBitsA > NeededBits && SignBitsB > NeededBits; + } + + if (CanFold && TLI.isOperationLegal(N0.getOpcode(), 
VT)) { + SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A); + SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B); + return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB); + } + } + break; } return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 649a310..5ef1746 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1371,7 +1371,7 @@ void SelectionDAG::init(MachineFunction &NewMF, const TargetLibraryInfo *LibraryInfo, UniformityInfo *NewUA, ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin, - FunctionVarLocs const *VarLocs, bool HasDivergency) { + FunctionVarLocs const *VarLocs) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -1384,7 +1384,6 @@ void SelectionDAG::init(MachineFunction &NewMF, BFI = BFIin; MMI = &MMIin; FnVarLocs = VarLocs; - DivergentTarget = HasDivergency; } SelectionDAG::~SelectionDAG() { @@ -2331,8 +2330,7 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) { return SDValue(E, 0); auto *N = newSDNode<RegisterSDNode>(Reg, VTs); - N->SDNodeBits.IsDivergent = - DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA); + N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -5630,6 +5628,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FDIV: case ISD::FREM: case ISD::FCOPYSIGN: + case ISD::FP_EXTEND: // No poison except from flags (which is handled above) return false; @@ -8888,6 +8887,44 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, } } +std::pair<SDValue, SDValue> +SelectionDAG::getMemcmp(SDValue Chain, const SDLoc &dl, SDValue Mem0, + SDValue Mem1, SDValue Size, const CallInst *CI) { + const char *LibCallName = TLI->getLibcallName(RTLIB::MEMCMP); + if (!LibCallName) + return {}; + + // Emit a library call. 
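Stepping back to the ABDU/ABDS narrowing added in visitTRUNCATE above: the combine is only sound when both operands already fit in the truncated type (enough known leading zero bits for the unsigned case, enough sign bits for the signed case). A standalone C++ illustration of the unsigned identity under that precondition (a model, not DAG code):

#include <cassert>
#include <cstdint>

// |a - b| computed in 32 bits and then truncated matches |a - b| computed
// directly in 8 bits, provided both inputs have at least 24 leading zero
// bits, i.e. both already fit in 8 bits.
static uint32_t abdu32(uint32_t A, uint32_t B) { return A > B ? A - B : B - A; }
static uint8_t abdu8(uint8_t A, uint8_t B) { return A > B ? A - B : B - A; }

int main() {
  uint32_t A = 200, B = 13; // both < 256, so the precondition holds
  assert((uint8_t)abdu32(A, B) == abdu8((uint8_t)A, (uint8_t)B));
  return 0;
}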
+ auto GetEntry = [](Type *Ty, SDValue &SDV) { + TargetLowering::ArgListEntry E; + E.Ty = Ty; + E.Node = SDV; + return E; + }; + + PointerType *PT = PointerType::getUnqual(*getContext()); + TargetLowering::ArgListTy Args = { + GetEntry(PT, Mem0), GetEntry(PT, Mem1), + GetEntry(getDataLayout().getIntPtrType(*getContext()), Size)}; + + TargetLowering::CallLoweringInfo CLI(*this); + bool IsTailCall = false; + bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI); + IsTailCall = CI && CI->isTailCall() && + isInTailCallPosition(*CI, getTarget(), ReturnsFirstArg); + + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee( + TLI->getLibcallCallingConv(RTLIB::MEMCMP), + Type::getInt32Ty(*getContext()), + getExternalSymbol(LibCallName, TLI->getPointerTy(getDataLayout())), + std::move(Args)) + .setTailCall(IsTailCall); + + return TLI->LowerCallTo(CLI); +} + SDValue SelectionDAG::getMemcpy( SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, @@ -12225,8 +12262,6 @@ static bool gluePropagatesDivergence(const SDNode *Node) { } bool SelectionDAG::calculateDivergence(SDNode *N) { - if (!DivergentTarget) - return false; if (TLI->isSDNodeAlwaysUniform(N)) { assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) && "Conflicting divergence information!"); @@ -12246,8 +12281,6 @@ bool SelectionDAG::calculateDivergence(SDNode *N) { } void SelectionDAG::updateDivergence(SDNode *N) { - if (!DivergentTarget) - return; SmallVector<SDNode *, 16> Worklist(1, N); do { N = Worklist.pop_back_val(); @@ -13808,20 +13841,16 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) { Ops[I].setInitial(Vals[I]); EVT VT = Ops[I].getValueType(); - // Take care of the Node's operands iff target has divergence // Skip Chain. It does not carry divergence. - if (DivergentTarget && VT != MVT::Other && + if (VT != MVT::Other && (VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) && Ops[I].getNode()->isDivergent()) { - // Node is going to be divergent if at least one of its operand is - // divergent, unless it belongs to the "AlwaysUniform" exemptions. IsDivergent = true; } } Node->NumOperands = Vals.size(); Node->OperandList = Ops; - // Check the divergence of the Node itself. 
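For the SelectionDAG::getMemcmp helper introduced above: it returns the {result, out-chain} pair from lowering the RTLIB::MEMCMP call, or an empty pair when no libcall name is configured. A hedged usage sketch; the wrapper function and its parameter plumbing are assumptions, not part of this patch:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Forward a memcmp request to the generic libcall emitter on the DAG
// instead of hand-building the call.
static std::pair<SDValue, SDValue>
emitMemcmpFallback(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                   SDValue Mem0, SDValue Mem1, SDValue Size,
                   const CallInst *CI) {
  // An empty pair means RTLIB::MEMCMP has no libcall name for this target.
  return DAG.getMemcmp(Chain, DL, Mem0, Mem1, Size, CI);
}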
- if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) { + if (!TLI->isSDNodeAlwaysUniform(Node)) { IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA); Node->SDNodeBits.IsDivergent = IsDivergent; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index d0815e9..48ab797 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2273,9 +2273,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setNoExt(); for (unsigned i = 0; i < NumParts; ++i) { - Outs.push_back(ISD::OutputArg(Flags, - Parts[i].getValueType().getSimpleVT(), - VT, /*isfixed=*/true, 0, 0)); + Outs.push_back(ISD::OutputArg( + Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0)); OutVals.push_back(Parts[i]); } } @@ -2291,9 +2290,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { assert(SwiftError.getFunctionArg() && "Need a swift error argument"); ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); Flags.setSwiftError(); - Outs.push_back(ISD::OutputArg( - Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)), - /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0)); + Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL), + /*argvt=*/EVT(TLI.getPointerTy(DL)), + /*origidx=*/1, /*partOffs=*/0)); // Create SDNode for the swifterror virtual register. OutVals.push_back( DAG.getRegister(SwiftError.getOrCreateVRegUseAt( @@ -3978,6 +3977,11 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } +void SelectionDAGBuilder::visitPtrToAddr(const User &I) { + // FIXME: this is not correct for pointers with addr width != pointer width + visitPtrToInt(I); +} + void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. @@ -7598,7 +7602,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(1)); + const AllocaInst *LifetimeObject = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (!LifetimeObject) return; @@ -9091,7 +9095,7 @@ bool SelectionDAGBuilder::visitMemCmpBCmpCall(const CallInst &I) { const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), - getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); + getValue(Size), &I); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); @@ -11124,6 +11128,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL)); Flags.setOrigAlign(OriginalAlignment); + if (i >= CLI.NumFixedArgs) + Flags.setVarArg(); if (Args[i].Ty->isPointerTy()) { Flags.setPointer(); Flags.setPointerAddrSpace( @@ -11246,8 +11252,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // For scalable vectors the scalable part is currently handled // by individual targets, so we just use the known minimum size here. 
ISD::OutputArg MyFlags( - Flags, Parts[j].getValueType().getSimpleVT(), VT, - i < CLI.NumFixedArgs, i, + Flags, Parts[j].getValueType().getSimpleVT(), VT, i, j * Parts[j].getValueType().getStoreSize().getKnownMinValue()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c251755..e0835e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -574,6 +574,7 @@ private: void visitFPToSI(const User &I); void visitUIToFP(const User &I); void visitSIToFP(const User &I); + void visitPtrToAddr(const User &I); void visitPtrToInt(const User &I); void visitIntToPtr(const User &I); void visitBitCast(const User &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 26071ed..ece50ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -480,10 +480,7 @@ void SelectionDAGISel::initializeAnalysisResults( MachineModuleInfo &MMI = MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI(); - TTI = &FAM.getResult<TargetIRAnalysis>(Fn); - - CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. // This is based on the possibly changed OptLevel (after optnone is taken @@ -501,6 +498,10 @@ void SelectionDAGISel::initializeAnalysisResults( BatchAA = std::nullopt; SP = &FAM.getResult<SSPLayoutAnalysis>(Fn); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &FAM.getResult<TargetIRAnalysis>(Fn); +#endif } void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { @@ -536,10 +537,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { MachineModuleInfo &MMI = MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); - - CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs, - TTI->hasBranchDivergence(&Fn)); + CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs); // Now get the optional analyzes if we want to. 
// This is based on the possibly changed OptLevel (after optnone is taken @@ -558,6 +556,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { BatchAA = std::nullopt; SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo(); + +#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS + TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); +#endif } bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9f525ea..d80a229 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1008,7 +1008,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements( CR = CR.subtract(APInt(64, 1)); unsigned EltWidth = RetTy->getScalarSizeInBits(); - EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits()); + EltWidth = std::min(EltWidth, CR.getActiveBits()); EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8); return EltWidth; @@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0)); } } @@ -1893,6 +1893,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case SIToFP: return ISD::SINT_TO_FP; case FPTrunc: return ISD::FP_ROUND; case FPExt: return ISD::FP_EXTEND; + case PtrToAddr: return ISD::BITCAST; case PtrToInt: return ISD::BITCAST; case IntToPtr: return ISD::BITCAST; case BitCast: return ISD::BITCAST; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index 987e639..a201fae 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -60,6 +60,20 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const { ", filled slots:", SymbolTableOffset, (uint64_t)SymbolTable.size()) << '\n'; + + const auto FindCuVectorId = [&](uint32_t VecOffset) { + // Entries in ConstantPoolVectors are sorted by their offset in constant + // pool, see how ConstantPoolVectors is populated in parseImpl. 
+ const auto *It = + llvm::lower_bound(ConstantPoolVectors, VecOffset, + [](const auto &ConstantPoolEntry, uint32_t Offset) { + return ConstantPoolEntry.first < Offset; + }); + assert(It != ConstantPoolVectors.end() && It->first == VecOffset && + "Invalid symbol table"); + return It - ConstantPoolVectors.begin(); + }; + uint32_t I = -1; for (const SymTableEntry &E : SymbolTable) { ++I; @@ -72,13 +86,7 @@ void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const { StringRef Name = ConstantPoolStrings.substr( ConstantPoolOffset - StringPoolOffset + E.NameOffset); - auto CuVector = llvm::find_if( - ConstantPoolVectors, - [&](const std::pair<uint32_t, SmallVector<uint32_t, 0>> &V) { - return V.first == E.VecOffset; - }); - assert(CuVector != ConstantPoolVectors.end() && "Invalid symbol table"); - uint32_t CuVectorId = CuVector - ConstantPoolVectors.begin(); + const uint32_t CuVectorId = FindCuVectorId(E.VecOffset); OS << format(" String name: %s, CU vector index: %d\n", Name.data(), CuVectorId); } diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 79904fc..574883e 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -92,16 +92,9 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"CBV", dxil::ResourceClass::CBuffer}, - {"SRV", dxil::ResourceClass::SRV}, - {"UAV", dxil::ResourceClass::UAV}, - {"Sampler", dxil::ResourceClass::Sampler}, -}; - static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); return OS; } diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 9cf4ed1..1cda308 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -51,13 +51,6 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node, return NodeText->getString(); } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"CBV", dxil::ResourceClass::CBuffer}, - {"SRV", dxil::ResourceClass::SRV}, - {"UAV", dxil::ResourceClass::UAV}, - {"Sampler", dxil::ResourceClass::Sampler}, -}; - namespace { // We use the OverloadVisit with std::visit to ensure the compiler catches if a @@ -128,7 +121,7 @@ MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); StringRef ResName = enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); assert(!ResName.empty() && "Provided an invalid Resource Class"); SmallString<7> Name({"Root", ResName}); Metadata *Operands[] = { @@ -170,7 +163,7 @@ MDNode *MetadataBuilder::BuildDescriptorTableClause( IRBuilder<> Builder(Ctx); StringRef ResName = enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { MDString::get(Ctx, ResName), diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 9d84aa8..72308a3d 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -29,7 +29,7 @@ bool 
verifyRegisterValue(uint32_t RegisterValue) { // This Range is reserverved, therefore invalid, according to the spec // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#all-the-values-should-be-legal bool verifyRegisterSpace(uint32_t RegisterSpace) { - return !(RegisterSpace >= 0xFFFFFFF0 && RegisterSpace <= 0xFFFFFFFF); + return !(RegisterSpace >= 0xFFFFFFF0); } bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 3aa4f7a..ea027e4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4014,6 +4014,340 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc, /*Conditional*/ true, /*hasFinalize*/ true); } +static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder, + llvm::FunctionCallee Callee, + ArrayRef<llvm::Value *> Args, + const llvm::Twine &Name) { + llvm::CallInst *Call = Builder.CreateCall( + Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name); + Call->setDoesNotThrow(); + return Call; +} + +// Expects input basic block is dominated by BeforeScanBB. +// Once Scan directive is encountered, the code after scan directive should be +// dominated by AfterScanBB. Scan directive splits the code sequence to +// scan and input phase. Based on whether inclusive or exclusive +// clause is used in the scan directive and whether input loop or scan loop +// is lowered, it adds jumps to input and scan phase. First Scan loop is the +// input loop and second is the scan loop. The code generated handles only +// inclusive scans now. +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan( + const LocationDescription &Loc, InsertPointTy AllocaIP, + ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType, + bool IsInclusive, ScanInfo *ScanRedInfo) { + if (ScanRedInfo->OMPFirstScanLoop) { + llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars, + ScanVarsType, ScanRedInfo); + if (Err) + return Err; + } + if (!updateToLocation(Loc)) + return Loc.IP; + + llvm::Value *IV = ScanRedInfo->IV; + + if (ScanRedInfo->OMPFirstScanLoop) { + // Emit buffer[i] = red; at the end of the input phase. + for (size_t i = 0; i < ScanVars.size(); i++) { + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = ScanVarsType[i]; + Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]); + + Builder.CreateStore(Src, Val); + } + } + Builder.CreateBr(ScanRedInfo->OMPScanLoopExit); + emitBlock(ScanRedInfo->OMPScanDispatch, + Builder.GetInsertBlock()->getParent()); + + if (!ScanRedInfo->OMPFirstScanLoop) { + IV = ScanRedInfo->IV; + // Emit red = buffer[i]; at the entrance to the scan phase. + // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated. 
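On the verifyRegisterSpace change above: for a 32-bit value the old upper-bound comparison against 0xFFFFFFFF is always true, so the reserved-range check collapses to the lower bound alone. A standalone equivalent form:

#include <cstdint>

// Only the reserved range [0xFFFFFFF0, 0xFFFFFFFF] is rejected; for uint32_t
// the upper bound needs no explicit comparison.
static bool isValidRegisterSpace(uint32_t RegisterSpace) {
  return RegisterSpace < 0xFFFFFFF0u;
}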
+ for (size_t i = 0; i < ScanVars.size(); i++) { + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = ScanVarsType[i]; + Value *SrcPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *Src = Builder.CreateLoad(DestTy, SrcPtr); + Builder.CreateStore(Src, ScanVars[i]); + } + } + + // TODO: Update it to CreateBr and remove dead blocks + llvm::Value *CmpI = Builder.getInt1(true); + if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) { + Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock, + ScanRedInfo->OMPAfterScanBlock); + } else { + Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock, + ScanRedInfo->OMPBeforeScanBlock); + } + emitBlock(ScanRedInfo->OMPAfterScanBlock, + Builder.GetInsertBlock()->getParent()); + Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock); + return Builder.saveIP(); +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( + InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars, + ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) { + + Builder.restoreIP(AllocaIP); + // Create the shared pointer at alloca IP. + for (size_t i = 0; i < ScanVars.size(); i++) { + llvm::Value *BuffPtr = + Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla"); + (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr; + } + + // Allocate temporary buffer by master thread + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + Value *AllocSpan = + Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1)); + for (size_t i = 0; i < ScanVars.size(); i++) { + Type *IntPtrTy = Builder.getInt32Ty(); + Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]); + Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy); + Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize, + AllocSpan, nullptr, "arr"); + Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]); + } + return Error::success(); + }; + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. 
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator()); + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + BasicBlock *InputBB = Builder.GetInsertBlock(); + if (InputBB->getTerminator()) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + + return Error::success(); +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR( + ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + for (ReductionInfo RedInfo : ReductionInfos) { + Value *PrivateVar = RedInfo.PrivateVariable; + Value *OrigVar = RedInfo.Variable; + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + + Type *SrcTy = RedInfo.ElementType; + Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span, + "arrayOffset"); + Value *Src = Builder.CreateLoad(SrcTy, Val); + + Builder.CreateStore(Src, OrigVar); + Builder.CreateFree(Buff); + } + return Error::success(); + }; + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. + auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + if (ScanRedInfo->OMPScanFinish->getTerminator()) + Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator()); + else + Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish); + + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + BasicBlock *InputBB = Builder.GetInsertBlock(); + if (InputBB->getTerminator()) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + return Error::success(); +} + +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction( + const LocationDescription &Loc, + ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos, + ScanInfo *ScanRedInfo) { + + if (!updateToLocation(Loc)) + return Loc.IP; + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + Function *CurFn = Builder.GetInsertBlock()->getParent(); + // for (int k = 0; k <= ceil(log2(n)); ++k) + llvm::BasicBlock *LoopBB = + BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body"); + llvm::BasicBlock *ExitBB = + splitBB(Builder, false, "omp.outer.log.scan.exit"); + llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration( + Builder.GetInsertBlock()->getModule(), + (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy()); + llvm::BasicBlock *InputBB = Builder.GetInsertBlock(); + llvm::Value *Arg = + Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy()); + llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, ""); + F = 
llvm::Intrinsic::getOrInsertDeclaration( + Builder.GetInsertBlock()->getModule(), + (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy()); + LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, ""); + LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty()); + llvm::Value *NMin1 = Builder.CreateNUWSub( + ScanRedInfo->Span, + llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1)); + Builder.SetInsertPoint(InputBB); + Builder.CreateBr(LoopBB); + emitBlock(LoopBB, CurFn); + Builder.SetInsertPoint(LoopBB); + + PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2); + // size pow2k = 1; + PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2); + Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0), + InputBB); + Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1), + InputBB); + // for (size i = n - 1; i >= 2 ^ k; --i) + // tmp[i] op= tmp[i-pow2k]; + llvm::BasicBlock *InnerLoopBB = + BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body"); + llvm::BasicBlock *InnerExitBB = + BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit"); + llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K); + Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB); + emitBlock(InnerLoopBB, CurFn); + Builder.SetInsertPoint(InnerLoopBB); + PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2); + IVal->addIncoming(NMin1, LoopBB); + for (ReductionInfo RedInfo : ReductionInfos) { + Value *ReductionVal = RedInfo.PrivateVariable; + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = RedInfo.ElementType; + Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1)); + Value *LHSPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K); + Value *RHSPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset"); + Value *LHS = Builder.CreateLoad(DestTy, LHSPtr); + Value *RHS = Builder.CreateLoad(DestTy, RHSPtr); + llvm::Value *Result; + InsertPointOrErrorTy AfterIP = + RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result); + if (!AfterIP) + return AfterIP.takeError(); + Builder.CreateStore(Result, LHSPtr); + } + llvm::Value *NextIVal = Builder.CreateNUWSub( + IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1)); + IVal->addIncoming(NextIVal, Builder.GetInsertBlock()); + CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K); + Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB); + emitBlock(InnerExitBB, CurFn); + llvm::Value *Next = Builder.CreateNUWAdd( + Counter, llvm::ConstantInt::get(Counter->getType(), 1)); + Counter->addIncoming(Next, Builder.GetInsertBlock()); + // pow2k <<= 1; + llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true); + Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock()); + llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal); + Builder.CreateCondBr(Cmp, LoopBB, ExitBB); + Builder.SetInsertPoint(ExitBB->getFirstInsertionPt()); + return Error::success(); + }; + + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. 
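The emitScanReduction body above builds the in-place log-step combine described in its comments: an outer loop of ceil(log2(n)) passes and an inner loop folding buffer[i] with buffer[i - pow2k]. A plain C++ model of the whole inclusive-scan lowering, for orientation only (not OpenMPIRBuilder code, and using += for the reduction operator):

#include <cmath>
#include <cstddef>
#include <vector>

// Model of the lowering: the input loop stores each iteration's partial
// reduction into buffer[i]; the masked region then combines the buffer in
// ceil(log2(n)) strided passes; the scan loop reads the finished prefix
// values back as red = buffer[i].
static std::vector<int> inclusiveScanModel(const std::vector<int> &Partials) {
  std::vector<int> Buffer(Partials); // input loop: buffer[i] = red
  size_t N = Buffer.size();
  unsigned Passes = N > 1 ? (unsigned)std::ceil(std::log2((double)N)) : 0;
  size_t Pow2K = 1;
  for (unsigned K = 0; K < Passes; ++K) {
    for (size_t I = N - 1; I >= Pow2K; --I)
      Buffer[I] += Buffer[I - Pow2K]; // "op=" stands for the reduction op
    Pow2K <<= 1;
  }
  return Buffer; // Buffer[i] now holds the inclusive prefix through i
}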
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo); + if (Err) + return Err; + + return AfterIP; +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveIR( + llvm::function_ref<Error()> InputLoopGen, + llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen, + ScanInfo *ScanRedInfo) { + + { + // Emit loop with input phase: + // for (i: 0..<num_iters>) { + // <input phase>; + // buffer[i] = red; + // } + ScanRedInfo->OMPFirstScanLoop = true; + Error Err = InputLoopGen(); + if (Err) + return Err; + } + { + // Emit loop with scan phase: + // for (i: 0..<num_iters>) { + // red = buffer[i]; + // <scan phase>; + // } + ScanRedInfo->OMPFirstScanLoop = false; + Error Err = ScanLoopGen(Builder.saveIP()); + if (Err) + return Err; + } + return Error::success(); +} + +void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) { + Function *Fun = Builder.GetInsertBlock()->getParent(); + ScanRedInfo->OMPScanDispatch = + BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch"); + ScanRedInfo->OMPAfterScanBlock = + BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb"); + ScanRedInfo->OMPBeforeScanBlock = + BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb"); + ScanRedInfo->OMPScanLoopExit = + BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit"); +} CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton( DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name) { @@ -4111,6 +4445,76 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, return CL; } +Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() { + ScanInfos.emplace_front(); + ScanInfo *Result = &ScanInfos.front(); + return Result; +} + +Expected<SmallVector<llvm::CanonicalLoopInfo *>> +OpenMPIRBuilder::createCanonicalScanLoops( + const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, + Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, + InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) { + LocationDescription ComputeLoc = + ComputeIP.isSet() ? 
LocationDescription(ComputeIP, Loc.DL) : Loc; + updateToLocation(ComputeLoc); + + SmallVector<CanonicalLoopInfo *> Result; + + Value *TripCount = calculateCanonicalLoopTripCount( + ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name); + ScanRedInfo->Span = TripCount; + ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init"); + Builder.SetInsertPoint(ScanRedInfo->OMPScanInit); + + auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) { + Builder.restoreIP(CodeGenIP); + ScanRedInfo->IV = IV; + createScanBBs(ScanRedInfo); + BasicBlock *InputBlock = Builder.GetInsertBlock(); + Instruction *Terminator = InputBlock->getTerminator(); + assert(Terminator->getNumSuccessors() == 1); + BasicBlock *ContinueBlock = Terminator->getSuccessor(0); + Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch); + emitBlock(ScanRedInfo->OMPBeforeScanBlock, + Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(ScanRedInfo->OMPScanLoopExit); + emitBlock(ScanRedInfo->OMPScanLoopExit, + Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(ContinueBlock); + Builder.SetInsertPoint( + ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt()); + return BodyGenCB(Builder.saveIP(), IV); + }; + + const auto &&InputLoopGen = [&]() -> Error { + Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop( + Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop, + ComputeIP, Name, true, ScanRedInfo); + if (!LoopInfo) + return LoopInfo.takeError(); + Result.push_back(*LoopInfo); + Builder.restoreIP((*LoopInfo)->getAfterIP()); + return Error::success(); + }; + const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error { + Expected<CanonicalLoopInfo *> LoopInfo = + createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned, + InclusiveStop, ComputeIP, Name, true, ScanRedInfo); + if (!LoopInfo) + return LoopInfo.takeError(); + Result.push_back(*LoopInfo); + Builder.restoreIP((*LoopInfo)->getAfterIP()); + ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock(); + return Error::success(); + }; + Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo); + if (Err) + return Err; + return Result; +} + Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount( const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name) { @@ -4174,7 +4578,8 @@ Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount( Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop( const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, - InsertPointTy ComputeIP, const Twine &Name) { + InsertPointTy ComputeIP, const Twine &Name, bool InScan, + ScanInfo *ScanRedInfo) { LocationDescription ComputeLoc = ComputeIP.isSet() ? 
LocationDescription(ComputeIP, Loc.DL) : Loc; @@ -4185,6 +4590,8 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop( Builder.restoreIP(CodeGenIP); Value *Span = Builder.CreateMul(IV, Step); Value *IndVar = Builder.CreateAdd(Span, Start); + if (InScan) + ScanRedInfo->IV = IndVar; return BodyGenCB(Builder.saveIP(), IndVar); }; LocationDescription LoopLoc = @@ -8956,7 +9363,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate( const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { + AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous"); if (!updateToLocation(Loc)) return Loc.IP; @@ -8974,9 +9382,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate( "OpenMP atomic does not support LT or GT operations"); }); - Expected<std::pair<Value *, Value *>> AtomicResult = - emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, - X.IsVolatile, IsXBinopExpr); + Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate( + AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile, + IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory); if (!AtomicResult) return AtomicResult.takeError(); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); @@ -9023,7 +9431,8 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2, Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate( InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) { + AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { // TODO: handle the case where XElemTy is not byte-sized or not a power of 2 // or a complex datatype. bool emitRMWOp = false; @@ -9046,7 +9455,20 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate( std::pair<Value *, Value *> Res; if (emitRMWOp) { - Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO); + AtomicRMWInst *RMWInst = + Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO); + if (T.isAMDGPU()) { + if (IsIgnoreDenormalMode) + RMWInst->setMetadata("amdgpu.ignore.denormal.mode", + llvm::MDNode::get(Builder.getContext(), {})); + if (!IsFineGrainedMemory) + RMWInst->setMetadata("amdgpu.no.fine.grained.memory", + llvm::MDNode::get(Builder.getContext(), {})); + if (!IsRemoteMemory) + RMWInst->setMetadata("amdgpu.no.remote.memory", + llvm::MDNode::get(Builder.getContext(), {})); + } + Res.first = RMWInst; // not needed except in case of postfix captures. Generate anyway for // consistency with the else part. Will be removed with any DCE pass. // AtomicRMWInst::Xchg does not have a coressponding instruction. 
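On the new createAtomicUpdate/emitAtomicUpdate parameters above: the three extra flags are only consulted when the target triple is AMDGPU, where they are translated into the corresponding amdgpu.* metadata on the generated atomicrmw. A hedged usage sketch; the wrapper function and its argument plumbing are illustrative, not part of this patch:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

using namespace llvm;

// Thread the AMDGPU-related flags through an atomic update request.
static OpenMPIRBuilder::InsertPointOrErrorTy
emitAtomicAddNoRemote(OpenMPIRBuilder &OMPBuilder,
                      const OpenMPIRBuilder::LocationDescription &Loc,
                      OpenMPIRBuilder::InsertPointTy AllocaIP,
                      OpenMPIRBuilder::AtomicOpValue &X, Value *Expr,
                      OpenMPIRBuilder::AtomicUpdateCallbackTy &UpdateOp) {
  return OMPBuilder.createAtomicUpdate(
      Loc, AllocaIP, X, Expr, AtomicOrdering::Monotonic, AtomicRMWInst::FAdd,
      UpdateOp, /*IsXBinopExpr=*/true,
      /*IsIgnoreDenormalMode=*/false, // true would add !amdgpu.ignore.denormal.mode
      /*IsFineGrainedMemory=*/false,  // false adds !amdgpu.no.fine.grained.memory
      /*IsRemoteMemory=*/false);      // false adds !amdgpu.no.remote.memory
}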
@@ -9178,7 +9600,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { + bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, + bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) { if (!updateToLocation(Loc)) return Loc.IP; @@ -9197,9 +9620,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( // If UpdateExpr is 'x' updated with some `expr` not based on 'x', // 'x' is simply atomically rewritten with 'expr'. AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); - Expected<std::pair<Value *, Value *>> AtomicResult = - emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, - X.IsVolatile, IsXBinopExpr); + Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate( + AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile, + IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory); if (!AtomicResult) return AtomicResult.takeError(); Value *CapturedVal = diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 7159107..b91fd70 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1311,14 +1311,15 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } break; case 'l': - if (Name.starts_with("lifetime.start") || - Name.starts_with("lifetime.end")) { - // Unless remangling is required, do not upgrade the function declaration, - // but do upgrade the calls. - if (auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F)) - NewFn = *Result; - else - NewFn = F; + if ((Name.starts_with("lifetime.start") || + Name.starts_with("lifetime.end")) && + F->arg_size() == 2) { + Intrinsic::ID IID = Name.starts_with("lifetime.start") + ? Intrinsic::lifetime_start + : Intrinsic::lifetime_end; + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + F->getArg(0)->getType()); return true; } break; @@ -5133,21 +5134,20 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Size = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - if (isa<AllocaInst>(Ptr)) { + if (CI->arg_size() != 2) { DefaultCase(); return; } + Value *Ptr = CI->getArgOperand(1); // Try to strip pointer casts, such that the lifetime works on an alloca. Ptr = Ptr->stripPointerCasts(); if (isa<AllocaInst>(Ptr)) { // Don't use NewFn, as we might have looked through an addrspacecast. 
if (NewFn->getIntrinsicID() == Intrinsic::lifetime_start) - NewCall = Builder.CreateLifetimeStart(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeStart(Ptr); else - NewCall = Builder.CreateLifetimeEnd(Ptr, cast<ConstantInt>(Size)); + NewCall = Builder.CreateLifetimeEnd(Ptr); break; } diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d4ad21e..6b202ba 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -254,6 +254,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return FoldBitCast(V, DestTy); case Instruction::AddrSpaceCast: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: return nullptr; } diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index e09c139..2fcdbcc6 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -829,6 +829,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::AddrSpaceCast: // Conservatively return getFull set. diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a3c725b..c7e3113a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1567,6 +1567,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::SIToFP: case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2223,6 +2224,8 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty, llvm_unreachable("Invalid cast opcode"); case Instruction::Trunc: return getTrunc(C, Ty, OnlyIfReduced); + case Instruction::PtrToAddr: + return getPtrToAddr(C, Ty, OnlyIfReduced); case Instruction::PtrToInt: return getPtrToInt(C, Ty, OnlyIfReduced); case Instruction::IntToPtr: @@ -2280,6 +2283,20 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) { return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced); } +Constant *ConstantExpr::getPtrToAddr(Constant *C, Type *DstTy, + bool OnlyIfReduced) { + assert(C->getType()->isPtrOrPtrVectorTy() && + "PtrToAddr source must be pointer or pointer vector"); + assert(DstTy->isIntOrIntVectorTy() && + "PtrToAddr destination must be integer or integer vector"); + assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy)); + if (isa<VectorType>(C->getType())) + assert(cast<VectorType>(C->getType())->getElementCount() == + cast<VectorType>(DstTy)->getElementCount() && + "Invalid cast between a different number of vector elements"); + return getFoldedCast(Instruction::PtrToAddr, C, DstTy, OnlyIfReduced); +} + Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy, bool OnlyIfReduced) { assert(C->getType()->isPtrOrPtrVectorTy() && @@ -2435,6 +2452,7 @@ bool ConstantExpr::isDesirableCastOp(unsigned Opcode) { case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -2457,6 +2475,7 @@ bool ConstantExpr::isSupportedCastOp(unsigned Opcode) { case Instruction::FPToSI: return false; case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: @@ -3401,6 +3420,7 @@ Instruction 
*ConstantExpr::getAsInstruction() const { switch (getOpcode()) { case Instruction::Trunc: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index f1d4549..96065ed 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -57,15 +57,9 @@ DebugVariable::DebugVariable(const DbgVariableRecord *DVR) DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef<Metadata *> MDs, bool ImplicitCode) - : MDNode(C, DILocationKind, Storage, MDs) -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - , - AtomGroup(AtomGroup), AtomRank(AtomRank) -#endif -{ -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS + : MDNode(C, DILocationKind, Storage, MDs), AtomGroup(AtomGroup), + AtomRank(AtomRank) { assert(AtomRank <= 7 && "AtomRank number should fit in 3 bits"); -#endif if (AtomGroup) C.updateDILocationAtomGroupWaterline(AtomGroup + 1); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 7b799c7..11d33e2 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -404,6 +404,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases, return findBaseObject(CE->getOperand(0), Aliases, Op); } case Instruction::IntToPtr: + case Instruction::PtrToAddr: case Instruction::PtrToInt: case Instruction::BitCast: case Instruction::GetElementPtr: diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 49c6dc7..614c3a9 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -411,28 +411,16 @@ CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) { return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src); } -CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.start only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.start requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, {Ptr}); } -CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { +CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr) { assert(isa<PointerType>(Ptr->getType()) && "lifetime.end only applies to pointers."); - if (!Size) - Size = getInt64(-1); - else - assert(Size->getType() == getInt64Ty() && - "lifetime.end requires the size to be an i64"); - Value *Ops[] = { Size, Ptr }; - return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops); + return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, {Ptr}); } CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index b7cd12a..4540268 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -817,6 +817,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case UIToFP: return "uitofp"; case SIToFP: return "sitofp"; case IntToPtr: return "inttoptr"; + case PtrToAddr: return "ptrtoaddr"; case PtrToInt: return "ptrtoint"; case BitCast: return "bitcast"; case AddrSpaceCast: return "addrspacecast"; diff --git a/llvm/lib/IR/Instructions.cpp 
b/llvm/lib/IR/Instructions.cpp index b896382..a1751c0 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2798,6 +2798,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode, return false; case Instruction::BitCast: return true; // BitCast never modifies bits. + case Instruction::PtrToAddr: case Instruction::PtrToInt: return DL.getIntPtrType(SrcTy)->getScalarSizeInBits() == DestTy->getScalarSizeInBits(); @@ -2855,26 +2856,29 @@ unsigned CastInst::isEliminableCastPair( // same reason. const unsigned numCastOps = Instruction::CastOpsEnd - Instruction::CastOpsBegin; + // clang-format off static const uint8_t CastResults[numCastOps][numCastOps] = { - // T F F U S F F P I B A -+ - // R Z S P P I I T P 2 N T S | - // U E E 2 2 2 2 R E I T C C +- secondOp - // N X X U S F F N X N 2 V V | - // C T T I I P P C T T P T T -+ - { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+ - { 8, 1, 9,99,99, 2,17,99,99,99, 2, 3, 0}, // ZExt | - { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI | - { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP | - { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // FPTrunc | - { 99,99,99, 2, 2,99,99, 8, 2,99,99, 4, 0}, // FPExt | - { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt | - { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr | - { 5, 5, 5, 0, 0, 5, 5, 0, 0,16, 5, 1,14}, // BitCast | - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ + // T F F U S F F P P I B A -+ + // R Z S P P I I T P 2 2 N T S | + // U E E 2 2 2 2 R E I A T C C +- secondOp + // N X X U S F F N X N D 2 V V | + // C T T I I P P C T T R P T T -+ + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // Trunc -+ + { 8, 1, 9,99,99, 2,17,99,99,99,99, 2, 3, 0}, // ZExt | + { 8, 0, 1,99,99, 0, 2,99,99,99,99, 0, 3, 0}, // SExt | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToUI | + { 0, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // FPToSI | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // UIToFP +- firstOp + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // SIToFP | + { 99,99,99, 0, 0,99,99, 0, 0,99,99,99, 4, 0}, // FPTrunc | + { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | + { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | + { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // clang-format on // TODO: This logic could be encoded into the table above and handled in the // switch below. 
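To show how the new opcode is meant to be used from the constant-expression side: the hunks above add ConstantExpr::getPtrToAddr and treat ptrtoaddr as a no-op cast when the widths agree. A small sketch, assuming an existing DataLayout and a global G, and using DataLayout::getAddressType as the verifier hunk later in this patch does (the function name is illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

using namespace llvm;

// Build `ptrtoaddr (ptr @G to <addr type>)` as a constant expression. The
// result type is taken from the DataLayout, matching the verifier rule.
static Constant *buildPtrToAddrExpr(GlobalVariable *G, const DataLayout &DL) {
  Type *AddrTy = DL.getAddressType(G->getType());
  Constant *Addr =
      ConstantExpr::getPtrToAddr(G, AddrTy, /*OnlyIfReduced=*/false);

  // Mirrors the isNoopCast() condition above: when the widths agree, the
  // cast does not change any bits.
  bool IsNoop = DL.getIntPtrType(G->getType())->getScalarSizeInBits() ==
                AddrTy->getScalarSizeInBits();
  (void)IsNoop;
  return Addr;
}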
@@ -3046,6 +3050,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore); case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore); case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore); + case PtrToAddr: return new PtrToAddrInst (S, Ty, Name, InsertBefore); case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore); case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore); case BitCast: @@ -3347,6 +3352,7 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) { case Instruction::FPToSI: return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() && SrcEC == DstEC; + case Instruction::PtrToAddr: case Instruction::PtrToInt: if (SrcEC != DstEC) return false; @@ -3460,6 +3466,12 @@ PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name, assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); } +PtrToAddrInst::PtrToAddrInst(Value *S, Type *Ty, const Twine &Name, + InsertPosition InsertBefore) + : CastInst(Ty, PtrToAddr, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToAddr"); +} + IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name, InsertPosition InsertBefore) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { @@ -4427,6 +4439,10 @@ PtrToIntInst *PtrToIntInst::cloneImpl() const { return new PtrToIntInst(getOperand(0), getType()); } +PtrToAddrInst *PtrToAddrInst::cloneImpl() const { + return new PtrToAddrInst(getOperand(0), getType()); +} + IntToPtrInst *IntToPtrInst::cloneImpl() const { return new IntToPtrInst(getOperand(0), getType()); } diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index aa2a60e..e03f993 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -312,10 +312,8 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey { template <> struct MDNodeKeyImpl<DILocation> { Metadata *Scope; Metadata *InlinedAt; -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS uint64_t AtomGroup : 61; uint64_t AtomRank : 3; -#endif unsigned Line; uint16_t Column; bool ImplicitCode; @@ -323,36 +321,24 @@ template <> struct MDNodeKeyImpl<DILocation> { MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope, Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup, uint8_t AtomRank) - : Scope(Scope), InlinedAt(InlinedAt), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - AtomGroup(AtomGroup), AtomRank(AtomRank), -#endif - Line(Line), Column(Column), ImplicitCode(ImplicitCode) { - } + : Scope(Scope), InlinedAt(InlinedAt), AtomGroup(AtomGroup), + AtomRank(AtomRank), Line(Line), Column(Column), + ImplicitCode(ImplicitCode) {} MDNodeKeyImpl(const DILocation *L) : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()), -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()), -#endif Line(L->getLine()), Column(L->getColumn()), - ImplicitCode(L->isImplicitCode()) { - } + ImplicitCode(L->isImplicitCode()) {} bool isKeyOf(const DILocation *RHS) const { return Line == RHS->getLine() && Column == RHS->getColumn() && Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt() && - ImplicitCode == RHS->isImplicitCode() -#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS - && AtomGroup == RHS->getAtomGroup() && - AtomRank == RHS->getAtomRank(); -#else - ; -#endif + ImplicitCode == RHS->isImplicitCode() && + AtomGroup == RHS->getAtomGroup() && AtomRank == RHS->getAtomRank(); } unsigned getHashValue() const { -#ifdef 
EXPERIMENTAL_KEY_INSTRUCTIONS // Hashing AtomGroup and AtomRank substantially impacts performance whether // Key Instructions is enabled or not. We can't detect whether it's enabled // here cheaply; avoiding hashing zero values is a good approximation. This @@ -363,7 +349,6 @@ template <> struct MDNodeKeyImpl<DILocation> { if (AtomGroup || AtomRank) return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode, AtomGroup, (uint8_t)AtomRank); -#endif return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode); } }; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 129ca4a..5928c89 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -747,34 +747,28 @@ const Value *Value::stripAndAccumulateConstantOffsets( // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. - unsigned CurBitWidth = DL.getIndexTypeSizeInBits(V->getType()); - if (CurBitWidth == BitWidth) { - if (!GEP->accumulateConstantOffset(DL, Offset, ExternalAnalysis)) - return V; - } else { - APInt GEPOffset(CurBitWidth, 0); - if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) - return V; + APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0); + if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis)) + return V; - // Stop traversal if the pointer offset wouldn't fit in the bit-width - // provided by the Offset argument. This can happen due to AddrSpaceCast - // stripping. - if (GEPOffset.getSignificantBits() > BitWidth) - return V; + // Stop traversal if the pointer offset wouldn't fit in the bit-width + // provided by the Offset argument. This can happen due to AddrSpaceCast + // stripping. + if (GEPOffset.getSignificantBits() > BitWidth) + return V; - // External Analysis can return a result higher/lower than the value - // represents. We need to detect overflow/underflow. - APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); - if (!ExternalAnalysis) { - Offset += GEPOffsetST; - } else { - bool Overflow = false; - APInt OldOffset = Offset; - Offset = Offset.sadd_ov(GEPOffsetST, Overflow); - if (Overflow) { - Offset = OldOffset; - return V; - } + // External Analysis can return a result higher/lower than the value + // represents. We need to detect overflow/underflow. + APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth); + if (!ExternalAnalysis) { + Offset += GEPOffsetST; + } else { + bool Overflow = false; + APInt OldOffset = Offset; + Offset = Offset.sadd_ov(GEPOffsetST, Overflow); + if (Overflow) { + Offset = OldOffset; + return V; } } V = GEP->getPointerOperand(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ca3f148..1d3c379 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -566,6 +566,8 @@ private: void visitUIToFPInst(UIToFPInst &I); void visitSIToFPInst(SIToFPInst &I); void visitIntToPtrInst(IntToPtrInst &I); + void checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V); + void visitPtrToAddrInst(PtrToAddrInst &I); void visitPtrToIntInst(PtrToIntInst &I); void visitBitCastInst(BitCastInst &I); void visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -834,6 +836,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { &GV); Check(GV.getInitializer()->getType()->isSized(), "Global variable initializer must be sized", &GV); + visitConstantExprsRecursively(GV.getInitializer()); // If the global has common linkage, it must have a zero initializer and // cannot be constant. 
if (GV.hasCommonLinkage()) { @@ -2610,6 +2613,8 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { Check(CastInst::castIsValid(Instruction::BitCast, CE->getOperand(0), CE->getType()), "Invalid bitcast", CE); + else if (CE->getOpcode() == Instruction::PtrToAddr) + checkPtrToAddr(CE->getOperand(0)->getType(), CE->getType(), *CE); } void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) { @@ -3532,6 +3537,28 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) { visitInstruction(I); } +void Verifier::checkPtrToAddr(Type *SrcTy, Type *DestTy, const Value &V) { + Check(SrcTy->isPtrOrPtrVectorTy(), "PtrToAddr source must be pointer", V); + Check(DestTy->isIntOrIntVectorTy(), "PtrToAddr result must be integral", V); + Check(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToAddr type mismatch", + V); + + if (SrcTy->isVectorTy()) { + auto *VSrc = cast<VectorType>(SrcTy); + auto *VDest = cast<VectorType>(DestTy); + Check(VSrc->getElementCount() == VDest->getElementCount(), + "PtrToAddr vector length mismatch", V); + } + + Type *AddrTy = DL.getAddressType(SrcTy); + Check(AddrTy == DestTy, "PtrToAddr result must be address width", V); +} + +void Verifier::visitPtrToAddrInst(PtrToAddrInst &I) { + checkPtrToAddr(I.getOperand(0)->getType(), I.getType(), I); + visitInstruction(I); +} + void Verifier::visitPtrToIntInst(PtrToIntInst &I) { // Get the source and destination types Type *SrcTy = I.getOperand(0)->getType(); @@ -3547,7 +3574,7 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "PtrToInt Vector width mismatch", &I); + "PtrToInt Vector length mismatch", &I); } visitInstruction(I); @@ -3567,7 +3594,7 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { auto *VSrc = cast<VectorType>(SrcTy); auto *VDest = cast<VectorType>(DestTy); Check(VSrc->getElementCount() == VDest->getElementCount(), - "IntToPtr Vector width mismatch", &I); + "IntToPtr Vector length mismatch", &I); } visitInstruction(I); } @@ -6770,7 +6797,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - Value *Ptr = Call.getArgOperand(1); + Value *Ptr = Call.getArgOperand(0); Check(isa<AllocaInst>(Ptr) || isa<PoisonValue>(Ptr), "llvm.lifetime.start/end can only be used on alloca or poison", &Call); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 8c27958..d0c6144 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -443,7 +443,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, // MCAssembler::relaxAlign. auto *Sec = F->getParent(); if (!Sec->isLinkerRelaxable()) - Sec->setLinkerRelaxable(); + Sec->setFirstLinkerRelaxable(F->getLayoutOrder()); // Do not add data after a linker-relaxable instruction. The difference // between a new label and a label at or before the linker-relaxable // instruction cannot be resolved at assemble-time. 
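Stepping back to the lifetime-intrinsic changes above (AutoUpgrade, IRBuilder, and the verifier hunk just shown): the intrinsics are now size-less and take only the pointer, which the verifier reads from operand 0 and requires to be an alloca or poison. A minimal sketch of emitting the new form with the updated IRBuilder API (helper name is illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Wrap a stack slot with the new single-operand lifetime markers. The size is
// implied by the alloca itself, so no i64 size operand is created.
static AllocaInst *emitScopedSlot(IRBuilder<> &B, Type *Ty) {
  AllocaInst *Slot = B.CreateAlloca(Ty, /*ArraySize=*/nullptr, "slot");
  B.CreateLifetimeStart(Slot); // previously CreateLifetimeStart(Slot, SizeCI)
  // ... uses of Slot ...
  B.CreateLifetimeEnd(Slot);
  return Slot;
}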
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 27ca131..9ed6fd1 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -20,7 +20,7 @@ using namespace llvm; MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin) : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), - IsBss(IsBss), LinkerRelaxable(false), Name(Name) { + IsBss(IsBss), Name(Name) { DummyFragment.setParent(this); } diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 7ca26aa..df807fc 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -331,61 +331,34 @@ void InstrProfWriter::addDataAccessProfData( DataAccessProfileData = std::move(DataAccessProfDataIn); } -void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) { - assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength); - assert(!Trace.FunctionNameRefs.empty()); - if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) { - // Simply append the trace if we have not yet hit our reservoir size limit. - TemporalProfTraces.push_back(std::move(Trace)); - } else { - // Otherwise, replace a random trace in the stream. - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); - uint64_t RandomIndex = Distribution(RNG); - if (RandomIndex < TemporalProfTraces.size()) - TemporalProfTraces[RandomIndex] = std::move(Trace); - } - ++TemporalProfTraceStreamSize; -} - void InstrProfWriter::addTemporalProfileTraces( SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) { + if (TemporalProfTraces.size() > TemporalProfTraceReservoirSize) + TemporalProfTraces.truncate(TemporalProfTraceReservoirSize); for (auto &Trace : SrcTraces) if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength) Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength); llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); }); - // Assume that the source has the same reservoir size as the destination to - // avoid needing to record it in the indexed profile format. - bool IsDestSampled = - (TemporalProfTraceStreamSize > TemporalProfTraceReservoirSize); - bool IsSrcSampled = (SrcStreamSize > TemporalProfTraceReservoirSize); - if (!IsDestSampled && IsSrcSampled) { - // If one of the traces are sampled, ensure that it belongs to Dest. - std::swap(TemporalProfTraces, SrcTraces); - std::swap(TemporalProfTraceStreamSize, SrcStreamSize); - std::swap(IsDestSampled, IsSrcSampled); - } - if (!IsSrcSampled) { - // If the source stream is not sampled, we add each source trace normally. - for (auto &Trace : SrcTraces) - addTemporalProfileTrace(std::move(Trace)); + // If there are no source traces, it is probably because + // --temporal-profile-max-trace-length=0 was set to deliberately remove all + // traces. In that case, we do not want to increase the stream size + if (SrcTraces.empty()) return; - } - // Otherwise, we find the traces that would have been removed if we added - // the whole source stream. 
- SmallSetVector<uint64_t, 8> IndicesToReplace; - for (uint64_t I = 0; I < SrcStreamSize; I++) { - std::uniform_int_distribution<uint64_t> Distribution( - 0, TemporalProfTraceStreamSize); + // Add traces until our reservoir is full or we run out of source traces + auto SrcTraceIt = SrcTraces.begin(); + while (TemporalProfTraces.size() < TemporalProfTraceReservoirSize && + SrcTraceIt < SrcTraces.end()) + TemporalProfTraces.push_back(*SrcTraceIt++); + // Our reservoir is full, we need to sample the source stream + llvm::shuffle(SrcTraceIt, SrcTraces.end(), RNG); + for (uint64_t I = TemporalProfTraces.size(); + I < SrcStreamSize && SrcTraceIt < SrcTraces.end(); I++) { + std::uniform_int_distribution<uint64_t> Distribution(0, I); uint64_t RandomIndex = Distribution(RNG); if (RandomIndex < TemporalProfTraces.size()) - IndicesToReplace.insert(RandomIndex); - ++TemporalProfTraceStreamSize; + TemporalProfTraces[RandomIndex] = *SrcTraceIt++; } - // Then we insert a random sample of the source traces. - llvm::shuffle(SrcTraces.begin(), SrcTraces.end(), RNG); - for (const auto &[Index, Trace] : llvm::zip(IndicesToReplace, SrcTraces)) - TemporalProfTraces[Index] = std::move(Trace); + TemporalProfTraceStreamSize += SrcStreamSize; } void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fe34037..70ac68a 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -256,6 +256,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { case llvm::Instruction::FPToUI: case llvm::Instruction::FPToSI: case llvm::Instruction::FPExt: + case llvm::Instruction::PtrToAddr: case llvm::Instruction::PtrToInt: case llvm::Instruction::IntToPtr: case llvm::Instruction::SIToFP: diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 956047c..1a81d18 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1007,6 +1007,9 @@ static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) { return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI); case Instruction::Opcode::FPExt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt); + case Instruction::Opcode::PtrToAddr: + return static_cast<llvm::Instruction::CastOps>( + llvm::Instruction::PtrToAddr); case Instruction::Opcode::PtrToInt: return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt); case Instruction::Opcode::IntToPtr: diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index dc75878..b6a2f8a 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -385,7 +385,7 @@ size_t StringRef::count(StringRef Str) const { return Count; } -static unsigned GetAutoSenseRadix(StringRef &Str) { +unsigned llvm::getAutoSenseRadix(StringRef &Str) { if (Str.empty()) return 10; @@ -410,7 +410,7 @@ bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, unsigned long long &Result) { // Autosense radix if not specified. if (Radix == 0) - Radix = GetAutoSenseRadix(Str); + Radix = getAutoSenseRadix(Str); // Empty strings (after the radix autosense) are invalid. if (Str.empty()) return true; @@ -509,7 +509,7 @@ bool StringRef::consumeInteger(unsigned Radix, APInt &Result) { // Autosense radix if not specified. 
if (Radix == 0) - Radix = GetAutoSenseRadix(Str); + Radix = getAutoSenseRadix(Str); assert(Radix > 1 && Radix <= 36); diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index 4ed5982..f5c4778 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -305,7 +305,7 @@ llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) { return (REG_INVARG); len = preg->re_endp - pattern; } else { - len = strlen((const char *)pattern); + len = strlen(pattern); } /* do the mallocs early so failure handling is easy */ diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 201bfe0..d6a3d59 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1236,14 +1236,20 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, .add(MI.getOperand(3)); transferImpOps(MI, I, I); } else { + unsigned RegState = + getRenamableRegState(MI.getOperand(1).isRenamable()) | + getKillRegState( + MI.getOperand(1).isKill() && + MI.getOperand(1).getReg() != MI.getOperand(2).getReg() && + MI.getOperand(1).getReg() != MI.getOperand(3).getReg()); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 : AArch64::ORRv16i8)) .addReg(DstReg, RegState::Define | getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)); + .addReg(MI.getOperand(1).getReg(), RegState) + .addReg(MI.getOperand(1).getReg(), RegState); auto I2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1f5ff4e..3c06c6a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, if (IsCalleeWin64) { UseVarArgCC = true; } else { - UseVarArgCC = !Outs[i].IsFixed; + UseVarArgCC = ArgFlags.isVarArg(); } } @@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) + if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); } @@ -14742,6 +14742,106 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { return ResultSLI; } +static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64TargetLowering &TLI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + + if (!VT.isVector()) + return SDValue(); + + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // InstCombine does (not (neg a)) => (add a -1). + // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) + // Loop over all combinations of AND operands. 
+ for (int i = 1; i >= 0; --i) { + for (int j = 1; j >= 0; --j) { + SDValue O0 = N0->getOperand(i); + SDValue O1 = N1->getOperand(j); + SDValue Sub, Add, SubSibling, AddSibling; + + // Find a SUB and an ADD operand, one from each AND. + if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { + Sub = O0; + Add = O1; + SubSibling = N0->getOperand(1 - i); + AddSibling = N1->getOperand(1 - j); + } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { + Add = O0; + Sub = O1; + AddSibling = N0->getOperand(1 - i); + SubSibling = N1->getOperand(1 - j); + } else + continue; + + if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) + continue; + + // Constant ones is always righthand operand of the Add. + if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) + continue; + + if (Sub.getOperand(1) != Add.getOperand(0)) + continue; + + return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); + } + } + + // (or (and a b) (and (not a) c)) => (bsl a b c) + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getScalarSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + APInt Val1, Val2; + + if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && + ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && + (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType(), @@ -19419,106 +19519,6 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AArch64TargetLowering &TLI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); - - if (!VT.isVector()) - return SDValue(); - - if (VT.isScalableVector() && !Subtarget.hasSVE2()) - return SDValue(); - - if (VT.isFixedLengthVector() && - (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // InstCombine does (not (neg a)) => (add a -1). - // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) - // Loop over all combinations of AND operands. 
- for (int i = 1; i >= 0; --i) { - for (int j = 1; j >= 0; --j) { - SDValue O0 = N0->getOperand(i); - SDValue O1 = N1->getOperand(j); - SDValue Sub, Add, SubSibling, AddSibling; - - // Find a SUB and an ADD operand, one from each AND. - if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { - Sub = O0; - Add = O1; - SubSibling = N0->getOperand(1 - i); - AddSibling = N1->getOperand(1 - j); - } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { - Add = O0; - Sub = O1; - AddSibling = N0->getOperand(1 - i); - SubSibling = N1->getOperand(1 - j); - } else - continue; - - if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) - continue; - - // Constant ones is always righthand operand of the Add. - if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) - continue; - - if (Sub.getOperand(1) != Add.getOperand(0)) - continue; - - return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling); - } - } - - // (or (and a b) (and (not a) c)) => (bsl a b c) - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getScalarSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - APInt Val1, Val2; - - if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && - ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && - (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - if (FoundMatch) - return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to // convert to csel(ccmp(.., cc0)), depending on cc1: diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d068a12..b033f88 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7362,7 +7362,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc, V128, V128, V128, - asm#"2", ".8h", ".16b", ".16b", []>; + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>; let Predicates = [HasAES] in { def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc, V128, V64, V64, @@ -7374,10 +7376,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)), (extract_high_v2i64 (v2i64 V128:$Rm))))]>; } - - def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 
V128:$Rn))), - (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), - (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, @@ -7402,6 +7400,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, @@ -7483,6 +7482,7 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; } +let isCommutable = 1 in multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ac31236..8cfbff9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6055,6 +6055,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +let isCommutable = 1 in defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; @@ -6806,6 +6807,7 @@ defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn> defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +let isCommutable = 1 in defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; @@ -6822,6 +6824,7 @@ defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>; defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>; +let isCommutable = 0 in defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", @@ -6836,6 +6839,7 @@ defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; +let isCommutable = 0 in defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index b97d622..fd4ef2a 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri -// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri +// 1. 
MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri +// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ADDXri + ADDXri @@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { // Strategy used to split logical immediate bitmasks. enum class SplitStrategy { Intersect, + Disjoint, }; template <typename T> bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, @@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); + assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!"); // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> +static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, + T &Imm2Enc) { + assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!"); + + // Try to split a bitmask of the form 0b00000000011000000000011110000000 into + // two disjoint masks such as 0b00000000011000000000000000000000 and + // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the + // new masks match the original mask. + unsigned LowestBitSet = llvm::countr_zero(Imm); + unsigned LowestGapBitUnset = + LowestBitSet + llvm::countr_one(Imm >> LowestBitSet); + + // Create a mask for the least significant group of consecutive ones. + assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!"); + T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) - + (static_cast<T>(1) << LowestBitSet); + // Create a disjoint mask for the remaining ones. + T NewImm2 = Imm & ~NewImm1; + + // Do not split if NewImm2 is not a valid bitmask immediate. + if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) + return false; + + Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); + Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); + return true; +} + +template <typename T> bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, SplitStrategy Strategy, unsigned OtherOpc) { - // Try below transformation. + // Try below transformations. // - // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri - // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri + // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri + // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two - // bitmask immediates. It makes only two AND instructions instead of multiple - // mov + and instructions. + // bitmask immediates based on the given split strategy. It makes only two + // logical instructions instead of multiple mov + logic instructions. return splitTwoPartImm<T>( MI, @@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, case SplitStrategy::Intersect: SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); break; + case SplitStrategy::Disjoint: + SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; } if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); @@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= trySplitLogicalImm<uint64_t>( AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; + case AArch64::EORWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::EORXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI, + SplitStrategy::Disjoint); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index adc984a..1bc1d98 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ @@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", @@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ @@ -756,7 +759,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -766,7 +768,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd, FeatureFPAC]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index f136a184..a67bd42 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -585,8 +585,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ClMaxLifetimes); if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - uint64_t Size = - cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + uint64_t Size = *Info.AI->getAllocationSize(*DL); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), TagPCall, Size); diff --git 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f05add..5c94aeb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -554,7 +554,17 @@ static bool isUnpackedVectorVT(EVT VecVT) { VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; } -static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { +static InstructionCost getHistogramCost(const AArch64Subtarget *ST, + const IntrinsicCostAttributes &ICA) { + // We need to know at least the number of elements in the vector of buckets + // and the size of each element to update. + if (ICA.getArgTypes().size() < 2) + return InstructionCost::getInvalid(); + + // Only interested in costing for the hardware instruction from SVE2. + if (!ST->hasSVE2()) + return InstructionCost::getInvalid(); + Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements unsigned TotalHistCnts = 1; @@ -579,9 +589,11 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; TotalHistCnts = EC / NaturalVectorWidth; + + return InstructionCost(BaseHistCntCost * TotalHistCnts); } - return InstructionCost(BaseHistCntCost * TotalHistCnts); + return InstructionCost::getInvalid(); } InstructionCost @@ -597,10 +609,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { - case Intrinsic::experimental_vector_histogram_add: - if (!ST->hasSVE2()) - return InstructionCost::getInvalid(); - return getHistogramCost(ICA); + case Intrinsic::experimental_vector_histogram_add: { + InstructionCost HistCost = getHistogramCost(ST, ICA); + // If the cost isn't valid, we may still be able to scalarize + if (HistCost.isValid()) + return HistCost; + break; + } case Intrinsic::umin: case Intrinsic::umax: case Intrinsic::smin: @@ -3975,6 +3990,27 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead( return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; } +std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost( + Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const { + if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy()) + return std::nullopt; + if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16()) + return std::nullopt; + + Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext())); + InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; + Cost += InstCost(PromotedTy); + if (IncludeTrunc) + Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy, + TTI::CastContextHint::None, CostKind); + return Cost; +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, @@ -3997,6 +4033,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + // Increase the cost for half and bfloat types if not 
architecturally + // supported. + if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL || + ISD == ISD::FDIV || ISD == ISD::FREM) + if (auto PromotedCost = getFP16BF16PromoteCost( + Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true, + [&](Type *PromotedTy) { + return getArithmeticInstrCost(Opcode, PromotedTy, CostKind, + Op1Info, Op2Info); + })) + return *PromotedCost; + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4265,11 +4313,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( [[fallthrough]]; case ISD::FADD: case ISD::FSUB: - // Increase the cost for half and bfloat types if not architecturally - // supported. - if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || - (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) - return 2 * LT.first; if (!Ty->getScalarType()->isFP128Ty()) return LT.first; [[fallthrough]]; @@ -4371,25 +4414,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } if (Opcode == Instruction::FCmp) { - // Without dedicated instructions we promote f16 + bf16 compares to f32. - if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || - ValTy->getScalarType()->isBFloatTy()) { - Type *PromotedTy = - ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); - InstructionCost Cost = - getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - if (!Op1Info.isConstant() && !Op2Info.isConstant()) - Cost *= 2; - Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, - Op1Info, Op2Info); - if (ValTy->isVectorTy()) - Cost += getCastInstrCost( - Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), - VectorType::getInteger(cast<VectorType>(PromotedTy)), - TTI::CastContextHint::None, CostKind); - return Cost; - } + if (auto PromotedCost = getFP16BF16PromoteCost( + ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false, + [&](Type *PromotedTy) { + InstructionCost Cost = + getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, + CostKind, Op1Info, Op2Info); + if (isa<VectorType>(PromotedTy)) + Cost += getCastInstrCost( + Instruction::Trunc, + VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); + return Cost; + })) + return *PromotedCost; auto LT = getTypeLegalizationCost(ValTy); // Model unknown fp compares as a libcall. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7f45177..fa9b25a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -435,6 +435,14 @@ public: bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); } + /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the + /// architecture features are not present. 
+ std::optional<InstructionCost> + getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueInfo Op1Info, + TTI::OperandValueInfo Op2Info, bool IncludeTrunc, + std::function<InstructionCost(Type *)> InstCost) const; + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 010d0aaa..2155ace 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg(); bool Res; - if (Info.IsFixed && !UseVarArgsCCForFixed) { + if (!Flags.isVarArg() && !UseVarArgsCCForFixed) { if (!IsReturn) applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); @@ -361,7 +361,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { unsigned MaxSize = MemTy.getSizeInBytes() * 8; // For varargs, we always want to extend them to 8 bytes, in which case // we disable setting a max. - if (!Arg.IsFixed) + if (Arg.Flags[0].isVarArg()) MaxSize = 0; Register ValVReg = Arg.Regs[RegIndex]; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..626734a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. - // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. 
- const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). - if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. - const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..64a9bde 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. 
+ Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index fb83388..9d6584a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3212,6 +3212,44 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, Src = Src.getOperand(0); } + if (Mods != SISrcMods::NONE) + return true; + + // Convert various sign-bit masks on integers to src mods. Currently disabled + // for 16-bit types as the codegen replaces the operand without adding a + // srcmod. This is intentionally finding the cases where we are performing + // float neg and abs on int types, the goal is not to obtain two's complement + // neg or abs. Limit converison to select operands via the nonCanonalizing + // pattern. + // TODO: Add 16-bit support. + if (IsCanonicalizing) + return true; + + unsigned Opc = Src->getOpcode(); + EVT VT = Src.getValueType(); + if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) || + (VT != MVT::i32 && VT != MVT::i64)) + return true; + + ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1)); + if (!CRHS) + return true; + + // Recognise (xor a, 0x80000000) as NEG SrcMod. + // Recognise (and a, 0x7fffffff) as ABS SrcMod. + // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers. + if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } else if (Opc == ISD::AND && AllowAbs && + CRHS->getAPIntValue().isMaxSignedValue()) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) { + Mods |= SISrcMods::ABS | SISrcMods::NEG; + Src = Src.getOperand(0); + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a6e4a63..40d960e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, const LLT S32 = LLT::scalar(32); MachineRegisterInfo &MRI = *B.getMRI(); - std::tie(BaseReg, ImmOffset) = - AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no unsigned + // overflow. + bool CheckNUW = AMDGPU::isGFX1250(ST); + std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset( + MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW); // If BaseReg is a pointer, convert it to int. if (MRI.getType(BaseReg).isPointer()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index f580f43..c21a9a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -109,12 +109,17 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // Find AV_* registers assigned to AGPRs. 
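// A worked example for the GFX1250 restriction noted in splitBufferOffsets
// above: the hardware zero-extends voffset and the immediate offset from 32
// bits and adds them at wider width, so pulling a constant out of a 32-bit
// add is only equivalent to the original address when that add cannot wrap,
// i.e. when it carries the nuw flag. A small standalone demonstration of the
// mismatch when the add does wrap:
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Base = 0xFFFFFFF0u; // Candidate 32-bit voffset.
  uint32_t Imm = 0x20u;        // Candidate immediate offset.

  uint32_t Wrapped = Base + Imm;                   // 0x00000010 after wrap.
  uint64_t HwSum = uint64_t(Base) + uint64_t(Imm); // 0x100000010.

  // The split (zero-extend-then-add) form differs from the wrapped 32-bit
  // sum, so the fold is only legal when unsigned overflow is excluded.
  assert(uint64_t(Wrapped) != HwSum);
  return 0;
}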
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.isVectorSuperClass(VirtRegRC)) + if (!TRI.hasAGPRs(VirtRegRC)) continue; - const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + const TargetRegisterClass *AssignedRC = VirtRegRC; + if (TRI.hasVGPRs(VirtRegRC)) { + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + } LiveInterval &LI = LIS.getInterval(VReg); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ff8efd2..0d2feeb 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return false; } + // Packed math FP32 instructions typically accept SGPRs or VGPRs as source + // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read + // the first SGPR and use it for both the low and high operations. + if (isPackedFP32Inst(Opc) && isGFX12Plus()) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + const MCOperand &Src0 = Inst.getOperand(Src0Idx); + const MCOperand &Src1 = Inst.getOperand(Src1Idx); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool { + unsigned Mask = 1U << Index; + return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0); + }; + + if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/0)) + return false; + if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/1)) + return false; + + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) { + const MCOperand &Src2 = Inst.getOperand(Src2Idx); + if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/2)) + return false; + } + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - 
setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). 
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. 
- inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. 
DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. - Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. 
+ TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). 
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0c653b1..962c276 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F16_fake16_e64: case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_MAX_NUM_F64_e64: - case AMDGPU::V_PK_MAX_F16: { + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + case AMDGPU::V_PK_MAX_NUM_BF16: { if (MI.mayRaiseFPException()) return nullptr; @@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const { // Having a 0 op_sel_hi would require swizzling the output in the source // instruction, which we can't do. - unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 - : 0u; + unsigned UnsetMods = + (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16) + ? SISrcMods::OP_SEL_1 + : 0u; if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) return nullptr; return Src0; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8f44c03..1b7d65a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. 
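// The "ascending order with no gaps" assumption above is what lets
// getFirstUnallocated double as a register count. A tiny standalone
// illustration over a plain allocation mask (index i set <=> register i was
// assigned by the calling convention):
#include <cassert>
#include <vector>

static unsigned firstUnallocated(const std::vector<bool> &Alloc) {
  unsigned I = 0;
  while (I < Alloc.size() && Alloc[I])
    ++I;
  return I;
}

int main() {
  // Dense allocation of SGPR0..SGPR5: the first free index, 6, is also the
  // number of wave-dispatch SGPRs.
  assert(firstUnallocated({true, true, true, true, true, true, false}) == 6);
  // A gap would make the first free index undercount the assigned registers,
  // which is why the contiguity assumption matters.
  assert(firstUnallocated({true, true, false, true}) == 2);
  return 0;
}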
+ Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; @@ -6106,6 +6115,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, case MVT::f64: return true; case MVT::f16: + case MVT::bf16: return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); default: break; @@ -10877,6 +10887,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } } +// Return whether the operation has NoUnsignedWrap property. +static bool isNoUnsignedWrap(SDValue Addr) { + return (Addr.getOpcode() == ISD::ADD && + Addr->getFlags().hasNoUnsignedWrap()) || + Addr->getOpcode() == ISD::OR; +} + bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { return UseSelectionDAGPTRADD && PtrVT == MVT::i64; @@ -10898,8 +10915,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { if ((C1 = dyn_cast<ConstantSDNode>(N0))) N0 = SDValue(); else if (DAG.isBaseWithConstantOffset(N0)) { - C1 = cast<ConstantSDNode>(N0.getOperand(1)); - N0 = N0.getOperand(0); + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no + // unsigned overflow. + bool CheckNUW = AMDGPU::isGFX1250(*Subtarget); + if (!CheckNUW || isNoUnsignedWrap(N0)) { + C1 = cast<ConstantSDNode>(N0.getOperand(1)); + N0 = N0.getOperand(0); + } } if (C1) { diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index d8fe850..0a68512 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -51,7 +51,7 @@ static cl::opt<unsigned> namespace { enum HardClauseType { - // For GFX10: + // For GFX10 and GFX1250: // Texture, buffer, global or scratch memory instructions. HARDCLAUSE_VMEM, @@ -102,7 +102,8 @@ public: HardClauseType getHardClauseType(const MachineInstr &MI) { if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { - if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { + if (ST->getGeneration() == AMDGPUSubtarget::GFX10 || + ST->hasGFX1250Insts()) { if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { if (ST->hasNSAClauseBug()) { @@ -115,7 +116,6 @@ public: if (SIInstrInfo::isFLAT(MI)) return HARDCLAUSE_FLAT; } else { - assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); if (SIInstrInfo::isMIMG(MI)) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f20b22d..19e6bcf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -18,6 +18,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. 
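// A standalone model of the gfx12+ rule referenced above: if a packed-FP32
// source operand lives in SGPRs, the hardware can only read the first SGPR
// and reuses it for both the low and high operations, so the operand is
// treated as legal only when its op_sel and op_sel_hi bits are both clear.
// Names below are illustrative, not the SIInstrInfo API.
#include <cassert>

static bool isLegalPackedFP32SrcModel(bool SrcIsSGPR, unsigned SrcIndex,
                                      unsigned OpSel, unsigned OpSelHi) {
  if (!SrcIsSGPR)
    return true; // VGPR sources are unrestricted here.
  unsigned Mask = 1u << SrcIndex;
  return (OpSel & Mask) == 0 && (OpSelHi & Mask) == 0;
}

int main() {
  // src1 in SGPRs with its op_sel_hi bit set: rejected.
  assert(!isLegalPackedFP32SrcModel(true, 1, /*OpSel=*/0, /*OpSelHi=*/0b010));
  // The same bits with src1 in VGPRs: accepted.
  assert(isLegalPackedFP32SrcModel(false, 1, 0, 0b010));
  return 0;
}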
+ if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I)) + return false; + } + } + return true; } @@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. + if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && + MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { + constexpr const AMDGPU::OpName OpNames[] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; + + for (auto [I, OpName] : enumerate(OpNames)) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]); + if (static_cast<unsigned>(SrcIdx) == OpIdx && + !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO)) + return false; + } + } + if (!isLegalRegOperand(MRI, OpInfo, MO)) return false; @@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return true; } +bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO) const { + constexpr const unsigned NumOps = 3; + constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; + + assert(SrcN < NumOps); + + if (!MO) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]); + if (SrcIdx == -1) + return true; + MO = &MI.getOperand(SrcIdx); + } + + if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg())) + return true; + + int ModsIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]); + if (ModsIdx == -1) + return true; + + unsigned Mods = MI.getOperand(ModsIdx).getImm(); + bool OpSel = Mods & SISrcMods::OP_SEL_0; + bool OpSelHi = Mods & SISrcMods::OP_SEL_1; + + return !OpSel && !OpSelHi; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineFunction &MF = *MI.getParent()->getParent(); @@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) legalizeOpWithMove(MI, VOP3Idx[2]); + + // Fix the register class of packed FP32 instructions on gfx12+. See + // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information. + if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I)) + legalizeOpWithMove(MI, VOP3Idx[I]); + } + } } Register SIInstrInfo::readlaneVGPRToSGPR( diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e042b59..6b9403f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1287,6 +1287,19 @@ public: const MachineOperand &MO) const; bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const; + + /// Check if \p MO would be a legal operand for gfx12+ packed math FP32 + /// instructions. Packed math FP32 instructions typically accept SGPRs or + /// VGPRs as source operands. 
On gfx12+, if a source operand uses SGPRs, the + /// HW can only read the first SGPR and use it for both the low and high + /// operations. + /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2, + /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will + /// be used. + bool isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO = nullptr) const; + /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..49425d5 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. 
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 00dcb9b..1e3e9a2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 128; } +bool isPackedFP32Inst(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_F32_gfx12: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F32_gfx12: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMA_F32_gfx12: + return true; + default: + return false; + } +} + } // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1252e35..1bcd36c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg); bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); +LLVM_READONLY bool isPackedFP32Inst(unsigned Opc); + LLVM_READONLY bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ea99cc4..75d3cfa 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::BSWAP, VT, Expand); } + if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::SCMP, MVT::i32, Custom); + + if (!Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); @@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } +bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { + return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; +} + // FIXME: It might make sense to define 
the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, @@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op, return DAG.getBitcast(MVT::i32, Res); } +SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + // Determine if this is signed or unsigned comparison + bool IsSigned = (Op.getOpcode() == ISD::SCMP); + + // Special case for Thumb1 UCMP only + if (!IsSigned && Subtarget->isThumb1Only()) { + // For Thumb unsigned comparison, use this sequence: + // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags + // sbc r2, r2 ; r2 = r2 - r2 - !carry + // cmp r1, r0 ; compare RHS with LHS + // sbc r1, r1 ; r1 = r1 - r1 - !carry + // subs r0, r2, r1 ; r0 = r2 - r1 (final result) + + // First subtraction: LHS - RHS + SDValue Sub1WithFlags = DAG.getNode( + ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + SDValue Sub1Result = Sub1WithFlags.getValue(0); + SDValue Flags1 = Sub1WithFlags.getValue(1); + + // SUBE: Sub1Result - Sub1Result - !carry + // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned) + SDValue Sbc1 = + DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), + Sub1Result, Sub1Result, Flags1); + SDValue Sbc1Result = Sbc1.getValue(0); + + // Second comparison: RHS vs LHS (reverse comparison) + SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS); + + // SUBE: RHS - RHS - !carry + // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned) + SDValue Sbc2 = DAG.getNode( + ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags); + SDValue Sbc2Result = Sbc2.getValue(0); + + // Final subtraction: Sbc1Result - Sbc2Result (no flags needed) + SDValue Result = + DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result); + if (Op.getValueType() != MVT::i32) + Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType()); + + return Result; + } + + // For the ARM assembly pattern: + // subs r0, r0, r1 ; subtract RHS from LHS and set flags + // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for + // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for + // signed, LO for unsigned) + // ; if LHS == RHS, result remains 0 from the subs + + // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC + unsigned Opcode = ARMISD::SUBC; + + // Check if RHS is a subtraction against 0: (0 - X) + if (RHS.getOpcode() == ISD::SUB) { + SDValue SubLHS = RHS.getOperand(0); + SDValue SubRHS = RHS.getOperand(1); + + // Check if it's 0 - X + if (isNullConstant(SubLHS)) { + bool CanUseAdd = false; + if (IsSigned) { + // For SCMP: only if X is known to never be INT_MIN (to avoid overflow) + if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS) + .getSignedMinValue() + .isMinSignedValue()) { + CanUseAdd = true; + } + } else { + // For UCMP: only if X is known to never be zero + if (DAG.isKnownNeverZero(SubRHS)) { + CanUseAdd = true; + } + } + + if (CanUseAdd) { + Opcode = ARMISD::ADDC; + RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of + // LHS - (0 - X) + } + } + } + + // Generate the operation with flags + SDValue OpWithFlags; + if (Opcode == ARMISD::ADDC) { + // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) + OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); 
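// Looking back at the Thumb1 ISD::UCMP sequence above: its net effect,
// written as a standalone model, is that the two borrow-based values are -1
// exactly when LHS < RHS and when LHS > RHS respectively, and their
// difference is the -1/0/+1 result ucmp is defined to produce.
#include <cassert>
#include <cstdint>

static int32_t ucmpModel(uint32_t LHS, uint32_t RHS) {
  int32_t BorrowLT = (LHS < RHS) ? -1 : 0; // First borrow computation.
  int32_t BorrowGT = (LHS > RHS) ? -1 : 0; // Second (reversed) borrow.
  return BorrowLT - BorrowGT;              // Final subtraction.
}

int main() {
  assert(ucmpModel(1, 2) == -1);
  assert(ucmpModel(2, 2) == 0);
  assert(ucmpModel(3, 2) == 1);
  return 0;
}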
+ } else { + // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) + OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } + + SDValue OpResult = OpWithFlags.getValue(0); // The operation result + SDValue Flags = OpWithFlags.getValue(1); // The flags + + // Constants for conditional moves + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32); + + // Select condition codes based on signed vs unsigned + ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI; + ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO; + + // First conditional move: if greater than, set to 1 + SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32); + SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One, + GTCondValue, Flags); + + // Second conditional move: if less than, set to -1 + SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32); + SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, + LTCondValue, Flags); + + if (Op.getValueType() != MVT::i32) + Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType()); + + return Result2; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::UCMP: + case ISD::SCMP: + return LowerCMP(Op, DAG); } } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d..a84a3cb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -607,6 +607,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { @@ -904,6 +906,7 @@ class VectorType; void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 3955f2a..25ad9ec 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, default: { // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows // us to fold the constant into the cmp instruction. - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT); CC = ISD::SETGE; break; } @@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to // fold the constant into the cmp instruction. if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + // Doing a "icmp ugt i16 65535, %0" comparison should have been converted + // already to something else. 
Assert to make sure this assumption holds. + assert((!C->isAllOnes()) && "integer overflow in comparison transform"); + RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT); CC = ISD::SETUGE; break; } diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp index 73abfe7..306db6a 100644 --- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp +++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp @@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { for (LoadInst *LI : LoadsToProcess) { Value *V = LI->getPointerOperand(); - auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()); + auto *GV = dyn_cast<GlobalVariable>(V); // If we didn't find the global, we may need to walk through a level of // indirection. This generally happens at -O0. - if (!GV) + if (!GV) { if (auto *NestedLI = dyn_cast<LoadInst>(V)) { BasicBlock::iterator BBI(NestedLI); Value *Loaded = FindAvailableLoadedValue( NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr); GV = dyn_cast_or_null<GlobalVariable>(Loaded); + } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) { + for (auto &Use : NestedAlloca->uses()) { + auto *Store = dyn_cast<StoreInst>(Use.getUser()); + if (!Store) + continue; + + Value *StoredVal = Store->getValueOperand(); + if (!StoredVal) + continue; + + // Try direct global match + GV = dyn_cast<GlobalVariable>(StoredVal); + if (GV) + break; + + // If it's a load, check its source + if (auto *Load = dyn_cast<LoadInst>(StoredVal)) { + GV = dyn_cast<GlobalVariable>(Load->getPointerOperand()); + if (GV) + break; + + // If loading from an unmodified stack copy of the global, reuse the + // global's value. Note: we are just repeating what we are doing for + // the load case for the alloca store pattern. + BasicBlock::iterator BBI(Load); + Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(), + BBI, 0, nullptr, nullptr); + GV = dyn_cast<GlobalVariable>(Loaded); + if (GV) + break; + } + } } + } auto It = HandleMap.find(GV); if (It == HandleMap.end()) { diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index ffd900c..5153d24 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -56,6 +56,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_usum: + case Intrinsic::dx_imad: + case Intrinsic::dx_umad: return true; default: return false; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index a5bf0e5..6583a0f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, - Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits(); assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen"); MVT GRLenVT = GRLen == 32 ? 
MVT::i32 : MVT::i64; @@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, case LoongArchABI::ABI_LP64F: case LoongArchABI::ABI_ILP32D: case LoongArchABI::ABI_LP64D: - UseGPRForFloat = !IsFixed; + UseGPRForFloat = ArgFlags.isVarArg(); break; case LoongArchABI::ABI_ILP32S: case LoongArchABI::ABI_LP64S: @@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, // will not be passed by registers if the original type is larger than // 2*GRLen, so the register alignment rule does not apply. unsigned TwoGRLenInBytes = (2 * GRLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && + if (ArgFlags.isVarArg() && + ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); // Skip 'odd' register if necessary. @@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags, - CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) { + CCInfo, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(""); @@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags, - CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { + CCInfo, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(""); @@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full, - Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr)) + Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6b49a98f..f79ba74 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -330,7 +330,7 @@ private: unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index fda9d97..ca5d27d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -254,7 +254,8 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); + if (!F.getParent()->isLinkerRelaxable()) + F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index 9e8cd2e..13237c5 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -128,12 
+128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); } -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, - const char *Func) { +void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); - CallOperandIsFixed.push_back(IsFixed); } /// Identify lowered values that originated from f128, float and sret to vXfXX @@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands( OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); - CallOperandIsFixed.push_back(Outs[i].IsFixed); } } diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 4229da5..30b68e8 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -36,7 +36,7 @@ public: static bool originalEVTTypeIsVectorFloat(EVT Ty); static bool originalTypeIsVectorFloat(const Type *Ty); - void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func); + void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); void PreAnalyzeReturnValue(EVT ArgVT); @@ -86,10 +86,6 @@ private: /// vector. SmallVector<bool, 4> OriginalRetWasFloatVector; - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed, - SmallVector<bool, 4> CallOperandIsFixed; - // Used to handle MIPS16-specific calling convention tweaks. // FIXME: This should probably be a fully fledged calling convention. SpecialCallingConvType SpecialCallingConv; @@ -106,7 +102,6 @@ public: OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); OriginalArgWasFloatVector.clear(); - CallOperandIsFixed.clear(); PreAnalyzeCallOperands(Outs, FuncArgs, Func); } @@ -213,7 +208,6 @@ public: bool WasOriginalRetVectorFloat(unsigned ValNo) const { return OriginalRetWasFloatVector[ValNo]; } - bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index 555773a..fa49108 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { if (IsReturn) State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); else - State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func); + State.PreAnalyzeCallOperand(Info.Ty, Func); return CallLowering::OutgoingValueAssigner::assignArg( ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 39e184a..0e5c16c 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A> class CCIfOrigArgWasF128<CCAction A> : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>; -/// Match if this specific argument is a vararg. -/// This is slightly different fro CCIfIsVarArg which matches if any argument is -/// a vararg. 
-class CCIfArgIsVarArg<CCAction A> - : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>; - /// Match if the return was a floating point vector. class CCIfOrigArgWasNotVectorFloat<CCAction A> : CCIf<"!static_cast<MipsCCState *>(&State)" @@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[ ]>; def CC_Mips : CallingConv<[ - CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>, + CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>, CCDelegateTo<CC_Mips_FixedArg> ]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 15f45a1..d4f0cc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); + // Vector reduction operations. These may be turned into shuffle or tree + // reductions depending on what instructions are available for each type. + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + MVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM}, + VT, Custom); + } + } + // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) MAKE_CASE(NVPTXISD::FCOPYSIGN) + MAKE_CASE(NVPTXISD::FMAXNUM3) + MAKE_CASE(NVPTXISD::FMINNUM3) + MAKE_CASE(NVPTXISD::FMAXIMUM3) + MAKE_CASE(NVPTXISD::FMINIMUM3) MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) MAKE_CASE(NVPTXISD::STACKRESTORE) MAKE_CASE(NVPTXISD::STACKSAVE) @@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); } +/// Reduces the elements using the scalar operations provided. The operations +/// are sorted descending in number of inputs they take. The flags on the +/// original reduction operation will be propagated to each scalar operation. +/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction +/// used in ExpandReductions and SelectionDAG. +static SDValue buildTreeReduction( + const SmallVector<SDValue> &Elements, EVT EltTy, + ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops, + const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) { + // Build the reduction tree at each level, starting with all the elements. + SmallVector<SDValue> Level = Elements; + + unsigned OpIdx = 0; + while (Level.size() > 1) { + // Try to reduce this level using the current operator. + const auto [Op, NumInputs] = Ops[OpIdx]; + + // Build the next level by partially reducing all elements. + SmallVector<SDValue> ReducedLevel; + unsigned I = 0, E = Level.size(); + for (; I + NumInputs <= E; I += NumInputs) { + // Reduce elements in groups of [NumInputs], as much as possible. + ReducedLevel.push_back(DAG.getNode( + Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags)); + } + + if (I < E) { + // Handle leftover elements. + + if (ReducedLevel.empty()) { + // We didn't reduce anything at this level. We need to pick a smaller + // operator. 
+ ++OpIdx; + assert(OpIdx < Ops.size() && "no smaller operators for reduction"); + continue; + } + + // We reduced some things but there's still more left, meaning the + // operator's number of inputs doesn't evenly divide this level size. Move + // these elements to the next level. + for (; I < E; ++I) + ReducedLevel.push_back(Level[I]); + } + + // Process the next level. + Level = ReducedLevel; + } + + return *Level.begin(); +} + +// Get scalar reduction opcode +static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return ISD::FMAXNUM; + case ISD::VECREDUCE_FMIN: + return ISD::FMINNUM; + case ISD::VECREDUCE_FMAXIMUM: + return ISD::FMAXIMUM; + case ISD::VECREDUCE_FMINIMUM: + return ISD::FMINIMUM; + default: + llvm_unreachable("unhandled reduction opcode"); + } +} + +/// Get 3-input scalar reduction opcode +static std::optional<NVPTXISD::NodeType> +getScalar3OpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return NVPTXISD::FMAXNUM3; + case ISD::VECREDUCE_FMIN: + return NVPTXISD::FMINNUM3; + case ISD::VECREDUCE_FMAXIMUM: + return NVPTXISD::FMAXIMUM3; + case ISD::VECREDUCE_FMINIMUM: + return NVPTXISD::FMINIMUM3; + default: + return std::nullopt; + } +} + +/// Lower reductions to either a sequence of operations or a tree if +/// reassociations are allowed. This method will use larger operations like +/// max3/min3 when the target supports them. +SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const SDNodeFlags Flags = Op->getFlags(); + SDValue Vector = Op.getOperand(0); + + const unsigned Opcode = Op->getOpcode(); + const EVT EltTy = Vector.getValueType().getVectorElementType(); + + // Whether we can use 3-input min/max when expanding the reduction. + const bool CanUseMinMax3 = + EltTy == MVT::f32 && STI.getSmVersion() >= 100 && + STI.getPTXVersion() >= 88 && + (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN || + Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM); + + // A list of SDNode opcodes with equivalent semantics, sorted descending by + // number of inputs they take. + SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps; + + if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode); + CanUseMinMax3 && Opcode3Elem) + ScalarOps.push_back({*Opcode3Elem, 3}); + ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2}); + + SmallVector<SDValue> Elements; + DAG.ExtractVectorElements(Vector, Elements); + + return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. 
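For illustration only, here is a minimal standalone sketch of the grouping strategy that buildTreeReduction applies, written with plain floats rather than SDValues; max3/max2 stand in for the 3-input and 2-input NVPTXISD nodes, and this is not the SelectionDAG code itself:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static float max3(float a, float b, float c) { return std::max({a, b, c}); }
static float max2(float a, float b) { return std::max(a, b); }

// Mirrors the level walk above: reduce adjacent elements in groups of three
// while a full group exists, carry leftovers into the next level, and fall
// back to the two-input operation once no full group of three remains.
static float treeReduceFMax(std::vector<float> Level) {
  unsigned Arity = 3; // widest operation first, like FMAXIMUM3
  while (Level.size() > 1) {
    std::vector<float> Next;
    size_t I = 0, E = Level.size();
    for (; I + Arity <= E; I += Arity)
      Next.push_back(Arity == 3 ? max3(Level[I], Level[I + 1], Level[I + 2])
                                : max2(Level[I], Level[I + 1]));
    if (Next.empty()) { // nothing reduced at this level: use the smaller op
      Arity = 2;
      continue;
    }
    for (; I < E; ++I) // leftovers are forwarded to the next level unchanged
      Next.push_back(Level[I]);
    Level = std::move(Next);
  }
  return Level.front();
}

int main() {
  std::vector<float> V = {1, 7, 3, 9, 2, 8, 4, 6};
  std::printf("%g\n", treeReduceFMax(V)); // prints 9
}

For eight lanes the level sizes go 8 -> 4 -> 2 -> 1: two max3 groups plus two leftovers, then one max3 plus one leftover, then a final two-input max.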
@@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: + return LowerVECREDUCE(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: @@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_prefetch_tensormap: { + auto &DL = I.getDataLayout(); + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = getPointerTy(DL); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; + Info.align.reset(); + return true; + } + case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index cf72a1e..43e721a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -64,6 +64,11 @@ enum NodeType : unsigned { UNPACK_VECTOR, FCOPYSIGN, + FMAXNUM3, + FMINNUM3, + FMAXIMUM3, + FMINIMUM3, + DYNAMIC_STACKALLOC, STACKRESTORE, STACKSAVE, @@ -286,6 +291,7 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index aac611d..1ab41bf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>; } +// Template for 3-input minimum/maximum instructions +// (sm_100+/PTX 8.8 and f32 only) +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32 functions. +multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> { + defvar nan_str = !if(NaN, ".NaN", ""); + def f32rrr : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, B32:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rri : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rii : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, f32imm:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; +} + // Template for instructions which take three FP args. The // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). 
// @@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>; defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>; +def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; + +defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>; +defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>; +defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>; +defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>; + defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; defm FABS_H: F2_Support_Half<"abs", fabs>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d337192..d4a0ca7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -39,6 +39,12 @@ def AS_match { code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; + code const = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + }]; + code param = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + }]; } @@ -950,33 +956,47 @@ foreach dim = 3...5 in { defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", [hasTMACTAGroupSupport]>; -//Prefetch and Prefetchu - -let Predicates = [hasPTX<80>, hasSM<90>] in { - class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>; +//Prefetchu and Prefetch - def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; - def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; - def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; - def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; - def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; - def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; +defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr); - def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; +multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> { + def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>; +} - def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>; - def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> { + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch" # addrspace_name # ".tensormap", + [(pattern_frag addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; } +defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>; +defm PREFETCH_GENERIC_TENSORMAP : 
PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>; +defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>; + +class PREFETCH_INTRS<string InstName, Intrinsic Intr> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(Intr addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>; +def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>; +def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>; +def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; + //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d..f4f8961 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + } } return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 9a6e261..b32d931b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -87,6 +87,13 @@ public: } unsigned getMinVectorRegisterBitWidth() const override { return 32; } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + // Turn off ExpandReductions pass for NVPTX, which doesn't have advanced + // swizzling operations. Our backend/Selection DAG can expand these + // reductions with less movs. + return false; + } + // We don't want to prevent inlining because of target-cpu and -features // attributes that were added to newer versions of LLVM/Clang: There are // no incompatible functions in PTX, ptxas will throw errors in such cases. 
diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h index b0e50b2..feab9c5 100644 --- a/llvm/lib/Target/PowerPC/PPCCCState.h +++ b/llvm/lib/Target/PowerPC/PPCCCState.h @@ -38,36 +38,6 @@ public: void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); } }; -class AIXCCState : public CCState { -private: - BitVector IsFixed; - -public: - AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C) - : CCState(CC, IsVarArg, MF, Locs, C) {} - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - // All formal arguments are fixed. - IsFixed.resize(Ins.size(), true); - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - IsFixed.resize(Outs.size(), false); - for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo) - if (Outs[ValNo].IsFixed) - IsFixed.set(ValNo); - - CCState::AnalyzeCallOperands(Outs, Fn); - } - - bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); } -}; - } // end namespace llvm #endif diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 196574e..2698bd6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -6089,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; - if (Outs[i].IsFixed) { + if (!ArgFlags.isVarArg()) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { @@ -6905,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) { static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &S) { - AIXCCState &State = static_cast<AIXCCState &>(S); + CCState &State) { const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( State.getMachineFunction().getSubtarget()); const bool IsPPC64 = Subtarget.isPPC64(); @@ -7090,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, // They are passed in VRs if any are available (unlike arguments passed // through ellipses) and shadow GPRs (unlike arguments to non-vaarg // functions) - if (State.isFixed(ValNo)) { + if (!ArgFlags.isVarArg()) { if (MCRegister VReg = State.AllocateReg(VR)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo)); // Shadow allocate GPRs and stack space even though we pass in a VR. @@ -7278,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); const EVT PtrVT = getPointerTy(MF.getDataLayout()); // Reserve space for the linkage area on the stack. @@ -7625,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( MachineFunction &MF = DAG.getMachineFunction(); SmallVector<CCValAssign, 16> ArgLocs; - AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs, - *DAG.getContext()); + CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs, + *DAG.getContext()); // Reserve space for the linkage save area (LSA) on the stack. 
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA: diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 76dca47..f123040 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SpillsKnownBit = true; break; default: + // When spilling a CR bit, the super register may not be explicitly defined + // (i.e. it can be defined by a CR-logical that only defines the subreg) so + // we state that the CR field is undef. Also, in order to preserve the kill + // flag on the CR bit, we add it as an implicit use. + // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all // bits (specifically, it produces a -1 if the CR bit is set). Ultimately, // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit // register), and SETNBC will set this. if (Subtarget.isISA3_1()) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg) - .addReg(SrcReg, RegState::Undef); + .addReg(SrcReg, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } @@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT || SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) { BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg) - .addReg(getCRFromCRBit(SrcReg), RegState::Undef); + .addReg(getCRFromCRBit(SrcReg), RegState::Undef) + .addReg(SrcReg, RegState::Implicit | + getKillRegState(MI.getOperand(0).isKill())); break; } } // We need to move the CR field that contains the CR bit we are spilling. - // The super register may not be explicitly defined (i.e. it can be defined - // by a CR-logical that only defines the subreg) so we state that the CR - // field is undef. Also, in order to preserve the kill flag on the CR bit, - // we add it as an implicit use. BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFOCRF8 : PPC::MFOCRF), Reg) .addReg(getCRFromCRBit(SrcReg), RegState::Undef) .addReg(SrcReg, diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index 95de9f3..4039fed 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -22,3 +22,9 @@ bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE && Opcode <= PPCISD::LAST_STRICTFP_OPCODE; } + +std::pair<SDValue, SDValue> PPCSelectionDAGInfo::EmitTargetCodeForMemcmp( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, + SDValue Op3, const CallInst *CI) const { + return DAG.getMemcmp(Chain, dl, Op1, Op2, Op3, CI); +} diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index 08e2ddb..1537851 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -20,6 +20,11 @@ public: bool isTargetMemoryOpcode(unsigned Opcode) const override; bool isTargetStrictFPOpcode(unsigned Opcode) const override; + + std::pair<SDValue, SDValue> + EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, + SDValue Op1, SDValue Op2, SDValue Op3, + const CallInst *CI) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 67cc01e..e0ac591 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -674,6 +674,9 @@ static constexpr FeatureBitset XAndesGroup = { static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions + {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, + {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, + {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"}, {DecoderTableXVentana32, {RISCV::FeatureVendorXVentanaCondOps}, "XVentanaCondOps"}, @@ -690,9 +693,6 @@ static constexpr DecoderListEntry DecoderList32[]{ "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, // Standard Extensions - {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, - {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"}, - {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, {DecoderTableZfinx32, {}, "Zfinx (Float in Integer)"}, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index d2b75a6..34026ed 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -45,8 +45,8 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, - IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; StackSize = State.getStackSize(); @@ -196,8 +196,8 @@ public: if (LocVT.isScalableVector()) MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall(); - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, - /*IsFixed=*/true, IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; 
StackSize = State.getStackSize(); @@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, - /*IsFixed=*/true, /*isRet=*/true, nullptr)) + /*isRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 95ec42f..8d956ce 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,6 +32,11 @@ static cl::opt<bool> ULEB128Reloc( "riscv-uleb128-reloc", cl::init(true), cl::Hidden, cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate")); +static cl::opt<bool> + AlignRvc("riscv-align-rvc", cl::init(true), cl::Hidden, + cl::desc("When generating R_RISCV_ALIGN, insert $alignment-2 " + "bytes of NOPs even in norvc code")); + RISCVAsmBackend::RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, const MCTargetOptions &Options) : MCAsmBackend(llvm::endianness::little), STI(STI), OSABI(OSABI), @@ -306,12 +311,21 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, // If conditions are met, compute the padding size and create a fixup encoding // the padding size in the addend. bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { - // Use default handling unless linker relaxation is enabled and the alignment - // is larger than the nop size. - const MCSubtargetInfo *STI = F.getSubtargetInfo(); - if (!STI->hasFeature(RISCV::FeatureRelax)) + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) return false; - unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; + + // Use default handling unless the alignment is larger than the nop size. + const MCSubtargetInfo *STI = F.getSubtargetInfo(); + unsigned MinNopLen = + AlignRvc || STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; if (F.getAlignment() <= MinNopLen) return false; @@ -321,7 +335,6 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); return true; } @@ -471,9 +484,12 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, Count -= 1; } + // TODO: emit a mapping symbol right here + if (Count % 4 == 2) { - // The canonical nop with Zca is c.nop. - OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2); + // The canonical nop with Zca is c.nop. For .balign 4, we generate a 2-byte + // c.nop even in a norvc region. 
+ OS.write("\x01\0", 2); Count -= 2; } diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index cb6117e..70127e3 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, // Implements the RISC-V calling convention. Returns true upon failure. bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); @@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, break; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: - UseGPRForF16_F32 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); break; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: - UseGPRForF16_F32 = !IsFixed; - UseGPRForF64 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); + UseGPRForF64 = ArgFlags.isVarArg(); break; } @@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // currently if we are using ILP32E calling convention. This behavior may be // changed when RV32E/ILP32E is ratified. unsigned TwoXLenInBytes = (2 * XLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && + if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && ABI != RISCVABI::ABI_ILP32E) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); @@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // benchmark. But theoretically, it may have benefit for some cases. 
bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy) { + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet, + Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering(); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h index bf823b7..2030ce1 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.h +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -21,15 +21,15 @@ namespace llvm { typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 03e54b3..e63b937 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1844,6 +1844,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + // Operands are (vec, ..., vec, ptr, offset, mask, vl) + return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vlm: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, @@ -11084,69 +11095,118 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } -SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntNo = Op.getConstantOperandVal(1); +static SDValue +lowerFixedVectorSegStoreIntrinsics(unsigned IntNo, SDValue Op, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + bool IsStrided; switch (IntNo) { - default: - break; case Intrinsic::riscv_seg2_store_mask: case Intrinsic::riscv_seg3_store_mask: case Intrinsic::riscv_seg4_store_mask: case Intrinsic::riscv_seg5_store_mask: case Intrinsic::riscv_seg6_store_mask: case Intrinsic::riscv_seg7_store_mask: - case Intrinsic::riscv_seg8_store_mask: { - SDLoc DL(Op); - static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; + case Intrinsic::riscv_seg8_store_mask: + IsStrided = 
false; + break; + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + IsStrided = true; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } - // Operands: (chain, int_id, vec*, ptr, mask, vl) - unsigned NF = Op->getNumOperands() - 5; - assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); - MVT XLenVT = Subtarget.getXLenVT(); - MVT VT = Op->getOperand(2).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * - ContainerVT.getScalarSizeInBits(); - EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); + SDLoc DL(Op); + static const Intrinsic::ID VssegInts[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + static const Intrinsic::ID VsssegInts[] = { + Intrinsic::riscv_vssseg2_mask, Intrinsic::riscv_vssseg3_mask, + Intrinsic::riscv_vssseg4_mask, Intrinsic::riscv_vssseg5_mask, + Intrinsic::riscv_vssseg6_mask, Intrinsic::riscv_vssseg7_mask, + Intrinsic::riscv_vssseg8_mask}; + + // Operands: (chain, int_id, vec*, ptr, mask, vl) or + // (chain, int_id, vec*, ptr, stride, mask, vl) + unsigned NF = Op->getNumOperands() - (IsStrided ? 6 : 5); + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getOperand(2).getSimpleValueType(); + MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget); + unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * + ContainerVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); - SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); - MVT MaskVT = Mask.getSimpleValueType(); - MVT MaskContainerVT = - ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); - Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); + MVT MaskVT = Mask.getSimpleValueType(); + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); - SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); - SDValue Ptr = Op->getOperand(NF + 2); + SDValue IntID = DAG.getTargetConstant( + IsStrided ? 
VsssegInts[NF - 2] : VssegInts[NF - 2], DL, XLenVT); + SDValue Ptr = Op->getOperand(NF + 2); - auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); + auto *FixedIntrinsic = cast<MemIntrinsicSDNode>(Op); - SDValue StoredVal = DAG.getUNDEF(VecTupTy); - for (unsigned i = 0; i < NF; i++) - StoredVal = DAG.getNode( - RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, - convertToScalableVector( - ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget), - DAG.getTargetConstant(i, DL, MVT::i32)); + SDValue StoredVal = DAG.getUNDEF(VecTupTy); + for (unsigned i = 0; i < NF; i++) + StoredVal = DAG.getNode( + RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + convertToScalableVector(ContainerVT, FixedIntrinsic->getOperand(2 + i), + DAG, Subtarget), + DAG.getTargetConstant(i, DL, MVT::i32)); + + SmallVector<SDValue, 10> Ops = { + FixedIntrinsic->getChain(), + IntID, + StoredVal, + Ptr, + Mask, + VL, + DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; + // Insert the stride operand. + if (IsStrided) + Ops.insert(std::next(Ops.begin(), 4), + Op.getOperand(Op.getNumOperands() - 3)); + + return DAG.getMemIntrinsicNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, + FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); +} + +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + case Intrinsic::riscv_sseg2_store_mask: + case Intrinsic::riscv_sseg3_store_mask: + case Intrinsic::riscv_sseg4_store_mask: + case Intrinsic::riscv_sseg5_store_mask: + case Intrinsic::riscv_sseg6_store_mask: + case Intrinsic::riscv_sseg7_store_mask: + case Intrinsic::riscv_sseg8_store_mask: + return lowerFixedVectorSegStoreIntrinsics(IntNo, Op, Subtarget, DAG); - SDValue Ops[] = { - FixedIntrinsic->getChain(), - IntID, - StoredVal, - Ptr, - Mask, - VL, - DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; - - return DAG.getMemIntrinsicNode( - ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, - FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); - } case Intrinsic::riscv_sf_vc_xv_se: return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE); case Intrinsic::riscv_sf_vc_iv_se: @@ -22282,8 +22342,8 @@ void RISCVTargetLowering::analyzeInputArgs( else if (In.isOrigArg()) ArgTy = FType->getParamType(In.getOrigArgIndex()); - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, IsRet, ArgTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -22300,8 +22360,8 @@ void RISCVTargetLowering::analyzeOutputArgs( ISD::ArgFlagsTy ArgFlags = Out.Flags; Type *OrigTy = CLI ? 
CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr; - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed, - IsRet, OrigTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -23083,7 +23143,7 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, /*IsRet=*/true, nullptr)) + /*IsRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 27ad10a..413ad8b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), @@ -642,11 +639,15 @@ let Predicates = [HasStdExtZbkb, IsRV32] in { def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. -def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)), +def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)), (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), (zexti16 (XLenVT GPR:$rs1))), (PACK (XLenVT GPR:$rs1), @@ -657,6 +658,13 @@ let Predicates = [HasStdExtZbkb, IsRV64] in { def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))), (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)), (zexti16 (i64 GPR:$rs1))), (PACKW GPR:$rs1, GPR:$rs2)>; @@ -669,7 +677,7 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), // ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can // also be a packh, it can be matched separately. 
def : Pat<(binop_allwusers<or> - (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)), + (or (shl GPR:$op1rs2, (XLenVT 24)), (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), (zexti16 (XLenVT GPR:$rs1))), (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; @@ -677,11 +685,11 @@ def : Pat<(binop_allwusers<or> def : Pat<(binop_allwusers<or> (or (zexti16 (XLenVT GPR:$rs1)), (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24))), + (shl GPR:$op1rs2, (XLenVT 24))), (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; def : Pat<(binop_allwusers<or> (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 24))), + (shl GPR:$op1rs1, (XLenVT 24))), (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index bf23812..24ebbc3 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,78 +13,113 @@ // //===----------------------------------------------------------------------===// -class SMX60IsWorstCaseMX<string mx, list<string> MxList> { - string LLMUL = LargestLMUL<MxList>.r; - bit c = !eq(mx, LLMUL); -} +//===----------------------------------------------------------------------===// +// Helpers + +// Maps LMUL string to corresponding value from the Values array +// LMUL values map to array indices as follows: +// MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3], +// M2 -> Values[4], M4 -> Values[5], M8 -> Values[6] +// Shorter lists are allowed, e.g., widening instructions don't work on M8 +class GetLMULValue<list<int> Values, string LMUL> { + defvar Index = !cond( + !eq(LMUL, "MF8"): 0, + !eq(LMUL, "MF4"): 1, + !eq(LMUL, "MF2"): 2, + !eq(LMUL, "M1"): 3, + !eq(LMUL, "M2"): 4, + !eq(LMUL, "M4"): 5, + !eq(LMUL, "M8"): 6, + ); -class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { - string LLMUL = LargestLMUL<MxList>.r; - int SSEW = SmallestSEW<mx, isF>.r; - bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); + assert !lt(Index, !size(Values)), + "Missing LMUL value for '" # LMUL # "'. " # + "Expected at least " # !add(Index, 1) # " elements, but got " # + !size(Values) # "."; + + int c = Values[Index]; } -defvar SMX60VLEN = 256; -defvar SMX60DLEN = !div(SMX60VLEN, 2); +// Returns BaseValue for LMUL values before startLMUL, Value for startLMUL, +// then doubles Value for each subsequent LMUL +// Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> returns: +// MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32 +// This is useful for modeling scheduling parameters that scale with LMUL. 
+class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> { + assert !le(BaseValue, Value), "BaseValue must be less-equal to Value"; + defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c; + defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c; + + // Calculate the difference in positions + defvar posDiff = !sub(currentPos, startPos); -class Get1248Latency<string mx> { + // Calculate Value * (2^posDiff) int c = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 4, - !eq(mx, "M8") : 8, - true: 1 + !eq(posDiff, 0) : Value, + !eq(posDiff, 1) : !mul(Value, 2), + !eq(posDiff, 2) : !mul(Value, 4), + !eq(posDiff, 3) : !mul(Value, 8), + !eq(posDiff, 4) : !mul(Value, 16), + !eq(posDiff, 5) : !mul(Value, 32), + !eq(posDiff, 6) : !mul(Value, 64), + true : BaseValue ); } -// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides -class Get4816Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 16, - true: 4 - ); +// Same as the previous function but BaseValue == Value +class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> { + int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c; +} + +// Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32 +class ConstOneUntilMF4ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c; } +// Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16 +class ConstOneUntilMF2ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c; +} + +// Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8 +class ConstOneUntilM1ThenDouble<string mx> { + int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c; +} + +//===----------------------------------------------------------------------===// +// Latency helper classes + // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max -class Get458Latency<string mx> { - int c = !cond( - !eq(mx, "M4") : 5, - !eq(mx, "M8") : 8, - true: 4 - ); +class Get4458Latency<string mx> { + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c; } -// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs -// Used for: widening operations +// Used for: widening operations (no M8) class Get4588Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 5, - !eq(mx, "M4") : 8, - !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c; } // Used for: mask-producing comparisons, carry ops with mask, FP comparisons class Get461018Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 6, - !eq(mx, "M4") : 10, - !eq(mx, "M8") : 18, - true: 4 - ); + int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; } -// Used for: e64 multiply pattern, complex ops -class Get781632Latency<string mx> { - int c = !cond( - !eq(mx, "M2") : 8, - !eq(mx, "M4") : 16, - !eq(mx, "M8") : 32, - true: 7 - ); +//===----------------------------------------------------------------------===// + +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); } +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = 
!and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -383,12 +418,13 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; } - let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VIALULat, ReleaseAtCycles = [4] in { // Pattern of vadd, vsub, vrsub: 4/4/5/8 // Pattern of vand, vor, vxor: 4/4/8/16 // They are grouped together, so we used the worst case 4/4/8/16 @@ -425,7 +461,7 @@ foreach mx = SchedMxList in { // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites - let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in { defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; @@ -461,15 +497,8 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - // Slightly reduced for fractional LMULs - defvar Multiplier = !cond( - !eq(mx, "MF8") : 12, - !eq(mx, "MF4") : 12, - !eq(mx, "MF2") : 12, - true: 24 - ); - - let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c; + let Latency = VIDivLat, ReleaseAtCycles = [12] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; } @@ -480,14 +509,8 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - // Slightly increased for integer LMULs - defvar Multiplier = !cond( - !eq(mx, "M2") : 2, - !eq(mx, "M4") : 2, - true: 1 - ); - - let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; + let Latency = VNarrowingLat, ReleaseAtCycles = [4] in { defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; @@ -501,16 +524,33 @@ foreach mx = SchedMxListW in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm 
"" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Latency of vsmul: e8/e16 = 4/4/5/8, e32 = 5/5/5/8, e64 = 7/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulLat = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c; + // Latency of vsmul: e8/e16/e32 = 1/2/4/8, e64 = 4/8/16/32 + // We use the worst-case until we can split the SEW. + defvar VSMulOcc = ConstValueUntilLMULThenDoubleBase<"M1", 1, 4, mx>.c; + // TODO: change WriteVSMulV/X to be defined with LMULSEWSchedWrites + let Latency = VSMulLat, ReleaseAtCycles = [VSMulOcc] in { + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + } + + defvar VSShiftLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSShiftOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSShiftLat, ReleaseAtCycles = [VSShiftOcc] in { + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 13. Vector Floating-Point Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 05d504c..6a1f4b3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -114,6 +114,9 @@ public: bool enableScalableVectorization() const override { return ST->hasVInstructions(); } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override { + return ST->hasVInstructions(); + } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { return ST->hasVInstructions() ? 
TailFoldingStyle::DataWithEVL diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 74aec4f..2b34f61 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -359,18 +359,15 @@ static void lowerExpectAssume(IntrinsicInst *II) { } } -static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, - ArrayRef<unsigned> OpNos) { - Function *F = nullptr; - if (OpNos.empty()) { - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); - } else { - SmallVector<Type *, 4> Tys; - for (unsigned OpNo : OpNos) - Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); - } - II->setCalledFunction(F); +static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) { + IRBuilder<> Builder(II); + auto *Alloca = cast<AllocaInst>(II->getArgOperand(0)); + std::optional<TypeSize> Size = + Alloca->getAllocationSize(Alloca->getDataLayout()); + Value *SizeVal = Builder.getInt64(Size ? *Size : -1); + Builder.CreateIntrinsic(NewID, Alloca->getType(), + {SizeVal, II->getArgOperand(0)}); + II->eraseFromParent(); return true; } @@ -406,8 +403,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_start: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_start); } else { II->eraseFromParent(); Changed = true; @@ -415,8 +412,8 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { break; case Intrinsic::lifetime_end: if (!STI.isShader()) { - Changed |= toSpvOverloadedIntrinsic( - II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); + Changed |= toSpvLifetimeIntrinsic( + II, Intrinsic::SPVIntrinsics::spv_lifetime_end); } else { II->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 1aa8efe..c0fc3a6 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs, if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128)) continue; // The fixed arguments to a varargs function still go in FP registers. - if (Outs[VA.getValNo()].IsFixed) + if (!Outs[VA.getValNo()].Flags.isVarArg()) continue; // This floating point argument should be reassigned. diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 25f4aac..fbb98ff 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -31,10 +31,6 @@ namespace SystemZ { class SystemZCCState : public CCState { private: - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed. - SmallVector<bool, 4> ArgIsFixed; - /// Records whether the value was widened from a short vector type. SmallVector<bool, 4> ArgIsShortVector; @@ -50,10 +46,6 @@ public: void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, CCAssignFn Fn) { - // Formal arguments are always fixed. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Ins.size(); ++i) - ArgIsFixed.push_back(true); // Record whether the call operand was a short vector. 
ArgIsShortVector.clear(); for (unsigned i = 0; i < Ins.size(); ++i) @@ -64,10 +56,6 @@ public: void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Outs.size(); ++i) - ArgIsFixed.push_back(Outs[i].IsFixed); // Record whether the call operand was a short vector. ArgIsShortVector.clear(); for (unsigned i = 0; i < Outs.size(); ++i) @@ -77,12 +65,11 @@ public: } // This version of AnalyzeCallOperands in the base class is not usable - // since we must provide a means of accessing ISD::OutputArg::IsFixed. + // since we must provide a means of accessing ISD::OutputArg::IsShortVector. void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, SmallVectorImpl<ISD::ArgFlagsTy> &Flags, CCAssignFn Fn) = delete; - bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } }; diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 0ad872b..059f31f 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A> "getSubtarget<SystemZSubtarget>().", F), A>; -// Match if this specific argument is a fixed (i.e. named) argument. -class CCIfFixed<CCAction A> - : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>; - -// Match if this specific argument is not a fixed (i.e. vararg) argument. -class CCIfNotFixed<CCAction A> - : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>; - // Match if this specific argument was widened from a short vector type. class CCIfShortVector<CCAction A> : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; @@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[ // Pass in STG registers: XMM1, ..., XMM6 CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, + CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, // Fail otherwise CCCustom<"CC_SystemZ_GHC_Error"> @@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[ // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, - V25, V27, V29, V31]>>>>, + CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30, + V25, V27, V29, V31]>>>>, // However, sub-128 vectors which need to go on the stack occupy just a // single 8-byte-aligned 8-byte stack slot. Pass as i64. @@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs. // Although we assign the f32 vararg to be bitcast, it will first be promoted // to an f64 within convertValVTToLocVT(). - CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>, + CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>, // Pointers are always passed in full 64-bit registers. CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>, // long double, can only be passed in GPR2 and GPR3, if available, // hence R2Q - CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, + CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, // Non fixed vector arguments are treated in the same way as long // doubles. 
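The CCIfFixed/CCIfNotFixed predicates removed here, and the generic CCIfArgFixed/CCIfArgVarArg ones taking over, both hinge on whether an argument is a named parameter or one matched by `...`. The following target-independent sketch shows that distinction, including the float-to-double default promotion that the XPLINK64 comment above relies on for f32 varargs.

// Illustrative sketch (not part of the patch): fixed (named) vs. variadic
// arguments. Named parameters keep their declared type; arguments matched by
// `...` undergo the default argument promotions (float -> double), which is
// why the rules above treat fixed FP arguments and FP varargs differently.
#include <cstdarg>
#include <cstdio>

// `count` is a fixed (named) argument; everything matched by `...` is vararg.
double sum(int count, ...) {
  va_list ap;
  va_start(ap, count);
  double total = 0.0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, double); // floats arrive promoted to double
  va_end(ap);
  return total;
}

int main() {
  float a = 1.5f, b = 2.5f;
  // Both floats are promoted to double before being passed through `...`.
  std::printf("%f\n", sum(2, a, b)); // prints 4.000000
  return 0;
}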
CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, + CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, // A SwiftSelf is passed in callee-saved R10. CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>, @@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, + CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, - V28, V29, V30, V31], 16, 8>>>>, + CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, + V28, V29, V30, V31], 16, 8>>>>, // The first 4 named float and double arguments are passed in registers // FPR0-FPR6. The rest will be passed in the user area. - CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, - CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, + CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f32], + CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, + CCIfType<[f64], + CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, // The first 2 long double arguments are passed in register FPR0/FPR2 // and FPR4/FPR6. The rest will be passed in the user area. - CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, + CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index d76babe..afe838a 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -181,8 +181,7 @@ static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg, std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp( SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1, - SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo, - MachinePointerInfo Op2PtrInfo) const { + SDValue Src2, SDValue Size, const CallInst *CI) const { SDValue CCReg; // Swap operands to invert CC == 1 vs. CC == 2 cases. 
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) { diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index c928f34..5a1e0cd 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -41,8 +41,7 @@ public: std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1, SDValue Src2, SDValue Size, - MachinePointerInfo Op1PtrInfo, - MachinePointerInfo Op2PtrInfo) const override; + const CallInst *CI) const override; std::pair<SDValue, SDValue> EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index f32c9bd..2611c29 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess( C2.ScaleCost, C2.SetupCost); } -bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Support only equal feature bitsets. Restriction should be relaxed in the - // future to allow inlining when callee's bits are subset of the caller's. - return CallerBits == CalleeBits; -} - unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (!Vector) diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index dc5736e..fc681de 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -65,9 +65,6 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const override; - /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index b03b350..fc852d0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -136,6 +136,15 @@ static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL, if (VT == MVT::f64) { return wasm::ValType::F64; } + if (VT == MVT::externref) { + return wasm::ValType::EXTERNREF; + } + if (VT == MVT::funcref) { + return wasm::ValType::FUNCREF; + } + if (VT == MVT::exnref) { + return wasm::ValType::EXNREF; + } LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT << "\n"); llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 3f80b2a..f9eba4b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, OutVal = FINode; } // Count the number of fixed args *after* legalization. 
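The deleted SystemZ areInlineCompatible required exactly equal feature bitsets, while its own comment suggested relaxing this to "callee's bits are a subset of the caller's". The sketch below shows that subset test with std::bitset standing in for llvm::FeatureBitset; which rule the generic TTI hook applies once this override is gone is not spelled out in the diff.

// Illustrative sketch (not part of the patch): the relaxed compatibility rule
// the removed comment describes. Names are invented for the example.
#include <bitset>
#include <cassert>

constexpr std::size_t NumFeatures = 64;
using FeatureBits = std::bitset<NumFeatures>;

// Inlining is safe when every feature the callee was compiled for is also
// available in the caller (callee is a subset of the caller).
bool calleeFitsInCaller(const FeatureBits &CallerBits,
                        const FeatureBits &CalleeBits) {
  return (CallerBits & CalleeBits) == CalleeBits;
}

int main() {
  FeatureBits Caller("1011"), CalleeOk("0011"), CalleeBad("0100");
  assert(calleeFitsInCaller(Caller, CalleeOk));   // subset: inlinable
  assert(!calleeFitsInCaller(Caller, CalleeBad)); // uses a missing feature
  return 0;
}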
- NumFixedArgs += Out.IsFixed; + NumFixedArgs += !Out.Flags.isVarArg(); } bool IsVarArg = CLI.IsVarArg; @@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( for (const ISD::OutputArg &Out : Outs) { assert(!Out.Flags.isByVal() && "byval is not valid for return values"); assert(!Out.Flags.isNest() && "nest is not valid for return values"); - assert(Out.IsFixed && "non-fixed return value is not valid"); + assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); if (Out.Flags.isInConsecutiveRegs()) diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0a6035..d9f4405 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -75,7 +75,7 @@ public: static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7}; - if (!Info.IsFixed) + if (Flags.isVarArg()) NumXMMRegs = State.getFirstUnallocated(XMMArgRegs); return Res; @@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Info.CallConv, Info.IsVarArg)) return false; - bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + bool IsFixed = + Info.OrigArgs.empty() ? true : !Info.OrigArgs.back().Flags[0].isVarArg(); if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 3320508..b775c43 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1821,7 +1821,7 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, // only used outside the region. if (Valid && Lifetimes.size() != 0) { auto *NewLifetime = Lifetimes[0]->clone(); - NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI); + NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(0), AI); NewLifetime->insertBefore(DomBB->getTerminator()->getIterator()); // All the outsided lifetime.start markers are no longer necessary. diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index da60f52..6ed3b62 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -226,13 +226,6 @@ public: /*IsVarArgs=*/false); } - static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL, - AllocaInst *Alloced) { - std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL); - uint64_t AsInt = AllocaTypeSize ? 
AllocaTypeSize->getFixedValue() : 0; - return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt); - } - bool expansionApplicableToFunction(Module &M, Function *F) { if (F->isIntrinsic() || !F->isVarArg() || F->hasFnAttribute(Attribute::Naked)) @@ -577,8 +570,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, AllocaInst *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_start"); - Builder.CreateLifetimeStart(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeStart(VaListInstance); Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); @@ -595,8 +587,7 @@ ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)}, {VaListInstance}); - Builder.CreateLifetimeEnd(VaListInstance, - sizeOfAlloca(Ctx, DL, VaListInstance)); + Builder.CreateLifetimeEnd(VaListInstance); if (Result->getType()->isVoidTy()) Builder.CreateRetVoid(); @@ -746,7 +737,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, // Initialize the fields in the struct Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeStart(Alloced); Frame.initializeStructAlloca(DL, Builder, Alloced); const unsigned NumArgs = FuncType->getNumParams(); @@ -762,7 +753,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument"); Builder.SetInsertPoint(CB); - Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeStart(VaList); } Builder.SetInsertPoint(CB); Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced)); @@ -802,9 +793,9 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, } if (VaList) - Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList)); + Builder.CreateLifetimeEnd(VaList); - Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Builder.CreateLifetimeEnd(Alloced); NewCB->setAttributes(PAL); NewCB->takeName(CB); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 47e017e..d7a2ef7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1532,6 +1532,51 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V, return nullptr; } +/// Helper to match idempotent binary intrinsics, namely, intrinsics where +/// `f(f(x, y), y) == f(x, y)` holds. +static bool isIdempotentBinaryIntrinsic(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::maximum: + case Intrinsic::minimum: + case Intrinsic::maximumnum: + case Intrinsic::minimumnum: + case Intrinsic::maxnum: + case Intrinsic::minnum: + return true; + default: + return false; + } +} + +/// Attempt to simplify value-accumulating recurrences of kind: +/// %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ] +/// %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b) +/// And let the idempotent binary intrinsic be hoisted, when the operands are +/// known to be loop-invariant. 
+static Value *foldIdempotentBinaryIntrinsicRecurrence(InstCombinerImpl &IC, + IntrinsicInst *II) { + PHINode *PN; + Value *Init, *OtherOp; + + // A binary intrinsic recurrence with loop-invariant operands is equivalent to + // `call @llvm.binary.intrinsic(Init, OtherOp)`. + auto IID = II->getIntrinsicID(); + if (!isIdempotentBinaryIntrinsic(IID) || + !matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) || + !IC.getDominatorTree().dominates(OtherOp, PN)) + return nullptr; + + auto *InvariantBinaryInst = + IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp); + if (isa<FPMathOperator>(InvariantBinaryInst)) + cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II); + return InvariantBinaryInst; +} + static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) { if (!CanReorderLanes) return nullptr; @@ -3912,6 +3957,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Value *Reverse = foldReversedIntrinsicOperands(II)) return replaceInstUsesWith(*II, Reverse); + if (Value *Res = foldIdempotentBinaryIntrinsicRecurrence(*this, II)) + return replaceInstUsesWith(*II, Res); + // Some intrinsics (like experimental_gc_statepoint) can be used in invoke // context, so it is handled in visitCallBase and we should trigger it. return visitCallBase(*II); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index fe0f308..b17cf17 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3042,7 +3042,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *V = LHS; unsigned MaskElems = Mask.size(); auto *SrcTy = cast<FixedVectorType>(V->getType()); - unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned VecBitWidth = DL.getTypeSizeInBits(SrcTy); unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); assert(SrcElemBitWidth && "vector elements must have a bitwidth"); unsigned SrcNumElems = SrcTy->getNumElements(); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8da65c5..50258af 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1211,23 +1211,19 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { return; if (!II.isLifetimeStartOrEnd()) return; - // Found lifetime intrinsic, add ASan instrumentation if necessary. - auto *Size = cast<ConstantInt>(II.getArgOperand(0)); - // If size argument is undefined, don't do anything. - if (Size->isMinusOne()) return; - // Check that size doesn't saturate uint64_t and can - // be stored in IntptrTy. - const uint64_t SizeValue = Size->getValue().getLimitedValue(); - if (SizeValue == ~0ULL || - !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) - return; // Find alloca instruction that corresponds to llvm.lifetime argument. - AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II.getArgOperand(0)); // We're interested only in allocas we can handle. if (!AI || !ASan.isInterestingAlloca(*AI)) return; + + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + // Check that size is known and can be stored in IntptrTy. 
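The identity `f(f(x, y), y) == f(x, y)` quoted above is what makes foldIdempotentBinaryIntrinsicRecurrence sound: folding a loop-invariant operand into an accumulator any number of times is the same as applying the intrinsic once to the start value. A small sketch, with std::max standing in for llvm.smax.

// Illustrative sketch (not part of the patch): the recurrence collapse
// performed by the new InstCombine fold, modeled in plain C++.
#include <algorithm>
#include <cassert>

int main() {
  int Init = 3, OtherOp = 7; // OtherOp must be loop-invariant
  // %max.acc = phi [ %max, %backedge ], [ Init, %entry ]
  // %max     = smax(%max.acc, OtherOp)
  int Acc = Init;
  for (int Iter = 0; Iter < 100; ++Iter)
    Acc = std::max(Acc, OtherOp); // f(f(x, y), y) == f(x, y)
  // ... so the whole recurrence is equivalent to one hoisted call.
  assert(Acc == std::max(Init, OtherOp));
  return 0;
}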
+ if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size)) + return; + bool DoPoison = (ID == Intrinsic::lifetime_end); - AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; + AllocaPoisonCall APC = {&II, AI, *Size, DoPoison}; if (AI->isStaticAlloca()) StaticAllocaPoisonCallVec.push_back(APC); else if (ClInstrumentDynamicAllocas) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index bcb90d6..fc34d14 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1469,22 +1469,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, size_t Size = memtag::getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - auto HandleLifetime = [&](IntrinsicInst *II) { - // Set the lifetime intrinsic to cover the whole alloca. This reduces the - // set of assumptions we need to make about the lifetime. Without this we - // would need to ensure that we can track the lifetime pointer to a - // constant offset from the alloca, and would still need to change the - // size to include the extra alignment we use for the untagging to make - // the size consistent. - // - // The check for standard lifetime below makes sure that we have exactly - // one set of start / end in any execution (i.e. the ends are not - // reachable from each other), so this will not cause any problems. - II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); - }; - llvm::for_each(Info.LifetimeStart, HandleLifetime); - llvm::for_each(Info.LifetimeEnd, HandleLifetime); - AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) { auto *User = U.getUser(); return User != AILong && !isa<LifetimeIntrinsic>(User); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7d3c940..6e81387 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3301,7 +3301,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; - AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(I.getArgOperand(0)); if (AI) LifetimeStartList.push_back(std::make_pair(&I, AI)); } diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 46b5673..9471ae3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -789,6 +789,13 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, bool NeedsMemMove = false; IRBuilder<> IRB(BB, IP); + auto GetAllocaSize = [&](AllocaInst *AI) { + return IRB.CreateMul( + IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), + ConstantInt::get(IntptrTy, + DL.getTypeAllocSize(AI->getAllocatedType()))); + }; + if (auto *A = dyn_cast<Argument>(V)) { assert(A->hasByValAttr() && "Type reset for non-byval argument?"); @@ -811,8 +818,12 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } } else if (auto *II = dyn_cast<LifetimeIntrinsic>(I)) { - Size = II->getArgOperand(0); - Dest = II->getArgOperand(1); + auto *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); + if (!AI) + return false; + + Size = GetAllocaSize(AI); + Dest = II->getArgOperand(0); } 
else if (auto *AI = dyn_cast<AllocaInst>(I)) { // We need to clear the types for new stack allocations (or else we might // read stale type information from a previous function execution). @@ -820,10 +831,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I))); IRB.SetInstDebugLocation(I); - Size = IRB.CreateMul( - IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy), - ConstantInt::get(IntptrTy, - DL.getTypeAllocSize(AI->getAllocatedType()))); + Size = GetAllocaSize(AI); Dest = I; } else { return false; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 938aab5..a7ba54f 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -582,15 +582,17 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB); + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); if (SwitchPhiDefBB == SwitchBlock) { TPaths = std::move(PathsToPhiDef); return; } + assert(MaxNumPaths >= PathsToPhiDef.size()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); if (PathsToSwitchBB.empty()) return; @@ -611,13 +613,16 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB) { + VisitedBlocks &VB, + unsigned PathsLimit) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { + if (Res.size() >= PathsLimit) + break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -653,8 +658,9 @@ private: // Direct predecessor, just add to the path. 
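The PathsLimit parameters threaded through getPathsFromStateDefMap and paths above divide the overall MaxNumPaths budget between the two enumeration stages, so the number of combined threading paths stays bounded instead of only being capped at the innermost append. Below is a toy sketch of that budget split; the inputs and helper names are invented.

// Illustrative sketch (not part of the patch): splitting one path budget
// across a two-stage path enumeration so the combined count stays bounded.
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

using Path = std::string;

// Enumerate at most Limit paths (stand-in for getPathsFromStateDefMap/paths).
std::vector<Path> enumerate(const std::vector<Path> &All, unsigned Limit) {
  std::vector<Path> Res;
  for (const Path &P : All) {
    if (Res.size() >= Limit)
      break;
    Res.push_back(P);
  }
  return Res;
}

int main() {
  const unsigned MaxNumPaths = 8; // overall budget
  std::vector<Path> ToPhiDef = {"a", "b", "c"};
  std::vector<Path> ToSwitch = {"x", "y", "z", "w", "v"};

  std::vector<Path> PathsToPhiDef = enumerate(ToPhiDef, MaxNumPaths);
  // Divide the budget among the prefixes already found.
  unsigned PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // 8 / 3 = 2
  std::vector<Path> PathsToSwitchBB = enumerate(ToSwitch, PathsLimit);

  std::vector<Path> Combined;
  for (const Path &A : PathsToPhiDef)
    for (const Path &B : PathsToSwitchBB)
      Combined.push_back(A + B);
  assert(Combined.size() <= MaxNumPaths); // 3 * 2 = 6 <= 8
  std::printf("%zu combined paths\n", Combined.size());
  return 0;
}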
if (IncomingPhiDefBB == IncomingBB) { - std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + assert(PathsLimit > Res.size()); + std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( + StateDef, IncomingPhi, VB, PathsLimit - Res.size()); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -667,13 +673,17 @@ private: continue; PathsType IntermediatePaths; - IntermediatePaths = - paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); + assert(PathsLimit > Res.size()); + auto InterPathLimit = PathsLimit - Res.size(); + IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, + /* PathDepth = */ 1, InterPathLimit); if (IntermediatePaths.empty()) continue; + assert(InterPathLimit >= IntermediatePaths.size()); + auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -688,7 +698,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth) { + unsigned PathDepth, unsigned PathsLimit) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -715,6 +725,8 @@ private: // is used to prevent a duplicate path from being generated SmallSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { + if (Res.size() >= PathsLimit) + break; if (!Successors.insert(Succ).second) continue; @@ -736,14 +748,12 @@ private: // coverage and compile time. if (LI->getLoopFor(Succ) != CurrLoop) continue; - - PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); + assert(PathsLimit > Res.size()); + PathsType SuccPaths = + paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); - if (Res.size() >= MaxNumPaths) { - return Res; - } } } // This block could now be visited again from a different predecessor. 
Note diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 9b87180..f46d54b 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1363,7 +1363,7 @@ struct DSEState { if (auto *CB = dyn_cast<CallBase>(I)) { if (CB->getIntrinsicID() == Intrinsic::lifetime_end) return { - std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; + std::make_pair(MemoryLocation::getForArgument(CB, 0, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 85ee824..a097d33 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -434,7 +434,7 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, NewV = NewV->stripPointerCasts(); Function *NewDecl = Intrinsic::getOrInsertDeclaration( M, II->getIntrinsicID(), {NewV->getType()}); - II->setArgOperand(1, NewV); + II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; } @@ -491,7 +491,7 @@ void InferAddressSpacesImpl::collectRewritableIntrinsicOperands( } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { - appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(1), + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; } diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 0ddc231..e9bf59c 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -58,14 +58,55 @@ bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { } // Compute alignment from known bits. + auto InferFromKnownBits = [&](Instruction &I, Value *PtrOp) { + KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); + unsigned TrailZ = + std::min(Known.countMinTrailingZeros(), +Value::MaxAlignmentExponent); + return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + }; + + // Propagate alignment between loads and stores that originate from the + // same base pointer. + DenseMap<Value *, Align> BestBasePointerAligns; + auto InferFromBasePointer = [&](Value *PtrOp, Align LoadStoreAlign) { + APInt OffsetFromBase(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets(DL, OffsetFromBase, true); + // Derive the base pointer alignment from the load/store alignment + // and the offset from the base pointer. + Align BasePointerAlign = + commonAlignment(LoadStoreAlign, OffsetFromBase.getLimitedValue()); + + auto [It, Inserted] = + BestBasePointerAligns.try_emplace(PtrOp, BasePointerAlign); + if (!Inserted) { + // If the stored base pointer alignment is better than the + // base pointer alignment we derived, we may be able to use it + // to improve the load/store alignment. If not, store the + // improved base pointer alignment for future iterations. 
+ if (It->second > BasePointerAlign) { + Align BetterLoadStoreAlign = + commonAlignment(It->second, OffsetFromBase.getLimitedValue()); + return BetterLoadStoreAlign; + } + It->second = BasePointerAlign; + } + return LoadStoreAlign; + }; + for (BasicBlock &BB : F) { + // We need to reset the map for each block because alignment information + // can only be propagated from instruction A to B if A dominates B. + // This is because control flow (and exception throwing) could be dependent + // on the address (and its alignment) at runtime. Some sort of dominator + // tree approach could be better, but doing a simple forward pass through a + // single basic block is correct too. + BestBasePointerAligns.clear(); + for (Instruction &I : BB) { Changed |= tryToImproveAlign( DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) { - KnownBits Known = computeKnownBits(PtrOp, DL, &AC, &I, &DT); - unsigned TrailZ = std::min(Known.countMinTrailingZeros(), - +Value::MaxAlignmentExponent); - return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ)); + return std::max(InferFromKnownBits(I, PtrOp), + InferFromBasePointer(PtrOp, OldAlign)); }); } } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index b3bffeb..c68149b 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -263,6 +263,7 @@ static bool isUniformShape(Value *V) { case llvm::Instruction::FPExt: return true; case llvm::Instruction::AddrSpaceCast: + case CastInst::PtrToAddr: case CastInst::PtrToInt: case CastInst::IntToPtr: return false; @@ -2166,7 +2167,7 @@ public: // If the loads don't alias the lifetime.end, it won't interfere with // fusion. - MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr); + MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr); if (!EndLoc.Ptr) continue; if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc)) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 79721dc..f237322 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -915,7 +915,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // move the bitcast as well, which we don't handle. if (SkippedLifetimeStart) { auto *LifetimeArg = - dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1)); + dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(0)); if (LifetimeArg && LifetimeArg->getParent() == C->getParent() && C->comesBefore(LifetimeArg)) return false; @@ -1010,7 +1010,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // Lifetime of srcAlloca ends at lifetime.end. 
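The InferFromBasePointer lambda above records, per block, the best base-pointer alignment implied by each access (commonAlignment of the access alignment and its constant offset) and reuses it to raise the alignment of later accesses on the same base. A worked sketch of that arithmetic follows, in plain integers standing in for llvm::Align and llvm::commonAlignment.

// Illustrative sketch (not part of the patch): alignment propagation through
// a shared base pointer, mirroring the new InferAlignment step.
#include <cassert>
#include <cstdint>

// Largest power of two dividing both an alignment and an offset
// (what llvm::commonAlignment computes for a power-of-two alignment).
uint64_t commonAlign(uint64_t Align, uint64_t Offset) {
  uint64_t Bits = Align | Offset;
  return Bits & (~Bits + 1); // lowest set bit
}

int main() {
  // store to (base + 0), align 16  ==>  base is at least 16-byte aligned.
  uint64_t BaseAlign = commonAlign(/*Align=*/16, /*Offset=*/0);
  assert(BaseAlign == 16);

  // load from (base + 8), declared align 2: the recorded base alignment
  // proves 8-byte alignment at offset 8, so the load can be upgraded.
  uint64_t Upgraded = commonAlign(BaseAlign, /*Offset=*/8);
  assert(Upgraded == 8 && Upgraded > 2);
  return 0;
}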
if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::lifetime_end && - II->getArgOperand(1) == srcAlloca) + II->getArgOperand(0) == srcAlloca) break; } @@ -1393,7 +1393,7 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) if (II->getIntrinsicID() == Intrinsic::lifetime_start) if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) - return II->getArgOperand(1) == Alloca; + return II->getArgOperand(0) == Alloca; return false; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 1a52af1..40eeeb2 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1535,7 +1535,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - auto *LifetimePtr = II->getOperand(1); + auto *LifetimePtr = II->getOperand(0); if (LoadPtr == lookupOperandLeader(LifetimePtr) || AA->isMustAlias(LoadPtr, LifetimePtr)) return createConstantExpression(UndefValue::get(LoadType)); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 03d9f32..d6e27aa 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1260,10 +1260,7 @@ private: return PI.setAborted(&II); if (II.isLifetimeStartOrEnd()) { - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), - Length->getLimitedValue()); - insertUse(II, Offset, Size, true); + insertUse(II, Offset, AllocSize, true); return; } @@ -3614,30 +3611,14 @@ private: return true; } - assert(II.getArgOperand(1) == OldPtr); - // Lifetime intrinsics are only promotable if they cover the whole alloca. - // Therefore, we drop lifetime intrinsics which don't cover the whole - // alloca. - // (In theory, intrinsics which partially cover an alloca could be - // promoted, but PromoteMemToReg doesn't handle that case.) - // FIXME: Check whether the alloca is promotable before dropping the - // lifetime intrinsics? - if (NewBeginOffset != NewAllocaBeginOffset || - NewEndOffset != NewAllocaEndOffset) - return true; - - ConstantInt *Size = - ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - NewEndOffset - NewBeginOffset); - // Lifetime intrinsics always expect an i8* so directly get such a pointer - // for the new alloca slice. + assert(II.getArgOperand(0) == OldPtr); Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace()); Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) - New = IRB.CreateLifetimeStart(Ptr, Size); + New = IRB.CreateLifetimeStart(Ptr); else - New = IRB.CreateLifetimeEnd(Ptr, Size); + New = IRB.CreateLifetimeEnd(Ptr); (void)New; LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7a9dd37..bbd1ed6 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1099,7 +1099,7 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, // Get the memory operand of the lifetime marker. If the underlying // object is a sunk alloca, or is otherwise defined in the extraction // region, the lifetime marker must not be erased. 
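Throughout this patch (SROA above, CodeExtractor, ExpandVariadics, the inliner), lifetime markers are emitted with only the alloca operand, so a marker always covers the whole allocation; that is why SROA's partial-coverage bailout can be dropped. The following minimal IRBuilder sketch assumes the single-argument CreateLifetimeStart/CreateLifetimeEnd overloads used in the patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("lifetime-demo", Ctx);
  auto *FT = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FT, Function::ExternalLinkage, "f", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  AllocaInst *AI = B.CreateAlloca(B.getInt8Ty(), B.getInt64(32), "buf");
  B.CreateLifetimeStart(AI); // no explicit size: the marker covers all of %buf
  // ... uses of %buf would go here ...
  B.CreateLifetimeEnd(AI);
  B.CreateRetVoid();

  // With the new intrinsic form, the printed calls take only the %buf operand.
  M.print(errs(), nullptr);
  return 0;
}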
- Value *Mem = II->getOperand(1)->stripInBoundsOffsets(); + Value *Mem = II->getOperand(0); if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem)) continue; @@ -1115,8 +1115,6 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks, static void insertLifetimeMarkersSurroundingCall( Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd, CallInst *TheCall) { - LLVMContext &Ctx = M->getContext(); - auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1); Instruction *Term = TheCall->getParent()->getTerminator(); // Emit lifetime markers for the pointers given in \p Objects. Insert the @@ -1130,7 +1128,7 @@ static void insertLifetimeMarkersSurroundingCall( Function *Func = Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); - auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); + auto Marker = CallInst::Create(Func, Mem); if (InsertBefore) Marker->insertBefore(TheCall->getIterator()); else diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index ed3dca2..fa3c467 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2361,15 +2361,13 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // Updating the contextual profile after an inlining means, at a high level, // copying over the data of the callee, **intentionally without any value // scaling**, and copying over the callees of the inlined callee. -llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - PGOContextualProfile &CtxProf, - bool MergeAttributes, - AAResults *CalleeAAR, - bool InsertLifetime, - Function *ForwardVarArgsTo) { +llvm::InlineResult llvm::InlineFunction( + CallBase &CB, InlineFunctionInfo &IFI, PGOContextualProfile &CtxProf, + bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime, + Function *ForwardVarArgsTo, OptimizationRemarkEmitter *ORE) { if (!CtxProf.isInSpecializedModule()) return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, - ForwardVarArgsTo); + ForwardVarArgsTo, ORE); auto &Caller = *CB.getCaller(); auto &Callee = *CB.getCalledFunction(); @@ -2387,7 +2385,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, const auto NumCalleeCallsites = CtxProf.getNumCallsites(Callee); auto Ret = InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, - ForwardVarArgsTo); + ForwardVarArgsTo, ORE); if (!Ret.isSuccess()) return Ret; @@ -2457,20 +2455,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, return Ret; } -/// This function inlines the called function into the basic block of the -/// caller. This returns false if it is not possible to inline this call. -/// The program is still in a well defined state if this occurs though. -/// -/// Note that this only does one level of inlining. For example, if the -/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now -/// exists in the instruction stream. Similarly this will inline a recursive -/// function by one level. 
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - bool MergeAttributes, - AAResults *CalleeAAR, - bool InsertLifetime, - Function *ForwardVarArgsTo, - OptimizationRemarkEmitter *ORE) { +llvm::InlineResult llvm::CanInlineCallSite(const CallBase &CB, + InlineFunctionInfo &IFI) { assert(CB.getParent() && CB.getFunction() && "Instruction not in function!"); // FIXME: we don't inline callbr yet. @@ -2487,7 +2473,6 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // The inliner does not know how to inline through calls with operand bundles // in general ... - Value *ConvergenceControlToken = nullptr; if (CB.hasOperandBundles()) { for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) { auto OBUse = CB.getOperandBundleAt(i); @@ -2503,7 +2488,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (Tag == LLVMContext::OB_kcfi) continue; if (Tag == LLVMContext::OB_convergencectrl) { - ConvergenceControlToken = OBUse.Inputs[0].get(); + IFI.ConvergenceControlToken = OBUse.Inputs[0].get(); continue; } @@ -2521,28 +2506,22 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // fully implements convergence control tokens, there is no mixing of // controlled and uncontrolled convergent operations in the whole program. if (CB.isConvergent()) { - if (!ConvergenceControlToken && + if (!IFI.ConvergenceControlToken && getConvergenceEntry(CalledFunc->getEntryBlock())) { return InlineResult::failure( "convergent call needs convergencectrl operand"); } } - // If the call to the callee cannot throw, set the 'nounwind' flag on any - // calls that we inline. - bool MarkNoUnwind = CB.doesNotThrow(); - - BasicBlock *OrigBB = CB.getParent(); - Function *Caller = OrigBB->getParent(); + const BasicBlock *OrigBB = CB.getParent(); + const Function *Caller = OrigBB->getParent(); // GC poses two hazards to inlining, which only occur when the callee has GC: // 1. If the caller has no GC, then the callee's GC must be propagated to the // caller. // 2. If the caller has a differing GC, it is invalid to inline. if (CalledFunc->hasGC()) { - if (!Caller->hasGC()) - Caller->setGC(CalledFunc->getGC()); - else if (CalledFunc->getGC() != Caller->getGC()) + if (Caller->hasGC() && CalledFunc->getGC() != Caller->getGC()) return InlineResult::failure("incompatible GC"); } @@ -2560,34 +2539,31 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, ? Caller->getPersonalityFn()->stripPointerCasts() : nullptr; if (CalledPersonality) { - if (!CallerPersonality) - Caller->setPersonalityFn(CalledPersonality); // If the personality functions match, then we can perform the // inlining. Otherwise, we can't inline. // TODO: This isn't 100% true. Some personality functions are proper // supersets of others and can be used in place of the other. - else if (CalledPersonality != CallerPersonality) + if (CallerPersonality && CalledPersonality != CallerPersonality) return InlineResult::failure("incompatible personality"); } // We need to figure out which funclet the callsite was in so that we may // properly nest the callee. 
- Instruction *CallSiteEHPad = nullptr; if (CallerPersonality) { EHPersonality Personality = classifyEHPersonality(CallerPersonality); if (isScopedEHPersonality(Personality)) { std::optional<OperandBundleUse> ParentFunclet = CB.getOperandBundle(LLVMContext::OB_funclet); if (ParentFunclet) - CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + IFI.CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); // OK, the inlining site is legal. What about the target function? - if (CallSiteEHPad) { + if (IFI.CallSiteEHPad) { if (Personality == EHPersonality::MSVC_CXX) { // The MSVC personality cannot tolerate catches getting inlined into // cleanup funclets. - if (isa<CleanupPadInst>(CallSiteEHPad)) { + if (isa<CleanupPadInst>(IFI.CallSiteEHPad)) { // Ok, the call site is within a cleanuppad. Let's check the callee // for catchpads. for (const BasicBlock &CalledBB : *CalledFunc) { @@ -2607,13 +2583,34 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } } + return InlineResult::success(); +} + +/// This function inlines the called function into the basic block of the +/// caller. This returns false if it is not possible to inline this call. +/// The program is still in a well defined state if this occurs though. +/// +/// Note that this only does one level of inlining. For example, if the +/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now +/// exists in the instruction stream. Similarly this will inline a recursive +/// function by one level. +void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, + bool MergeAttributes, AAResults *CalleeAAR, + bool InsertLifetime, Function *ForwardVarArgsTo, + OptimizationRemarkEmitter *ORE) { + BasicBlock *OrigBB = CB.getParent(); + Function *Caller = OrigBB->getParent(); + Function *CalledFunc = CB.getCalledFunction(); + assert(CalledFunc && !CalledFunc->isDeclaration() && + "CanInlineCallSite should have verified direct call to definition"); + // Determine if we are dealing with a call in an EHPad which does not unwind // to caller. bool EHPadForCallUnwindsLocally = false; - if (CallSiteEHPad && isa<CallInst>(CB)) { + if (IFI.CallSiteEHPad && isa<CallInst>(CB)) { UnwindDestMemoTy FuncletUnwindMap; Value *CallSiteUnwindDestToken = - getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap); + getUnwindDestToken(IFI.CallSiteEHPad, FuncletUnwindMap); EHPadForCallUnwindsLocally = CallSiteUnwindDestToken && @@ -2630,6 +2627,30 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, ClonedCodeInfo InlinedFunctionInfo; Function::iterator FirstNewBlock; + // GC poses two hazards to inlining, which only occur when the callee has GC: + // 1. If the caller has no GC, then the callee's GC must be propagated to the + // caller. + // 2. If the caller has a differing GC, it is invalid to inline. + if (CalledFunc->hasGC()) { + if (!Caller->hasGC()) + Caller->setGC(CalledFunc->getGC()); + else { + assert(CalledFunc->getGC() == Caller->getGC() && + "CanInlineCallSite should have verified compatible GCs"); + } + } + + if (CalledFunc->hasPersonalityFn()) { + Constant *CalledPersonality = + CalledFunc->getPersonalityFn()->stripPointerCasts(); + if (!Caller->hasPersonalityFn()) { + Caller->setPersonalityFn(CalledPersonality); + } else + assert(Caller->getPersonalityFn()->stripPointerCasts() == + CalledPersonality && + "CanInlineCallSite should have verified compatible personality"); + } + { // Scope to destroy VMap after cloning. 
ValueToValueMapTy VMap; struct ByValInit { @@ -2819,10 +2840,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, IFI.GetAssumptionCache(*Caller).registerAssumption(II); } - if (ConvergenceControlToken) { + if (IFI.ConvergenceControlToken) { IntrinsicInst *IntrinsicCall = getConvergenceEntry(*FirstNewBlock); if (IntrinsicCall) { - IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken); + IntrinsicCall->replaceAllUsesWith(IFI.ConvergenceControlToken); IntrinsicCall->eraseFromParent(); } } @@ -2869,6 +2890,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } } + // If the call to the callee cannot throw, set the 'nounwind' flag on any + // calls that we inline. + bool MarkNoUnwind = CB.doesNotThrow(); + SmallVector<Value*,4> VarArgsToForward; SmallVector<AttributeSet, 4> VarArgsAttrs; for (unsigned i = CalledFunc->getFunctionType()->getNumParams(); @@ -2979,31 +3004,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (hasLifetimeMarkers(AI)) continue; - // Try to determine the size of the allocation. - ConstantInt *AllocaSize = nullptr; - if (ConstantInt *AIArraySize = - dyn_cast<ConstantInt>(AI->getArraySize())) { - auto &DL = Caller->getDataLayout(); - Type *AllocaType = AI->getAllocatedType(); - TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType); - uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); - - // Don't add markers for zero-sized allocas. - if (AllocaArraySize == 0) - continue; - - // Check that array size doesn't saturate uint64_t and doesn't - // overflow when it's multiplied by type size. - if (!AllocaTypeSize.isScalable() && - AllocaArraySize != std::numeric_limits<uint64_t>::max() && - std::numeric_limits<uint64_t>::max() / AllocaArraySize >= - AllocaTypeSize.getFixedValue()) { - AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), - AllocaArraySize * AllocaTypeSize); - } - } + std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout()); + if (Size && Size->isZero()) + continue; - builder.CreateLifetimeStart(AI, AllocaSize); + builder.CreateLifetimeStart(AI); for (ReturnInst *RI : Returns) { // Don't insert llvm.lifetime.end calls between a musttail or deoptimize // call and a return. The return kills all local allocas. @@ -3013,7 +3018,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall()) continue; - IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); + IRBuilder<>(RI).CreateLifetimeEnd(AI); } } } @@ -3055,12 +3060,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Update the lexical scopes of the new funclets and callsites. // Anything that had 'none' as its parent is now nested inside the callsite's // EHPad. - if (CallSiteEHPad) { + if (IFI.CallSiteEHPad) { for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { // Add bundle operands to inlined call sites. 
- PropagateOperandBundles(BB, CallSiteEHPad); + PropagateOperandBundles(BB, IFI.CallSiteEHPad); // It is problematic if the inlinee has a cleanupret which unwinds to // caller and we inline it into a call site which doesn't unwind but into @@ -3076,11 +3081,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) - CatchSwitch->setParentPad(CallSiteEHPad); + CatchSwitch->setParentPad(IFI.CallSiteEHPad); } else { auto *FPI = cast<FuncletPadInst>(I); if (isa<ConstantTokenNone>(FPI->getParentPad())) - FPI->setParentPad(CallSiteEHPad); + FPI->setParentPad(IFI.CallSiteEHPad); } } } @@ -3236,7 +3241,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc); // We are now done with the inlining. - return InlineResult::success(); + return; } // Otherwise, we have the normal case, of more than one block to inline or @@ -3404,6 +3409,19 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (MergeAttributes) AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc); +} - return InlineResult::success(); +llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, + bool MergeAttributes, + AAResults *CalleeAAR, + bool InsertLifetime, + Function *ForwardVarArgsTo, + OptimizationRemarkEmitter *ORE) { + llvm::InlineResult Result = CanInlineCallSite(CB, IFI); + if (Result.isSuccess()) { + InlineFunctionImpl(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, + ForwardVarArgsTo, ORE); + } + + return Result; } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2619e73..b559212 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -481,7 +481,7 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, return true; if (II->isLifetimeStartOrEnd()) { - auto *Arg = II->getArgOperand(1); + auto *Arg = II->getArgOperand(0); if (isa<PoisonValue>(Arg)) return true; diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 472c03f..1f59b17 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -155,7 +155,7 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, return; } if (auto *II = dyn_cast<LifetimeIntrinsic>(&Inst)) { - AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(1)); + AllocaInst *AI = dyn_cast<AllocaInst>(II->getArgOperand(0)); if (!AI || getAllocaInterestingness(*AI) != AllocaInterestingness::kInteresting) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9667b50..1ac84ef 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -548,9 +548,6 @@ public: protected: friend class LoopVectorizationPlanner; - /// Returns (and creates if needed) the trip count of the widened loop. 
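The restructured inliner utility now separates a side-effect-free legality check (CanInlineCallSite, which also stashes the convergence token and call-site EH pad in InlineFunctionInfo) from the actual IR surgery (InlineFunctionImpl); the rewritten llvm::InlineFunction above is just the two calls in sequence. Below is a sketch of driving the split directly, assuming both entry points are declared in llvm/Transforms/Utils/Cloning.h next to InlineFunction (the diff does not show the header).

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

static const char *IR = R"(
  define i32 @callee(i32 %x) {
    %r = add i32 %x, 1
    ret i32 %r
  }
  define i32 @caller(i32 %x) {
    %r = call i32 @callee(i32 %x)
    ret i32 %r
  }
)";

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString(IR, Err, Ctx);
  if (!M)
    return 1;
  CallBase *CB = cast<CallBase>(&M->getFunction("caller")->front().front());

  InlineFunctionInfo IFI;
  // Phase 1: pure analysis; also records the convergence token / EH pad of
  // the call site in IFI for the transform phase.
  InlineResult Legal = CanInlineCallSite(*CB, IFI);
  if (Legal.isSuccess())
    // Phase 2: actually splice the callee into the caller.
    InlineFunctionImpl(*CB, IFI, /*MergeAttributes=*/false,
                       /*CalleeAAR=*/nullptr, /*InsertLifetime=*/true,
                       /*ForwardVarArgsTo=*/nullptr, /*ORE=*/nullptr);
  return Legal.isSuccess() ? 0 : 1;
}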
- Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); - // Create a check to see if the vector loop should be executed Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; @@ -2272,56 +2269,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { - if (VectorTripCount) - return VectorTripCount; - - Value *TC = getTripCount(); - IRBuilder<> Builder(InsertBlock->getTerminator()); - - Type *Ty = TC->getType(); - // This is where we can make the step a runtime constant. - Value *Step = createStepForVF(Builder, Ty, VF, UF); - - // If the tail is to be folded by masking, round the number of iterations N - // up to a multiple of Step instead of rounding down. This is done by first - // adding Step-1 and then rounding down. Note that it's ok if this addition - // overflows: the vector induction variable will eventually wrap to zero given - // that it starts at zero and its Step is a power of two; the loop will then - // exit, with the last early-exit vector comparison also producing all-true. - // For scalable vectors the VF is not guaranteed to be a power of 2, but this - // is accounted for in emitIterationCountCheck that adds an overflow check. - if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && - "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), - "n.rnd.up"); - } - - // Now we need to generate the expression for the part of the loop that the - // vectorized body will execute. This is equal to N - (N % Step) if scalar - // iterations are not required for correctness, or N - Step, otherwise. Step - // is equal to the vectorization factor (number of SIMD elements) times the - // unroll factor (number of SIMD instructions). - Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); - - // There are cases where we *must* run at least one iteration in the remainder - // loop. See the cost model for when this can happen. If the step evenly - // divides the trip count, we set the remainder to be equal to the step. If - // the step does not evenly divide the trip count, no adjustment is necessary - // since there will already be scalar iterations. Note that the minimum - // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF.isVector())) { - auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); - R = Builder.CreateSelect(IsZero, Step, R); - } - - VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); - - return VectorTripCount; -} - void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. @@ -7354,6 +7301,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Canonicalize EVL loops after regions are dissolved. VPlanTransforms::canonicalizeEVLLoops(BestVPlan); VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH); + VPlanTransforms::materializeVectorTripCount( + BestVPlan, VectorPH, CM.foldTailByMasking(), + CM.requiresScalarEpilogue(BestVF.isVector())); // Perform the actual loop transformation. 
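The removed getOrCreateVectorTripCount spells out the arithmetic that VPlanTransforms::materializeVectorTripCount now performs: round the trip count up when the tail is folded by masking, otherwise subtract the remainder, and force a full step of scalar iterations when a scalar epilogue is required. A worked sketch of that computation in plain integers follows; Step stands for VF * UF.

// Illustrative sketch (not part of the patch): the vector trip count rules
// documented in the removed helper above.
#include <cassert>
#include <cstdint>

uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTailByMasking,
                         bool RequiresScalarEpilogue) {
  if (FoldTailByMasking)
    TC += Step - 1; // round N up to a multiple of Step (Step is a power of 2)
  uint64_t R = TC % Step; // n.mod.vf
  // If an epilogue must run, leave a full Step of scalar iterations even
  // when Step divides the trip count evenly.
  if (RequiresScalarEpilogue && R == 0)
    R = Step;
  return TC - R; // n.vec
}

int main() {
  assert(vectorTripCount(17, 8, false, false) == 16); // 1 scalar iteration left
  assert(vectorTripCount(16, 8, false, true) == 8);   // epilogue forced to run
  assert(vectorTripCount(17, 8, true, false) == 24);  // tail folded by masking
  return 0;
}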
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7410,8 +7360,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute( - ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); + BestVPlan.prepareToExecute(State); replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); // Move check blocks to their final position. @@ -7530,9 +7479,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, false); - // Generate the induction variable. - EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -8451,11 +8397,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, *Plan, CM.getMinimalBitwidths()); VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan); // TODO: try to put it close to addActiveLaneMask(). - // Discard the plan if it is not EVL-compatible - if (CM.foldTailWithEVL() && !HasScalarVF && - !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength, - *Plan, CM.getMaxSafeElements())) - break; + if (CM.foldTailWithEVL() && !HasScalarVF) + VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, + *Plan, CM.getMaxSafeElements()); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } @@ -9412,13 +9356,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - // If index is the vector trip count, the concrete value will only be set in - // prepareToExecute, leading to missed simplifications, e.g. if it is 0. - // TODO: Remove the special case for the vector trip count once it is computed - // in VPlan and can be used during VPlan simplification. - assert((DerivedIV != Index || - getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && - "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } @@ -9810,7 +9747,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, - const EpilogueLoopVectorizationInfo &EPI) { + EpilogueLoopVectorizationInfo &EPI) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -9828,30 +9765,13 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // loop. 
using namespace llvm::PatternMatch; PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); - assert(EPResumeVal->getType() == IV->getScalarType() && - match(EPResumeVal->getIncomingValueForBlock( - EPI.MainLoopIterationCountCheck), - m_SpecificInt(0)) && - EPResumeVal == - find_singleton<PHINode>( - L->getLoopPreheader()->phis(), - [&EPI, IV](PHINode &P, bool) -> PHINode * { - if (P.getType() == IV->getScalarType() && - match(P.getIncomingValueForBlock( - EPI.MainLoopIterationCountCheck), - m_SpecificInt(0)) && - any_of(P.incoming_values(), - [&EPI](Value *Inc) { - return Inc == EPI.VectorTripCount; - }) && - all_of(P.incoming_values(), [&EPI](Value *Inc) { - return Inc == EPI.VectorTripCount || - match(Inc, m_SpecificInt(0)); - })) - return &P; - return nullptr; - }) && - "Epilogue resume phis do not match!"); + for (Value *Inc : EPResumeVal->incoming_values()) { + if (match(Inc, m_SpecificInt(0))) + continue; + assert(!EPI.VectorTripCount && + "Must only have a single non-zero incoming value"); + EPI.VectorTripCount = Inc; + } VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); assert(all_of(IV->users(), [](const VPUser *U) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5d0e2f9..ec06a21 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3883,6 +3883,7 @@ private: enum CombinedOpcode { NotCombinedOp = -1, MinMax = Instruction::OtherOpsEnd + 1, + FMulAdd, }; CombinedOpcode CombinedOp = NotCombinedOp; @@ -4033,6 +4034,9 @@ private: /// Returns true if any scalar in the list is a copyable element. bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// Returns the state of the operations. + const InstructionsState &getOperations() const { return S; } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const { @@ -11987,6 +11991,82 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } } +static InstructionCost canConvertToFMA(ArrayRef<Value *> VL, + const InstructionsState &S, + DominatorTree &DT, const DataLayout &DL, + TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI) { + assert(all_of(VL, + [](Value *V) { + return V->getType()->getScalarType()->isFloatingPointTy(); + }) && + "Can only convert to FMA for floating point types"); + assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub"); + + auto CheckForContractable = [&](ArrayRef<Value *> VL) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + // TODO: support for copyable elements. 
+ Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I); + if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + } + return FMF.allowContract(); + }; + if (!CheckForContractable(VL)) + return InstructionCost::getInvalid(); + // fmul also should be contractable + InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL); + + InstructionsState OpS = getSameOpcode(Operands.front(), TLI); + if (!OpS.valid()) + return InstructionCost::getInvalid(); + if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul) + return InstructionCost::getInvalid(); + if (!CheckForContractable(Operands.front())) + return InstructionCost::getInvalid(); + // Compare the costs. + InstructionCost FMulPlusFAddCost = 0; + InstructionCost FMACost = 0; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + FastMathFlags FMF; + FMF.set(); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + unsigned NumOps = 0; + for (auto [V, Op] : zip(VL, Operands.front())) { + auto *I = dyn_cast<Instruction>(Op); + if (!I || !I->hasOneUse()) { + if (auto *OpI = dyn_cast<Instruction>(V)) + FMACost += TTI.getInstructionCost(OpI, CostKind); + if (I) + FMACost += TTI.getInstructionCost(I, CostKind); + continue; + } + ++NumOps; + if (auto *FPCI = dyn_cast<FPMathOperator>(I)) + FMF &= FPCI->getFastMathFlags(); + FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind); + } + Type *Ty = VL.front()->getType(); + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF); + FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind); + return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid(); +} + void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; BaseGraphSize = VectorizableTree.size(); @@ -12355,6 +12435,25 @@ void BoUpSLP::transformNodes() { } break; } + case Instruction::FSub: + case Instruction::FAdd: { + // Check if possible to convert (a*b)+c to fma. + if (E.State != TreeEntry::Vectorize || + !E.getOperations().isAddSubLikeOp()) + break; + if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI) + .isValid()) + break; + // This node is a fmuladd node. + E.CombinedOp = TreeEntry::FMulAdd; + TreeEntry *FMulEntry = getOperandEntry(&E, 0); + if (FMulEntry->UserTreeIndex && + FMulEntry->State == TreeEntry::Vectorize) { + // The FMul node is part of the combined fmuladd node. + FMulEntry->State = TreeEntry::CombinedVectorize; + } + break; + } default: break; } @@ -13587,6 +13686,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return IntrinsicCost; }; + auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S, + Instruction *VI) { + InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI); + return Cost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. 
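canConvertToFMA above only fires when the fadd/fsub bundle and the fmul feeding it both allow contraction, each fmul has a single use, and the target prices one llvm.fmuladd below the separate fmul plus fadd. A hypothetical source pattern of the kind of straight-line bundle the new TreeEntry::FMulAdd combined node is meant to cover (illustrative, not taken from the patch), assuming contraction is enabled, e.g. via -ffp-contract=fast:

// Illustrative only. With contraction allowed on both the multiplies and the
// adds, SLP can now cost each lane as a single llvm.fmuladd and mark the
// whole bundle as a combined FMulAdd node instead of separate FMul/FAdd nodes.
void mad4(const float *A, const float *B, const float *C, float *D) {
  D[0] = A[0] * B[0] + C[0];
  D[1] = A[1] * B[1] + C[1];
  D[2] = A[2] * B[2] + C[2];
  D[3] = A[3] * B[3] + C[3];
}

Whether the fused form is actually used still comes down to the cost comparison above: the intrinsic cost of fmuladd against the summed costs of the scalar fmul and fadd instructions.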
@@ -13927,6 +14031,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case TreeEntry::FMulAdd: { + auto GetScalarCost = [&](unsigned Idx) { + if (isa<PoisonValue>(UniqueValues[Idx])) + return InstructionCost(TTI::TCC_Free); + return GetFMulAddCost(E->getOperations(), + cast<Instruction>(UniqueValues[Idx])); + }; + auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) { + FastMathFlags FMF; + FMF.set(); + for (Value *V : E->Scalars) { + if (auto *FPCI = dyn_cast<FPMathOperator>(V)) { + FMF &= FPCI->getFastMathFlags(); + if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0))) + FMF &= FPCIOp->getFastMathFlags(); + } + } + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy, + {VecTy, VecTy, VecTy}, FMF); + InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind); + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -13964,8 +14092,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1); TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2); - return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, - Op1Info, Op2Info, Operands); + InstructionCost ScalarCost = TTI->getArithmeticInstrCost( + ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands); + if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]); + I && (ShuffleOrOp == Instruction::FAdd || + ShuffleOrOp == Instruction::FSub)) { + InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; + } + return ScalarCost; }; auto GetVectorCost = [=](InstructionCost CommonCost) { if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { @@ -22594,11 +22730,21 @@ public: /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, + DominatorTree &DT, TargetTransformInfo &TTI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; + // FMA reduction root - skip. + auto CheckForFMA = [&](Instruction *I) { + return RdxKind == RecurKind::FAdd && + canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) + .isValid(); + }; + if (CheckForFMA(Root)) + return false; + // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22636,7 +22782,7 @@ public: // Also, do not try to reduce const values, if the operation is not // foldable. 
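The FMulAdd vector costing above starts from a fully-set FastMathFlags value and intersects it with the flags of every add/sub in the bundle and of the fmul operand it folds, so the costed intrinsic is only as relaxed as the most conservative lane. A tiny standalone model of that intersection; the struct and bit layout are stand-ins, not llvm::FastMathFlags:

#include <cstdint>
#include <initializer_list>

// Stand-in for fast-math flags: one bit per flag, all set by default
// (mirroring FMF.set() in the code above).
struct Flags {
  uint8_t Bits = 0xFF;
  bool allowContract() const { return Bits & 0x1; } // assumed bit assignment
};

// A flag survives only if every lane in the bundle carries it.
inline Flags intersectFlags(std::initializer_list<Flags> Lanes) {
  Flags Out;
  for (Flags F : Lanes)
    Out.Bits &= F.Bits;
  return Out;
}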
if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || + getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -24205,13 +24351,13 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { + auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); }; @@ -24277,6 +24423,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) return false; + // Skip potential FMA candidates. + if ((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && + canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI) + .isValid()) + return false; Value *P = I->getParent(); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index f32d57f..e414c12 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -81,6 +81,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes( case Instruction::Opcode::FPToUI: case Instruction::Opcode::FPToSI: case Instruction::Opcode::FPExt: + case Instruction::Opcode::PtrToAddr: case Instruction::Opcode::PtrToInt: case Instruction::Opcode::IntToPtr: case Instruction::Opcode::SIToFP: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8052e31..a820b524 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -951,15 +951,9 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) { - if (!VectorTripCount.getUnderlyingValue()) - VectorTripCount.setUnderlyingValue(VectorTripCountV); - else - assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && - "VectorTripCount set earlier must much VectorTripCountV"); - +void VPlan::prepareToExecute(VPTransformState &State) { IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - Type *TCTy = VectorTripCountV->getType(); + Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount()); // FIXME: Model VF * UF computation completely in VPlan. unsigned UF = getUF(); if (VF.getNumUsers()) { @@ -1054,12 +1048,17 @@ void VPlan::execute(VPTransformState *State) { InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { // For now only return the cost of the vector loop region, ignoring any other - // blocks, like the preheader or middle blocks. + // blocks, like the preheader or middle blocks, except for checking them for + // recipes with invalid costs. InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx); - // If any instructions in the middle block are invalid return invalid. - // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created.
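Stepping back to the SLP changes above: the CheckForFMA guard added to matchAssociativeReduction, together with the early-out in tryToVectorize, keeps the vectorizer from breaking up add chains that the backend could fold into FMAs. A hypothetical scalar pattern affected by this (illustrative, not from the patch), assuming contraction is enabled and the target reports the fused form as cheaper:

// Illustrative only. With -ffp-contract=fast, each 'S += A[i] * B[i]' can
// fold into an fmuladd, so the horizontal-reduction matcher now refuses to
// use these fadds as reduction roots and the multiply-add chain is kept.
float dot4(const float *A, const float *B) {
  float S = 0.0f;
  S += A[0] * B[0];
  S += A[1] * B[1];
  S += A[2] * B[2];
  S += A[3] * B[3];
  return S;
}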
- if (!getMiddleBlock()->cost(VF, Ctx).isValid()) + // If the cost of the loop region is invalid or any recipe in the skeleton + // outside loop regions are invalid return an invalid cost. + if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(getEntry())), + [&VF, &Ctx](VPBasicBlock *VPBB) { + return !VPBB->cost(VF, Ctx).isValid(); + })) return InstructionCost::getInvalid(); return Cost; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c42cdd5..6f098351 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2408,11 +2408,11 @@ public: // TODO: extend the masked interleaved-group support to reversed access. assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) { - if (I->getType()->isVoidTy()) + for (unsigned I = 0; I < IG->getFactor(); ++I) + if (Instruction *Inst = IG->getMember(I)) { + if (Inst->getType()->isVoidTy()) continue; - new VPValue(I, this); + new VPValue(Inst, this); } for (auto *SV : StoredValues) @@ -3969,7 +3969,7 @@ public: } /// Prepare the plan for execution, setting up the required live-in values. - void prepareToExecute(Value *VectorTripCount, VPTransformState &State); + void prepareToExecute(VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0cb704c..34b2abf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2185,6 +2185,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { "User of VF that we can't transform to EVL."); Plan.getVF().replaceAllUsesWith(&EVL); + assert(all_of(Plan.getVFxUF().users(), + [&Plan](VPUser *U) { + return match(U, m_c_Binary<Instruction::Add>( + m_Specific(Plan.getCanonicalIV()), + m_Specific(&Plan.getVFxUF()))) || + isa<VPWidenPointerInductionRecipe>(U); + }) && + "Only users of VFxUF should be VPWidenPointerInductionRecipe and the " + "increment of the canonical induction."); + Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { + // Only replace uses in VPWidenPointerInductionRecipe; The increment of the + // canonical induction must not be updated. + return isa<VPWidenPointerInductionRecipe>(U); + }); + // Defer erasing recipes till the end so that we don't invalidate the // VPTypeAnalysis cache. SmallVector<VPRecipeBase *> ToErase; @@ -2320,16 +2335,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { /// %NextAVL = sub IVSize nuw %AVL, %OpEVL /// ... /// -bool VPlanTransforms::tryAddExplicitVectorLength( +void VPlanTransforms::addExplicitVectorLength( VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) { VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - // The transform updates all users of inductions to work based on EVL, instead - // of the VF directly. At the moment, widened pointer inductions cannot be - // updated, so bail out if the plan contains any. 
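The VPlanTransforms.cpp hunk above replaces only the VPWidenPointerInductionRecipe uses of VFxUF with EVL, instead of bailing out of the EVL transform whenever a widened pointer induction is present; the increment of the canonical induction keeps using VFxUF. A rough scalar model of why the pointer induction must be stepped by the explicit vector length (conceptual only; the names and the scalar loop shape are assumptions, not the VPlan recipe structure):

#include <algorithm>
#include <cstddef>

// Each iteration processes EVL = min(VF, remaining) elements, so anything
// that tracks elements consumed, such as the pointer induction, has to
// advance by EVL rather than by a fixed VF * UF.
void copyWithEVL(const int *Src, int *Dst, std::size_t N, std::size_t VF) {
  const int *Ptr = Src;                     // models the pointer induction
  for (std::size_t I = 0; I < N;) {
    std::size_t EVL = std::min(VF, N - I);  // explicit vector length
    for (std::size_t L = 0; L < EVL; ++L)   // stands in for one vector op
      Dst[I + L] = Ptr[L];
    Ptr += EVL;                             // stepped by EVL
    I += EVL;
  }
}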
- bool ContainsWidenPointerInductions = - any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>); - if (ContainsWidenPointerInductions) - return false; auto *CanonicalIVPHI = Plan.getCanonicalIV(); auto *CanIVTy = CanonicalIVPHI->getScalarType(); @@ -2384,7 +2392,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength( CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); // TODO: support unroll factor > 1. Plan.setUF(1); - return true; } void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { @@ -2808,13 +2815,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, R->replaceAllUsesWith(PtrAdd); // Create the backedge value for the scalar pointer phi. - Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi()); + VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); + Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator()); VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF), DL); VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF}); - VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); - Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator()); VPValue *InductionGEP = Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind"); ScalarPtrPhi->addOperand(InductionGEP); @@ -3272,6 +3278,67 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, BTC->replaceAllUsesWith(TCMO); } +void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue) { + VPValue &VectorTC = Plan.getVectorTripCount(); + assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in"); + // There's nothing to do if there are no users of the vector trip count or its + // IR value has already been set. + if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue()) + return; + + VPValue *TC = Plan.getTripCount(); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC); + VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin()); + VPValue *Step = &Plan.getVFxUF(); + + // If the tail is to be folded by masking, round the number of iterations N + // up to a multiple of Step instead of rounding down. This is done by first + // adding Step-1 and then rounding down. Note that it's ok if this addition + // overflows: the vector induction variable will eventually wrap to zero given + // that it starts at zero and its Step is a power of two; the loop will then + // exit, with the last early-exit vector comparison also producing all-true. + // For scalable vectors the VF is not guaranteed to be a power of 2, but this + // is accounted for in emitIterationCountCheck that adds an overflow check. + if (TailByMasking) { + TC = Builder.createNaryOp( + Instruction::Add, + {TC, Builder.createNaryOp( + Instruction::Sub, + {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + DebugLoc::getCompilerGenerated(), "n.rnd.up"); + } + + // Now we need to generate the expression for the part of the loop that the + // vectorized body will execute. This is equal to N - (N % Step) if scalar + // iterations are not required for correctness, or N - Step, otherwise. Step + // is equal to the vectorization factor (number of SIMD elements) times the + // unroll factor (number of SIMD instructions). 
+ VPValue *R = + Builder.createNaryOp(Instruction::URem, {TC, Step}, + DebugLoc::getCompilerGenerated(), "n.mod.vf"); + + // There are cases where we *must* run at least one iteration in the remainder + // loop. See the cost model for when this can happen. If the step evenly + // divides the trip count, we set the remainder to be equal to the step. If + // the step does not evenly divide the trip count, no adjustment is necessary + // since there will already be scalar iterations. Note that the minimum + // iterations check ensures that N >= Step. + if (RequiresScalarEpilogue) { + assert(!TailByMasking && + "requiring scalar epilogue is not supported with tail folding"); + VPValue *IsZero = Builder.createICmp( + CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + R = Builder.createSelect(IsZero, Step, R); + } + + VPValue *Res = Builder.createNaryOp( + Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec"); + VectorTC.replaceAllUsesWith(Res); +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ecaca72..2afe956 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -177,10 +177,9 @@ struct VPlanTransforms { /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. /// VPCanonicalIVPHIRecipe is only used to control the loop after /// this transformation. - /// \returns true if the transformation succeeds, or false if it doesn't. - static bool - tryAddExplicitVectorLength(VPlan &Plan, - const std::optional<unsigned> &MaxEVLSafeElements); + static void + addExplicitVectorLength(VPlan &Plan, + const std::optional<unsigned> &MaxEVLSafeElements); // For each Interleave Group in \p InterleaveGroups replace the Recipes // widening its memory instructions with a single VPInterleaveRecipe at its @@ -257,6 +256,12 @@ struct VPlanTransforms { unsigned BestUF, PredicatedScalarEvolution &PSE); + /// Materialize vector trip count computations to a set of VPInstructions. + static void materializeVectorTripCount(VPlan &Plan, + VPBasicBlock *VectorPHVPBB, + bool TailByMasking, + bool RequiresScalarEpilogue); + /// Materialize the backedge-taken count to be computed explicitly using /// VPInstructions. static void materializeBackedgeTakenCount(VPlan &Plan, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 14ae4f2..3417e1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe, - VPWidenIntOrFpInductionRecipe>( + VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case<VPScalarIVStepsRecipe>([&](auto *R) { if (R->getNumOperands() != 3) {