Diffstat (limited to 'llvm/lib')
111 files changed, 2032 insertions, 993 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 4969528..dd98b62 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1659,7 +1659,6 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::aarch64_sve_convert_from_svbool: case Intrinsic::wasm_alltrue: case Intrinsic::wasm_anytrue: - case Intrinsic::wasm_dot: // WebAssembly float semantics are always known case Intrinsic::wasm_trunc_signed: case Intrinsic::wasm_trunc_unsigned: @@ -3990,30 +3989,6 @@ static Constant *ConstantFoldFixedVectorCall( } return ConstantVector::get(Result); } - case Intrinsic::wasm_dot: { - unsigned NumElements = - cast<FixedVectorType>(Operands[0]->getType())->getNumElements(); - - assert(NumElements == 8 && Result.size() == 4 && - "wasm dot takes i16x8 and produces i32x4"); - assert(Ty->isIntegerTy()); - int32_t MulVector[8]; - - for (unsigned I = 0; I < NumElements; ++I) { - ConstantInt *Elt0 = - cast<ConstantInt>(Operands[0]->getAggregateElement(I)); - ConstantInt *Elt1 = - cast<ConstantInt>(Operands[1]->getAggregateElement(I)); - - MulVector[I] = Elt0->getSExtValue() * Elt1->getSExtValue(); - } - for (unsigned I = 0; I < Result.size(); I++) { - int32_t IAdd = MulVector[I * 2] + MulVector[I * 2 + 1]; - Result[I] = ConstantInt::get(Ty, IAdd); - } - - return ConstantVector::get(Result); - } default: break; } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f1473b2..256befa 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -180,8 +180,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F); SrcI != SrcE; ++SrcI) { if (SrcI->mayReadOrWriteMemory()) { - for (inst_iterator DstI = SrcI, DstE = inst_end(F); - DstI != DstE; ++DstI) { + for (inst_iterator DstI = SrcI, DstE = inst_end(F); DstI != DstE; + ++DstI) { if (DstI->mayReadOrWriteMemory()) { OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n"; OS << " da analyze - "; @@ -203,7 +203,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) - OS << "normalized - "; + OS << "normalized - "; D->dump(OS); for (unsigned Level = 1; Level <= D->getLevels(); Level++) { if (D->isSplitable(Level)) { @@ -227,8 +227,8 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, void DependenceAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const { - dumpExampleDependence(OS, info.get(), - getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); + dumpExampleDependence( + OS, info.get(), getAnalysis<ScalarEvolutionWrapperPass>().getSE(), false); } PreservedAnalyses @@ -249,33 +249,26 @@ bool Dependence::isInput() const { return Src->mayReadFromMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an output dependence. bool Dependence::isOutput() const { return Src->mayWriteToMemory() && Dst->mayWriteToMemory(); } - // Returns true if this is an flow (aka true) dependence. bool Dependence::isFlow() const { return Src->mayWriteToMemory() && Dst->mayReadFromMemory(); } - // Returns true if this is an anti dependence. 
bool Dependence::isAnti() const { return Src->mayReadFromMemory() && Dst->mayWriteToMemory(); } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. // Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level) const { - return false; -} - +bool Dependence::isScalar(unsigned level) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -338,8 +331,7 @@ bool FullDependence::normalize(ScalarEvolution *SE) { DV[Level - 1].Direction = RevDirection; // Reverse the dependence distance as well. if (DV[Level - 1].Distance != nullptr) - DV[Level - 1].Distance = - SE->getNegativeSCEV(DV[Level - 1].Distance); + DV[Level - 1].Distance = SE->getNegativeSCEV(DV[Level - 1].Distance); } LLVM_DEBUG(dbgs() << "After normalizing negative direction vectors:\n"; @@ -355,14 +347,12 @@ unsigned FullDependence::getDirection(unsigned Level) const { return DV[Level - 1].Direction; } - // Returns the distance (or NULL) associated with a particular level. const SCEV *FullDependence::getDistance(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Distance; } - // Returns true if a particular level is scalar; that is, // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. @@ -371,7 +361,6 @@ bool FullDependence::isScalar(unsigned Level) const { return DV[Level - 1].Scalar; } - // Returns true if peeling the first iteration from this loop // will break this dependence. bool FullDependence::isPeelFirst(unsigned Level) const { @@ -379,7 +368,6 @@ bool FullDependence::isPeelFirst(unsigned Level) const { return DV[Level - 1].PeelFirst; } - // Returns true if peeling the last iteration from this loop // will break this dependence. bool FullDependence::isPeelLast(unsigned Level) const { @@ -387,14 +375,12 @@ bool FullDependence::isPeelLast(unsigned Level) const { return DV[Level - 1].PeelLast; } - // Returns true if splitting this loop will break the dependence. bool FullDependence::isSplitable(unsigned Level) const { assert(0 < Level && Level <= Levels && "Level out of range"); return DV[Level - 1].Splitable; } - //===----------------------------------------------------------------------===// // DependenceInfo::Constraint methods @@ -405,7 +391,6 @@ const SCEV *DependenceInfo::Constraint::getX() const { return A; } - // If constraint is a point <X, Y>, returns Y. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getY() const { @@ -413,7 +398,6 @@ const SCEV *DependenceInfo::Constraint::getY() const { return B; } - // If constraint is a line AX + BY = C, returns A. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getA() const { @@ -422,7 +406,6 @@ const SCEV *DependenceInfo::Constraint::getA() const { return A; } - // If constraint is a line AX + BY = C, returns B. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getB() const { @@ -431,7 +414,6 @@ const SCEV *DependenceInfo::Constraint::getB() const { return B; } - // If constraint is a line AX + BY = C, returns C. // Otherwise assert. const SCEV *DependenceInfo::Constraint::getC() const { @@ -440,7 +422,6 @@ const SCEV *DependenceInfo::Constraint::getC() const { return C; } - // If constraint is a distance, returns D. // Otherwise assert. 
const SCEV *DependenceInfo::Constraint::getD() const { @@ -448,7 +429,6 @@ const SCEV *DependenceInfo::Constraint::getD() const { return SE->getNegativeSCEV(C); } - // Returns the loop associated with this constraint. const Loop *DependenceInfo::Constraint::getAssociatedLoop() const { assert((Kind == Distance || Kind == Line || Kind == Point) && @@ -499,17 +479,16 @@ LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const { else if (isPoint()) OS << " Point is <" << *getX() << ", " << *getY() << ">\n"; else if (isDistance()) - OS << " Distance is " << *getD() << - " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n"; + OS << " Distance is " << *getD() << " (" << *getA() << "*X + " << *getB() + << "*Y = " << *getC() << ")\n"; else if (isLine()) - OS << " Line is " << *getA() << "*X + " << - *getB() << "*Y = " << *getC() << "\n"; + OS << " Line is " << *getA() << "*X + " << *getB() << "*Y = " << *getC() + << "\n"; else llvm_unreachable("unknown constraint type in Constraint::dump"); } #endif - // Updates X with the intersection // of the Constraints X and Y. Returns true if X has changed. // Corresponds to Figure 4 from the paper @@ -591,15 +570,14 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB()); const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB()); const SCEVConstant *C1A2_C2A1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1)); const SCEVConstant *C1B2_C2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1)); const SCEVConstant *A1B2_A2B1 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1)); const SCEVConstant *A2B1_A1B2 = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); - if (!C1B2_C2B1 || !C1A2_C2A1 || - !A1B2_A2B1 || !A2B1_A1B2) + dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2)); + if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; APInt Xtop = C1B2_C2B1->getAPInt(); APInt Xbot = A1B2_A2B1->getAPInt(); @@ -626,8 +604,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { ++DeltaSuccesses; return true; } - if (const SCEVConstant *CUB = - collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { + if (const SCEVConstant *CUB = collectConstantUpperBound( + X->getAssociatedLoop(), Prod1->getType())) { const APInt &UpperBound = CUB->getAPInt(); LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { @@ -636,8 +614,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return true; } } - X->setPoint(SE->getConstant(Xq), - SE->getConstant(Yq), + X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq), X->getAssociatedLoop()); ++DeltaSuccesses; return true; @@ -667,7 +644,6 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) { return false; } - //===----------------------------------------------------------------------===// // DependenceInfo methods @@ -737,8 +713,7 @@ void Dependence::dump(raw_ostream &OS) const { // tbaa, non-overlapping regions etc), then it is known there is no dependecy. // Otherwise the underlying objects are checked to see if they point to // different identifiable objects. 
-static AliasResult underlyingObjectsAlias(AAResults *AA, - const DataLayout &DL, +static AliasResult underlyingObjectsAlias(AAResults *AA, const DataLayout &DL, const MemoryLocation &LocA, const MemoryLocation &LocB) { // Check the original locations (minus size) for noalias, which can happen for @@ -773,8 +748,7 @@ static AliasResult underlyingObjectsAlias(AAResults *AA, // Returns true if the load or store can be analyzed. Atomic and volatile // operations have properties which this analysis does not understand. -static -bool isLoadOrStore(const Instruction *I) { +static bool isLoadOrStore(const Instruction *I) { if (const LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->isUnordered(); else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -782,7 +756,6 @@ bool isLoadOrStore(const Instruction *I) { return false; } - // Examines the loop nesting of the Src and Dst // instructions and establishes their shared loops. Sets the variables // CommonLevels, SrcLevels, and MaxLevels. @@ -860,14 +833,12 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src, MaxLevels -= CommonLevels; } - // Given one of the loops containing the source, return // its level index in our numbering scheme. unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const { return SrcLoop->getLoopDepth(); } - // Given one of the loops containing the destination, // return its level index in our numbering scheme. unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { @@ -880,7 +851,6 @@ unsigned DependenceInfo::mapDstLoop(const Loop *DstLoop) const { return D; } - // Returns true if Expression is loop invariant in LoopNest. bool DependenceInfo::isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const { @@ -896,8 +866,6 @@ bool DependenceInfo::isLoopInvariant(const SCEV *Expression, return SE->isLoopInvariant(Expression, LoopNest->getOutermostLoop()); } - - // Finds the set of loops from the LoopNest that // have a level <= CommonLevels and are referred to by the SCEV Expression. void DependenceInfo::collectCommonLoops(const SCEV *Expression, @@ -924,9 +892,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() > widestWidthSeen) { @@ -939,7 +907,6 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { } } - assert(widestWidthSeen > 0); // Now extend each pair to the widest seen. 
@@ -949,9 +916,9 @@ void DependenceInfo::unifySubscriptType(ArrayRef<Subscript *> Pairs) { IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType()); IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType()); if (SrcTy == nullptr || DstTy == nullptr) { - assert(SrcTy == DstTy && "This function only unify integer types and " - "expect Src and Dst share the same type " - "otherwise."); + assert(SrcTy == DstTy && + "This function only unify integer types and " + "expect Src and Dst share the same type otherwise."); continue; } if (SrcTy->getBitWidth() < widestWidthSeen) @@ -1028,7 +995,6 @@ bool DependenceInfo::checkDstSubscript(const SCEV *Dst, const Loop *LoopNest, return checkSubscript(Dst, LoopNest, Loops, false); } - // Examines the subscript pair (the Src and Dst SCEVs) // and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear. // Collects the associated loops in a set. @@ -1049,14 +1015,12 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, return Subscript::ZIV; if (N == 1) return Subscript::SIV; - if (N == 2 && (SrcLoops.count() == 0 || - DstLoops.count() == 0 || + if (N == 2 && (SrcLoops.count() == 0 || DstLoops.count() == 0 || (SrcLoops.count() == 1 && DstLoops.count() == 1))) return Subscript::RDIV; return Subscript::MIV; } - // A wrapper around SCEV::isKnownPredicate. // Looks for cases where we're interested in comparing for equality. // If both X and Y have been identically sign or zero extended, @@ -1069,12 +1033,9 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, // involving symbolics. bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const { - if (Pred == CmpInst::ICMP_EQ || - Pred == CmpInst::ICMP_NE) { - if ((isa<SCEVSignExtendExpr>(X) && - isa<SCEVSignExtendExpr>(Y)) || - (isa<SCEVZeroExtendExpr>(X) && - isa<SCEVZeroExtendExpr>(Y))) { + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + if ((isa<SCEVSignExtendExpr>(X) && isa<SCEVSignExtendExpr>(Y)) || + (isa<SCEVZeroExtendExpr>(X) && isa<SCEVZeroExtendExpr>(Y))) { const SCEVIntegralCastExpr *CX = cast<SCEVIntegralCastExpr>(X); const SCEVIntegralCastExpr *CY = cast<SCEVIntegralCastExpr>(Y); const SCEV *Xop = CX->getOperand(); @@ -1111,7 +1072,10 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, } } -/// Compare to see if S is less than Size, using isKnownNegative(S - max(Size, 1)) +/// Compare to see if S is less than Size, using +/// +/// isKnownNegative(S - max(Size, 1)) +/// /// with some extra checking if S is an AddRec and we can prove less-than using /// the loop bounds. bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { @@ -1178,7 +1142,6 @@ const SCEV *DependenceInfo::collectUpperBound(const Loop *L, Type *T) const { return nullptr; } - // Calls collectUpperBound(), then attempts to cast it to SCEVConstant. // If the cast fails, returns NULL. 
const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, @@ -1188,7 +1151,6 @@ const SCEVConstant *DependenceInfo::collectConstantUpperBound(const Loop *L, return nullptr; } - // testZIV - // When we have a pair of subscripts of the form [c1] and [c2], // where c1 and c2 are both loop invariant, we attack it using @@ -1218,7 +1180,6 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst, return false; // possibly dependent } - // strongSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.1 // @@ -1270,9 +1231,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound); LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); const SCEV *AbsDelta = - SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); + SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); const SCEV *AbsCoeff = - SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); + SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { // Distance greater than trip count - no dependence @@ -1286,7 +1247,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); - APInt Distance = ConstDelta; // these need to be initialized + APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); LLVM_DEBUG(dbgs() << "\t Distance = " << Distance << "\n"); @@ -1307,29 +1268,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, else Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else if (Delta->isZero()) { + } else if (Delta->isZero()) { // since 0/X == 0 Result.DV[Level].Distance = Delta; NewConstraint.setDistance(Delta, CurLoop); Result.DV[Level].Direction &= Dependence::DVEntry::EQ; ++StrongSIVsuccesses; - } - else { + } else { if (Coeff->isOne()) { LLVM_DEBUG(dbgs() << "\t Distance = " << *Delta << "\n"); Result.DV[Level].Distance = Delta; // since X/1 == X NewConstraint.setDistance(Delta, CurLoop); - } - else { + } else { Result.Consistent = false; - NewConstraint.setLine(Coeff, - SE->getNegativeSCEV(Coeff), + NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff), SE->getNegativeSCEV(Delta), CurLoop); } // maybe we can get a useful direction - bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); + bool DeltaMaybeZero = !SE->isKnownNonZero(Delta); bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta); bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta); bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff); @@ -1353,7 +1310,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, return false; } - // weakCrossingSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1447,8 +1403,8 @@ bool DependenceInfo::weakCrossingSIVtest( if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) { LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n"); const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2); - const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), - ConstantTwo); + const SCEV *ML = 
+ SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound), ConstantTwo); LLVM_DEBUG(dbgs() << "\t ML = " << *ML << "\n"); if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) { // Delta too big, no dependence @@ -1498,7 +1454,6 @@ bool DependenceInfo::weakCrossingSIVtest( return false; } - // Kirch's algorithm, from // // Optimizing Supercompilers for Supercomputers @@ -1519,9 +1474,11 @@ static bool findGCD(unsigned Bits, const APInt &AM, const APInt &BM, APInt R = G0; APInt::sdivrem(G0, G1, Q, R); while (R != 0) { + // clang-format off APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2; APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2; G0 = G1; G1 = R; + // clang-format on APInt::sdivrem(G0, G1, Q, R); } G = G1; @@ -1543,8 +1500,7 @@ static APInt floorOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q; else return Q - 1; @@ -1556,8 +1512,7 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) { APInt::sdivrem(A, B, Q, R); if (R == 0) return Q; - if ((A.sgt(0) && B.sgt(0)) || - (A.slt(0) && B.slt(0))) + if ((A.sgt(0) && B.sgt(0)) || (A.slt(0) && B.slt(0))) return Q + 1; else return Q; @@ -1733,17 +1688,14 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return Result.DV[Level].Direction == Dependence::DVEntry::NONE; } - // Return true if the divisor evenly divides the dividend. -static -bool isRemainderZero(const SCEVConstant *Dividend, - const SCEVConstant *Divisor) { +static bool isRemainderZero(const SCEVConstant *Dividend, + const SCEVConstant *Divisor) { const APInt &ConstDividend = Dividend->getAPInt(); const APInt &ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } - // weakZeroSrcSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1807,11 +1759,11 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1853,7 +1805,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff, return false; } - // weakZeroDstSIVtest - // From the paper, Practical Dependence Testing, Section 4.2.2 // @@ -1916,11 +1867,11 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff); if (!ConstCoeff) return false; - const SCEV *AbsCoeff = - SE->isKnownNegative(ConstCoeff) ? - SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; + const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) + ? SE->getNegativeSCEV(ConstCoeff) + : ConstCoeff; const SCEV *NewDelta = - SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta; + SE->isKnownNegative(ConstCoeff) ? 
SE->getNegativeSCEV(Delta) : Delta; // check that Delta/SrcCoeff < iteration count // really check NewDelta < count*AbsCoeff @@ -1962,7 +1913,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff, return false; } - // exactRDIVtest - Tests the RDIV subscript pair for dependence. // Things of the form [c1 + a*i] and [c2 + b*j], // where i and j are induction variable, c1 and c2 are loop invariant, @@ -2084,7 +2034,6 @@ bool DependenceInfo::exactRDIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff, return TL.sgt(TU); } - // symbolicRDIVtest - // In Section 4.5 of the Practical Dependence Testing paper,the authors // introduce a special case of Banerjee's Inequalities (also called the @@ -2167,8 +2116,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 >= 0 && a2 <= 0 if (N1 && N2) { // make sure that c2 - c1 <= a1*N1 - a2*N2 @@ -2187,8 +2135,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return true; } } - } - else if (SE->isKnownNonPositive(A1)) { + } else if (SE->isKnownNonPositive(A1)) { if (SE->isKnownNonNegative(A2)) { // a1 <= 0 && a2 >= 0 if (N1 && N2) { @@ -2207,8 +2154,7 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, ++SymbolicRDIVindependence; return true; } - } - else if (SE->isKnownNonPositive(A2)) { + } else if (SE->isKnownNonPositive(A2)) { // a1 <= 0 && a2 <= 0 if (N1) { // make sure that a1*N1 <= c2 - c1 @@ -2233,7 +2179,6 @@ bool DependenceInfo::symbolicRDIVtest(const SCEV *A1, const SCEV *A2, return false; } - // testSIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i] // where i is an induction variable, c1 and c2 are loop invariant, and a1 and @@ -2260,17 +2205,17 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, Level = mapSrcLoop(CurLoop); bool disproven; if (SrcCoeff == DstCoeff) - disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint); + disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint); else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff)) disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint, SplitIter); else disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, Level, Result, NewConstraint); - return disproven || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop); + return disproven || gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, + CurLoop); } if (SrcAddRec) { const SCEV *SrcConst = SrcAddRec->getStart(); @@ -2278,9 +2223,9 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *DstConst = Dst; const Loop *CurLoop = SrcAddRec->getLoop(); Level = mapSrcLoop(CurLoop); - return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, - Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } if (DstAddRec) { const SCEV *DstConst = DstAddRec->getStart(); @@ -2288,15 +2233,14 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level, const SCEV *SrcConst = Src; const Loop *CurLoop = DstAddRec->getLoop(); Level = mapDstLoop(CurLoop); - return 
weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, - CurLoop, Level, Result, NewConstraint) || - gcdMIVtest(Src, Dst, Result); + return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurLoop, Level, + Result, NewConstraint) || + gcdMIVtest(Src, Dst, Result); } llvm_unreachable("SIV test expected at least one AddRec"); return false; } - // testRDIV - // When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j] // where i and j are induction variables, c1 and c2 are loop invariant, @@ -2333,46 +2277,37 @@ bool DependenceInfo::testRDIV(const SCEV *Src, const SCEV *Dst, DstConst = DstAddRec->getStart(); DstCoeff = DstAddRec->getStepRecurrence(*SE); DstLoop = DstAddRec->getLoop(); - } - else if (SrcAddRec) { + } else if (SrcAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) { SrcConst = tmpAddRec->getStart(); SrcCoeff = tmpAddRec->getStepRecurrence(*SE); SrcLoop = tmpAddRec->getLoop(); DstConst = Dst; DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE)); DstLoop = SrcAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else if (DstAddRec) { + } else if (DstAddRec) { if (const SCEVAddRecExpr *tmpAddRec = - dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { + dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) { DstConst = tmpAddRec->getStart(); DstCoeff = tmpAddRec->getStepRecurrence(*SE); DstLoop = tmpAddRec->getLoop(); SrcConst = Src; SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE)); SrcLoop = DstAddRec->getLoop(); - } - else + } else llvm_unreachable("RDIV reached by surprising SCEVs"); - } - else + } else llvm_unreachable("RDIV expected at least one AddRec"); - return exactRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop, + return exactRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, DstLoop, Result) || - gcdMIVtest(Src, Dst, Result) || - symbolicRDIVtest(SrcCoeff, DstCoeff, - SrcConst, DstConst, - SrcLoop, DstLoop); + gcdMIVtest(Src, Dst, Result) || + symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, SrcLoop, + DstLoop); } - // Tests the single-subscript MIV pair (Src and Dst) for dependence. // Return true if dependence disproved. // Can sometimes refine direction vectors. @@ -2383,7 +2318,7 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, LLVM_DEBUG(dbgs() << " dst = " << *Dst << "\n"); Result.Consistent = false; return gcdMIVtest(Src, Dst, Result) || - banerjeeMIVtest(Src, Dst, Loops, Result); + banerjeeMIVtest(Src, Dst, Loops, Result); } // Given a product, e.g., 10*X*Y, returns the first constant operand, @@ -2428,7 +2363,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. const SCEV *Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2446,7 +2381,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, // we can't quit the loop just because the GCD == 1. 
Coefficients = Dst; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. @@ -2468,16 +2403,14 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, if (isa<SCEVConstant>(Operand)) { assert(!Constant && "Surprised to find multiple constants"); Constant = cast<SCEVConstant>(Operand); - } - else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { + } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. std::optional<APInt> ConstOp = getConstantPart(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); - } - else + } else return false; } } @@ -2512,7 +2445,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, bool Improved = false; Coefficients = Src; while (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(Coefficients)) { + dyn_cast<SCEVAddRecExpr>(Coefficients)) { Coefficients = AddRec->getStart(); const Loop *CurLoop = AddRec->getLoop(); RunningGCD = ExtraGCD; @@ -2578,7 +2511,6 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, return false; } - //===----------------------------------------------------------------------===// // banerjeeMIVtest - // Use Banerjee's Inequalities to test an MIV subscript pair. @@ -2652,8 +2584,8 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) { // Explore the direction vector hierarchy. unsigned DepthExpanded = 0; - unsigned NewDeps = exploreDirections(1, A, B, Bound, - Loops, DepthExpanded, Delta); + unsigned NewDeps = + exploreDirections(1, A, B, Bound, Loops, DepthExpanded, Delta); if (NewDeps > 0) { bool Improved = false; for (unsigned K = 1; K <= CommonLevels; ++K) { @@ -2670,23 +2602,20 @@ bool DependenceInfo::banerjeeMIVtest(const SCEV *Src, const SCEV *Dst, } if (Improved) ++BanerjeeSuccesses; - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - } - else { + } else { ++BanerjeeIndependence; Disproved = true; } - delete [] Bound; - delete [] A; - delete [] B; + delete[] Bound; + delete[] A; + delete[] B; return Disproved; } - // Hierarchically expands the direction vector // search space, combining the directions of discovered dependences // in the DirSet field of Bound. Returns the number of distinct @@ -2788,27 +2717,26 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A, // test bounds for <, *, *, ... if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // Test bounds for =, *, *, ... if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); // test bounds for >, *, *, ... 
if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta)) - NewDeps += exploreDirections(Level + 1, A, B, Bound, - Loops, DepthExpanded, Delta); + NewDeps += exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); Bound[Level].Direction = Dependence::DVEntry::ALL; return NewDeps; - } - else - return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta); + } else + return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, + Delta); } - // Returns true iff the current bounds are plausible. bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, const SCEV *Delta) const { @@ -2822,7 +2750,6 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, return true; } - // Computes the upper and lower bounds for level K // using the * direction. Records them in Bound. // Wolfe gives the equations @@ -2840,17 +2767,16 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, // and the upper bound is always >= 0. void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::ALL] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::ALL] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), - Bound[K].Iterations); - Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), - Bound[K].Iterations); - } - else { + Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), Bound[K].Iterations); + Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getMulExpr( + SE->getMinusSCEV(A[K].PosPart, B[K].NegPart), Bound[K].Iterations); + } else { // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = @@ -2861,7 +2787,6 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the = direction. Records them in Bound. // Wolfe gives the equations @@ -2879,18 +2804,19 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, // and the upper bound is always >= 0. void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::EQ] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::EQ] = + nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); const SCEV *NegativePart = getNegativePart(Delta); Bound[K].Lower[Dependence::DVEntry::EQ] = - SE->getMulExpr(NegativePart, Bound[K].Iterations); + SE->getMulExpr(NegativePart, Bound[K].Iterations); const SCEV *PositivePart = getPositivePart(Delta); Bound[K].Upper[Dependence::DVEntry::EQ] = - SE->getMulExpr(PositivePart, Bound[K].Iterations); - } - else { + SE->getMulExpr(PositivePart, Bound[K].Iterations); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); @@ -2903,7 +2829,6 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, } } - // Computes the upper and lower bounds for level K // using the < direction. Records them in Bound. // Wolfe gives the equations @@ -2919,35 +2844,35 @@ void DependenceInfo::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::LT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::LT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); + SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); Bound[K].Upper[Dependence::DVEntry::LT] = - SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); - } - else { + SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); + getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); + getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff); } } - // Computes the upper and lower bounds for level K // using the > direction. Records them in Bound. // Wolfe gives the equations @@ -2963,45 +2888,45 @@ void DependenceInfo::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, // We must be careful to handle the case where the upper bound is unknown. void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. 
+ Bound[K].Lower[Dependence::DVEntry::GT] = + nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::GT] = + nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV( Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = - getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); + SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff); const SCEV *PosPart = - getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); Bound[K].Upper[Dependence::DVEntry::GT] = - SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); - } - else { + SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff); + } else { // If the positive/negative part of the difference is 0, // we won't need to know the number of iterations. - const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); + const SCEV *NegPart = + getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); if (NegPart->isZero()) Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff; - const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); + const SCEV *PosPart = + getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart)); if (PosPart->isZero()) Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff; } } - // X^+ = max(X, 0) const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const { return SE->getSMaxExpr(X, SE->getZero(X->getType())); } - // X^- = min(X, 0) const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const { return SE->getSMinExpr(X, SE->getZero(X->getType())); } - // Walks through the subscript, // collecting each coefficient, the associated loop bounds, // and recording its positive and negative parts for later use. @@ -3046,7 +2971,6 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, return CI; } - // Looks through all the bounds info and // computes the lower bound given the current direction settings // at each level. If the lower bound for any level is -inf, @@ -3062,7 +2986,6 @@ const SCEV *DependenceInfo::getLowerBound(BoundInfo *Bound) const { return Sum; } - // Looks through all the bounds info and // computes the upper bound given the current direction settings // at each level. If the upper bound at any level is +inf, @@ -3078,7 +3001,6 @@ const SCEV *DependenceInfo::getUpperBound(BoundInfo *Bound) const { return Sum; } - //===----------------------------------------------------------------------===// // Constraint manipulation for Delta test. @@ -3098,7 +3020,6 @@ const SCEV *DependenceInfo::findCoefficient(const SCEV *Expr, return findCoefficient(AddRec->getStart(), TargetLoop); } - // Given a linear SCEV, // return the SCEV given by zeroing out the coefficient // corresponding to the specified loop. @@ -3112,12 +3033,10 @@ const SCEV *DependenceInfo::zeroCoefficient(const SCEV *Expr, if (AddRec->getLoop() == TargetLoop) return AddRec->getStart(); return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop), - AddRec->getStepRecurrence(*SE), - AddRec->getLoop(), + AddRec->getStepRecurrence(*SE), AddRec->getLoop(), AddRec->getNoWrapFlags()); } - // Given a linear SCEV Expr, // return the SCEV given by adding some Value to the // coefficient corresponding to the specified TargetLoop. 
@@ -3128,17 +3047,13 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, const SCEV *Value) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) // create a new addRec - return SE->getAddRecExpr(Expr, - Value, - TargetLoop, + return SE->getAddRecExpr(Expr, Value, TargetLoop, SCEV::FlagAnyWrap); // Worst case, with no info. if (AddRec->getLoop() == TargetLoop) { const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value); if (Sum->isZero()) return AddRec->getStart(); - return SE->getAddRecExpr(AddRec->getStart(), - Sum, - AddRec->getLoop(), + return SE->getAddRecExpr(AddRec->getStart(), Sum, AddRec->getLoop(), AddRec->getNoWrapFlags()); } if (SE->isLoopInvariant(AddRec, TargetLoop)) @@ -3149,7 +3064,6 @@ const SCEV *DependenceInfo::addToCoefficient(const SCEV *Expr, AddRec->getNoWrapFlags()); } - // Review the constraints, looking for opportunities // to simplify a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3178,7 +3092,6 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst, return Result; } - // Attempt to propagate a distance // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3204,7 +3117,6 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a line // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3224,22 +3136,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, if (A->isZero()) { const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Bconst || !Cconst) return false; + if (!Bconst || !Cconst) + return false; APInt Beta = Bconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); - // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB))); Dst = zeroCoefficient(Dst, CurLoop); if (!findCoefficient(Src, CurLoop)->isZero()) Consistent = false; - } - else if (B->isZero()) { + } else if (B->isZero()) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3249,11 +3161,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Src = zeroCoefficient(Src, CurLoop); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { + } else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) { const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); - if (!Aconst || !Cconst) return false; + if (!Aconst || !Cconst) + return false; APInt Alpha = Aconst->getAPInt(); APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); @@ -3264,8 +3176,7 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, Dst = addToCoefficient(Dst, CurLoop, A_K); if (!findCoefficient(Dst, CurLoop)->isZero()) Consistent = false; - } - else { + } else { // paper is incorrect here, or perhaps just misleading const SCEV *A_K = 
findCoefficient(Src, CurLoop); Src = SE->getMulExpr(Src, A); @@ -3281,7 +3192,6 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst, return true; } - // Attempt to propagate a point // constraint into a subscript pair (Src and Dst). // Return true if some simplification occurs. @@ -3302,7 +3212,6 @@ bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst, return true; } - // Update direction vector entry based on the current constraint. void DependenceInfo::updateDirection(Dependence::DVEntry &Level, const Constraint &CurConstraint) const { @@ -3322,34 +3231,28 @@ void DependenceInfo::updateDirection(Dependence::DVEntry &Level, if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else if (CurConstraint.isLine()) { + } else if (CurConstraint.isLine()) { Level.Scalar = false; Level.Distance = nullptr; // direction should be accurate - } - else if (CurConstraint.isPoint()) { + } else if (CurConstraint.isPoint()) { Level.Scalar = false; Level.Distance = nullptr; unsigned NewDirection = Dependence::DVEntry::NONE; - if (!isKnownPredicate(CmpInst::ICMP_NE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(), CurConstraint.getX())) // if X may be = Y NewDirection |= Dependence::DVEntry::EQ; - if (!isKnownPredicate(CmpInst::ICMP_SLE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SLE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be > X NewDirection |= Dependence::DVEntry::LT; - if (!isKnownPredicate(CmpInst::ICMP_SGE, - CurConstraint.getY(), + if (!isKnownPredicate(CmpInst::ICMP_SGE, CurConstraint.getY(), CurConstraint.getX())) // if Y may be < X NewDirection |= Dependence::DVEntry::GT; Level.Direction &= NewDirection; - } - else + } else llvm_unreachable("constraint has unexpected kind"); } @@ -3425,7 +3328,7 @@ bool DependenceInfo::tryDelinearizeFixedSize( dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); assert(SrcBase && DstBase && SrcBase == DstBase && "expected src and dst scev unknowns to be equal"); - }); + }); SmallVector<int, 4> SrcSizes; SmallVector<int, 4> DstSizes; @@ -3737,9 +3640,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); LLVM_DEBUG(dbgs() << " subscript " << P << "\n"); @@ -3814,18 +3716,15 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later ++NonlinearSubscriptPairs; - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; } else if (Pair[SI].Classification == Subscript::ZIV) { // always separable Separable.set(SI); - } - else { + } else { // SIV, RDIV, or MIV, so check for coupled group bool Done = true; for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) { @@ -3843,8 +3742,7 @@ 
DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (Pair[SI].Group.count() == 1) { Separable.set(SI); ++SeparableSubscriptPairs; - } - else { + } else { Coupled.set(SI); ++CoupledSubscriptPairs; } @@ -3950,10 +3848,9 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, Constraints, Result.Consistent)) { LLVM_DEBUG(dbgs() << "\t Changed\n"); ++DeltaPropagations; - Pair[SJ].Classification = - classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()), - Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()), - Pair[SJ].Loops); + Pair[SJ].Classification = classifyPair( + Pair[SJ].Src, LI->getLoopFor(Src->getParent()), Pair[SJ].Dst, + LI->getLoopFor(Dst->getParent()), Pair[SJ].Loops); switch (Pair[SJ].Classification) { case Subscript::ZIV: LLVM_DEBUG(dbgs() << "ZIV\n"); @@ -3995,8 +3892,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, LLVM_DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) return nullptr; - } - else + } else llvm_unreachable("expected only MIV subscripts at this point"); } @@ -4052,8 +3948,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, break; } } - } - else { + } else { // On the other hand, if all directions are equal and there's no // loop-independent dependence possible, then no dependence exists. bool AllEqual = true; @@ -4158,9 +4053,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, Pair[P].Group.resize(Pairs); removeMatchingExtensions(&Pair[P]); Pair[P].Classification = - classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), - Pair[P].Dst, LI->getLoopFor(Dst->getParent()), - Pair[P].Loops); + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst, + LI->getLoopFor(Dst->getParent()), Pair[P].Loops); Pair[P].GroupLoops = Pair[P].Loops; Pair[P].Group.set(P); } @@ -4172,15 +4066,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, for (unsigned SI = 0; SI < Pairs; ++SI) { if (Pair[SI].Classification == Subscript::NonLinear) { // ignore these, but collect loops for later - collectCommonLoops(Pair[SI].Src, - LI->getLoopFor(Src->getParent()), + collectCommonLoops(Pair[SI].Src, LI->getLoopFor(Src->getParent()), Pair[SI].Loops); - collectCommonLoops(Pair[SI].Dst, - LI->getLoopFor(Dst->getParent()), + collectCommonLoops(Pair[SI].Dst, LI->getLoopFor(Dst->getParent()), Pair[SI].Loops); Result.Consistent = false; - } - else if (Pair[SI].Classification == Subscript::ZIV) + } else if (Pair[SI].Classification == Subscript::ZIV) Separable.set(SI); else { // SIV, RDIV, or MIV, so check for coupled group @@ -4214,8 +4105,8 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, case Subscript::SIV: { unsigned Level; const SCEV *SplitIter = nullptr; - (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level, - Result, NewConstraint, SplitIter); + (void)testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, + SplitIter); if (Level == SplitLevel) { assert(SplitIter != nullptr); return SplitIter; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index b3b4c37..425ea31 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: 
case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index f9d7e76..67f526f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1292,12 +1292,10 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const { } } -DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, - const DISubprogram *CalleeSP, - bool IsTail, - const MCSymbol *PCAddr, - const MCSymbol *CallAddr, - unsigned CallReg) { +DIE &DwarfCompileUnit::constructCallSiteEntryDIE( + DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail, + const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg, + DIType *AllocSiteTy) { // Insert a call site entry DIE within ScopeDIE. DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site), ScopeDIE, nullptr); @@ -1306,7 +1304,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, // Indirect call. addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target), MachineLocation(CallReg)); - } else { + } else if (CalleeSP) { DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP); assert(CalleeDIE && "Could not create DIE for call site entry origin"); if (AddLinkageNamesToDeclCallOriginsForTuning(DD) && @@ -1351,6 +1349,9 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_return_pc), PCAddr); } + if (AllocSiteTy) + addType(CallSiteDIE, AllocSiteTy, dwarf::DW_AT_LLVM_alloc_type); + return CallSiteDIE; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 09be22c..c2f6ca0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -289,7 +289,8 @@ public: /// the \p CallReg is set to 0. DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail, const MCSymbol *PCAddr, - const MCSymbol *CallAddr, unsigned CallReg); + const MCSymbol *CallAddr, unsigned CallReg, + DIType *AllocSiteTy); /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params /// were collected by the \ref collectCallSiteParameters. /// Note: The order of parameters does not matter, since debuggers recognize diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5ae2d2a..c27f100 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -936,6 +936,8 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, if (MI.hasDelaySlot() && !delaySlotSupported(*&MI)) return; + DIType *AllocSiteTy = dyn_cast_or_null<DIType>(MI.getHeapAllocMarker()); + // If this is a direct call, find the callee's subprogram. 
// In the case of an indirect call find the register that holds // the callee. @@ -950,23 +952,23 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, PhysRegCalleeOperand = PhysRegCalleeOperand && MCOI.OperandType == MCOI::OPERAND_REGISTER; } - if (!CalleeOp.isGlobal() && !PhysRegCalleeOperand) - continue; unsigned CallReg = 0; const DISubprogram *CalleeSP = nullptr; const Function *CalleeDecl = nullptr; if (PhysRegCalleeOperand) { - CallReg = CalleeOp.getReg(); - if (!CallReg) - continue; - } else { + CallReg = CalleeOp.getReg(); // might be zero + } else if (CalleeOp.isGlobal()) { CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal()); - if (!CalleeDecl || !CalleeDecl->getSubprogram()) - continue; - CalleeSP = CalleeDecl->getSubprogram(); + if (CalleeDecl) + CalleeSP = CalleeDecl->getSubprogram(); // might be nullptr } + // Omit DIE if we can't tell where the call goes *and* we don't want to + // add metadata to it. + if (CalleeSP == nullptr && CallReg == 0 && AllocSiteTy == nullptr) + continue; + // TODO: Omit call site entries for runtime calls (objc_msgSend, etc). bool IsTail = TII->isTailCall(MI); @@ -1000,7 +1002,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, << (IsTail ? " [IsTail]" : "") << "\n"); DIE &CallSiteDIE = CU.constructCallSiteEntryDIE( - ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg); + ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy); // Optionally emit call-site-param debug info. if (emitDebugEntryValues()) { diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 9ba1782..0f3ec8b 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -132,9 +132,10 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, unsigned i = 0; unsigned NumFixedArgs = CB.getFunctionType()->getNumParams(); for (const auto &Arg : CB.args()) { - ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i), - i < NumFixedArgs}; + ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i)}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB); + if (i >= NumFixedArgs) + OrigArg.Flags[0].setVarArg(); // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. @@ -301,7 +302,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, // double] -> double). 
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), OrigArg.OrigArgIndex, OrigArg.Flags[0], - OrigArg.IsFixed, OrigArg.OrigValue); + OrigArg.OrigValue); return; } @@ -313,7 +314,7 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex, - OrigArg.Flags[0], OrigArg.IsFixed); + OrigArg.Flags[0]); if (NeedsRegBlock) SplitArgs.back().Flags[0].setInConsecutiveRegs(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d70e96938..7341914 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9390,8 +9390,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { LLVMContext &Context = *DAG.getContext(); unsigned NumStores = Stores.size(); unsigned WideNumBits = NumStores * NarrowNumBits; - EVT WideVT = EVT::getIntegerVT(Context, WideNumBits); - if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64) + if (WideNumBits != 16 && WideNumBits != 32 && WideNumBits != 64) return SDValue(); // Check if all bytes of the source value that we are looking at are stored @@ -9445,7 +9444,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { SourceValue = WideVal; // Give up if the source value type is smaller than the store size. - if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits()) + if (SourceValue.getScalarValueSizeInBits() < WideNumBits) return SDValue(); } @@ -9469,6 +9468,8 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { OffsetMap[Offset] = ByteOffsetFromBase; } + EVT WideVT = EVT::getIntegerVT(Context, WideNumBits); + assert(FirstOffset != INT64_MAX && "First byte offset must be set"); assert(FirstStore && "First store must be set"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 71a175d..bfa72bf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5630,6 +5630,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FDIV: case ISD::FREM: case ISD::FCOPYSIGN: + case ISD::FP_EXTEND: // No poison except from flags (which is handled above) return false; @@ -6422,6 +6423,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1.isUndef()) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); + + // Skip unnecessary sext_inreg pattern: + // (sext (trunc x)) -> x iff the upper bits are all signbits. 
+ if (OpOpcode == ISD::TRUNCATE) { + SDValue OpOp = N1.getOperand(0); + if (OpOp.getValueType() == VT) { + unsigned NumSignExtBits = + VT.getScalarSizeInBits() - N1.getScalarValueSizeInBits(); + if (ComputeNumSignBits(OpOp) > NumSignExtBits) { + transferDbgValues(N1, OpOp); + return OpOp; + } + } + } break; case ISD::ZERO_EXTEND: assert(VT.isInteger() && N1.getValueType().isInteger() && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index d0815e9..868e2f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2273,9 +2273,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { Flags.setNoExt(); for (unsigned i = 0; i < NumParts; ++i) { - Outs.push_back(ISD::OutputArg(Flags, - Parts[i].getValueType().getSimpleVT(), - VT, /*isfixed=*/true, 0, 0)); + Outs.push_back(ISD::OutputArg( + Flags, Parts[i].getValueType().getSimpleVT(), VT, 0, 0)); OutVals.push_back(Parts[i]); } } @@ -2291,9 +2290,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { assert(SwiftError.getFunctionArg() && "Need a swift error argument"); ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); Flags.setSwiftError(); - Outs.push_back(ISD::OutputArg( - Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)), - /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0)); + Outs.push_back(ISD::OutputArg(Flags, /*vt=*/TLI.getPointerTy(DL), + /*argvt=*/EVT(TLI.getPointerTy(DL)), + /*origidx=*/1, /*partOffs=*/0)); // Create SDNode for the swifterror virtual register. OutVals.push_back( DAG.getRegister(SwiftError.getOrCreateVRegUseAt( @@ -11124,6 +11123,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL)); Flags.setOrigAlign(OriginalAlignment); + if (i >= CLI.NumFixedArgs) + Flags.setVarArg(); if (Args[i].Ty->isPointerTy()) { Flags.setPointer(); Flags.setPointerAddrSpace( @@ -11246,8 +11247,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // For scalable vectors the scalable part is currently handled // by individual targets, so we just use the known minimum size here. 
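The (sext (trunc x)) -> x fold added to SelectionDAG::getNode above rests on a counting argument: widening from m bits back to n bits reproduces x only when x already carries more than n - m sign bits, which is exactly the ComputeNumSignBits(OpOp) > NumSignExtBits test. A standalone sketch of the same condition on scalars (plain C++, not the SelectionDAG API; numSignBits is a hypothetical stand-in for ComputeNumSignBits):

#include <cstdint>
#include <cstdio>

// Number of leading bits that are copies of the sign bit, counting the sign
// bit itself (a scalar analogue of ComputeNumSignBits).
static unsigned numSignBits(int64_t V) {
  uint64_t U = (uint64_t)V;
  unsigned N = 1;
  while (N < 64 && ((U >> (63 - N)) & 1) == (U >> 63))
    ++N;
  return N;
}

int main() {
  // sext_i64(trunc_i16(x)) == x exactly when x has more than 64 - 16 = 48
  // sign bits.
  for (int64_t X : {int64_t(1234), int64_t(-42), int64_t(70000)}) {
    int64_t RoundTrip = (int64_t)(int16_t)X;
    bool FoldLegal = numSignBits(X) > 48;
    std::printf("x=%lld sign-bits=%u fold-legal=%d round-trip-equal=%d\n",
                (long long)X, numSignBits(X), FoldLegal, RoundTrip == X);
  }
}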
ISD::OutputArg MyFlags( - Flags, Parts[j].getValueType().getSimpleVT(), VT, - i < CLI.NumFixedArgs, i, + Flags, Parts[j].getValueType().getSimpleVT(), VT, i, j * Parts[j].getValueType().getStoreSize().getKnownMinValue()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9f525ea..17a01f48 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0)); + Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, 0, 0)); } } diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 6ddb12b..8052773 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -109,6 +109,7 @@ static bool isODRAttribute(uint16_t Attr) { case dwarf::DW_AT_specification: case dwarf::DW_AT_abstract_origin: case dwarf::DW_AT_import: + case dwarf::DW_AT_LLVM_alloc_type: return true; } llvm_unreachable("Improper attribute."); diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 78c20a6..79904fc 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -18,24 +18,6 @@ namespace hlsl { namespace rootsig { template <typename T> -static std::optional<StringRef> getEnumName(const T Value, - ArrayRef<EnumEntry<T>> Enums) { - for (const auto &EnumItem : Enums) - if (EnumItem.Value == Value) - return EnumItem.Name; - return std::nullopt; -} - -template <typename T> -static raw_ostream &printEnum(raw_ostream &OS, const T Value, - ArrayRef<EnumEntry<T>> Enums) { - auto MaybeName = getEnumName(Value, Enums); - if (MaybeName) - OS << *MaybeName; - return OS; -} - -template <typename T> static raw_ostream &printFlags(raw_ostream &OS, const T Value, ArrayRef<EnumEntry<T>> Flags) { bool FlagSet = false; @@ -46,9 +28,9 @@ static raw_ostream &printFlags(raw_ostream &OS, const T Value, if (FlagSet) OS << " | "; - auto MaybeFlag = getEnumName(T(Bit), Flags); - if (MaybeFlag) - OS << *MaybeFlag; + StringRef MaybeFlag = enumToStringRef(T(Bit), Flags); + if (!MaybeFlag.empty()) + OS << MaybeFlag; else OS << "invalid: " << Bit; @@ -70,43 +52,42 @@ static const EnumEntry<RegisterType> RegisterNames[] = { }; static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) { - printEnum(OS, Reg.ViewType, ArrayRef(RegisterNames)); - OS << Reg.Number; + OS << enumToStringRef(Reg.ViewType, ArrayRef(RegisterNames)) << Reg.Number; return OS; } static raw_ostream &operator<<(raw_ostream &OS, const llvm::dxbc::ShaderVisibility &Visibility) { - printEnum(OS, Visibility, dxbc::getShaderVisibility()); + OS << enumToStringRef(Visibility, dxbc::getShaderVisibility()); return OS; } static raw_ostream &operator<<(raw_ostream &OS, const llvm::dxbc::SamplerFilter &Filter) { - printEnum(OS, Filter, dxbc::getSamplerFilters()); + OS << enumToStringRef(Filter, dxbc::getSamplerFilters()); return OS; } static raw_ostream &operator<<(raw_ostream &OS, const dxbc::TextureAddressMode &Address) { - printEnum(OS, Address, dxbc::getTextureAddressModes()); + OS << enumToStringRef(Address, dxbc::getTextureAddressModes()); return OS; } static raw_ostream &operator<<(raw_ostream &OS, const dxbc::ComparisonFunc 
&CompFunc) { - printEnum(OS, CompFunc, dxbc::getComparisonFuncs()); + OS << enumToStringRef(CompFunc, dxbc::getComparisonFuncs()); return OS; } static raw_ostream &operator<<(raw_ostream &OS, const dxbc::StaticBorderColor &BorderColor) { - printEnum(OS, BorderColor, dxbc::getStaticBorderColors()); + OS << enumToStringRef(BorderColor, dxbc::getStaticBorderColors()); return OS; } @@ -119,8 +100,8 @@ static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { }; static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { - printEnum(OS, dxil::ResourceClass(llvm::to_underlying(Type)), - ArrayRef(ResourceClassNames)); + OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), + ArrayRef(ResourceClassNames)); return OS; } diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 6d89fa7..9cf4ed1 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -58,13 +58,6 @@ static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { {"Sampler", dxil::ResourceClass::Sampler}, }; -static std::optional<StringRef> getResourceName(dxil::ResourceClass Class) { - for (const auto &ClassEnum : ResourceClassNames) - if (ClassEnum.Value == Class) - return ClassEnum.Name; - return std::nullopt; -} - namespace { // We use the OverloadVisit with std::visit to ensure the compiler catches if a @@ -133,10 +126,11 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) { MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); - std::optional<StringRef> ResName = - getResourceName(dxil::ResourceClass(to_underlying(Descriptor.Type))); - assert(ResName && "Provided an invalid Resource Class"); - SmallString<7> Name({"Root", *ResName}); + StringRef ResName = + enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), + ArrayRef(ResourceClassNames)); + assert(!ResName.empty() && "Provided an invalid Resource Class"); + SmallString<7> Name({"Root", ResName}); Metadata *Operands[] = { MDString::get(Ctx, Name), ConstantAsMetadata::get( @@ -174,11 +168,12 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { MDNode *MetadataBuilder::BuildDescriptorTableClause( const DescriptorTableClause &Clause) { IRBuilder<> Builder(Ctx); - std::optional<StringRef> ResName = - getResourceName(dxil::ResourceClass(to_underlying(Clause.Type))); - assert(ResName && "Provided an invalid Resource Class"); + StringRef ResName = + enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), + ArrayRef(ResourceClassNames)); + assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { - MDString::get(Ctx, *ResName), + MDString::get(Ctx, ResName), ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)), ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)), ConstantAsMetadata::get(Builder.getInt32(Clause.Space)), diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp index e0a9758..7dd0611 100644 --- a/llvm/lib/LTO/LTOModule.cpp +++ b/llvm/lib/LTO/LTOModule.cpp @@ -203,8 +203,10 @@ LTOModule::makeLTOModule(MemoryBufferRef Buffer, const TargetOptions &options, // find machine architecture for this module std::string errMsg; const Target *march = TargetRegistry::lookupTarget(Triple, errMsg); - if (!march) + if (!march) { + Context.emitError(errMsg); return make_error_code(object::object_error::arch_not_found); + } 
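The enumToStringRef switch in HLSLRootSignature.cpp above removes a local name-lookup helper; the printing pattern it feeds is just a value-to-name table walk plus a bit walk for flag masks. A minimal standalone sketch of that shape (plain C++; Entry and the flag names are invented for illustration, not llvm::EnumEntry or real root-signature flags):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Invented stand-ins for llvm::EnumEntry / enumToStringRef.
struct Entry { const char *Name; uint32_t Value; };

static const char *lookupName(uint32_t Value, const std::vector<Entry> &Table) {
  for (const Entry &E : Table)
    if (E.Value == Value)
      return E.Name;
  return nullptr;
}

// Walk the set bits, name the known ones, join with " | ", the same shape as
// the printFlags helper kept by the patch.
static std::string formatFlags(uint32_t Value, const std::vector<Entry> &Flags) {
  std::string Out;
  for (uint32_t Bit = 1; Bit != 0; Bit <<= 1) {
    if (!(Value & Bit))
      continue;
    if (!Out.empty())
      Out += " | ";
    const char *Name = lookupName(Bit, Flags);
    Out += Name ? Name : "invalid";
  }
  return Out.empty() ? "None" : Out;
}

int main() {
  std::vector<Entry> Flags = {{"FlagA", 0x1}, {"FlagB", 0x2}, {"FlagC", 0x8}};
  std::printf("%s\n", formatFlags(0x9, Flags).c_str());  // FlagA | FlagC
}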
// construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 93614cd..9a5e070 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -2432,6 +2432,11 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, void MCAsmStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { + if (CurFrag) { + MCSection *Sec = getCurrentSectionOnly(); + Sec->setHasInstructions(true); + } + if (MAI->isAIX() && CurFrag) // Now that a machine instruction has been assembled into this section, make // a line entry for any .loc directive that has been seen. diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp index 89c1df8..07514dd 100644 --- a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp @@ -301,7 +301,7 @@ void MachOWriter::writeSymbolTable() { O.LoadCommands[*O.SymTabCommandIndex] .MachOLoadCommand.symtab_command_data; - char *SymTable = (char *)Buf->getBufferStart() + SymTabCommand.symoff; + char *SymTable = Buf->getBufferStart() + SymTabCommand.symoff; for (auto &Symbol : O.SymTable.Symbols) { SymbolEntry *Sym = Symbol.get(); uint32_t Nstrx = LayoutBuilder.getStringTableBuilder().getOffset(Sym->Name); @@ -319,7 +319,7 @@ void MachOWriter::writeRebaseInfo() { const MachO::dyld_info_command &DyLdInfoCommand = O.LoadCommands[*O.DyLdInfoCommandIndex] .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.rebase_off; + char *Out = Buf->getBufferStart() + DyLdInfoCommand.rebase_off; assert((DyLdInfoCommand.rebase_size == O.Rebases.Opcodes.size()) && "Incorrect rebase opcodes size"); memcpy(Out, O.Rebases.Opcodes.data(), O.Rebases.Opcodes.size()); @@ -331,7 +331,7 @@ void MachOWriter::writeBindInfo() { const MachO::dyld_info_command &DyLdInfoCommand = O.LoadCommands[*O.DyLdInfoCommandIndex] .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.bind_off; + char *Out = Buf->getBufferStart() + DyLdInfoCommand.bind_off; assert((DyLdInfoCommand.bind_size == O.Binds.Opcodes.size()) && "Incorrect bind opcodes size"); memcpy(Out, O.Binds.Opcodes.data(), O.Binds.Opcodes.size()); @@ -343,7 +343,7 @@ void MachOWriter::writeWeakBindInfo() { const MachO::dyld_info_command &DyLdInfoCommand = O.LoadCommands[*O.DyLdInfoCommandIndex] .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.weak_bind_off; + char *Out = Buf->getBufferStart() + DyLdInfoCommand.weak_bind_off; assert((DyLdInfoCommand.weak_bind_size == O.WeakBinds.Opcodes.size()) && "Incorrect weak bind opcodes size"); memcpy(Out, O.WeakBinds.Opcodes.data(), O.WeakBinds.Opcodes.size()); @@ -355,7 +355,7 @@ void MachOWriter::writeLazyBindInfo() { const MachO::dyld_info_command &DyLdInfoCommand = O.LoadCommands[*O.DyLdInfoCommandIndex] .MachOLoadCommand.dyld_info_command_data; - char *Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.lazy_bind_off; + char *Out = Buf->getBufferStart() + DyLdInfoCommand.lazy_bind_off; assert((DyLdInfoCommand.lazy_bind_size == O.LazyBinds.Opcodes.size()) && "Incorrect lazy bind opcodes size"); memcpy(Out, O.LazyBinds.Opcodes.data(), O.LazyBinds.Opcodes.size()); @@ -367,7 +367,7 @@ void MachOWriter::writeExportInfo() { const MachO::dyld_info_command &DyLdInfoCommand = O.LoadCommands[*O.DyLdInfoCommandIndex] .MachOLoadCommand.dyld_info_command_data; - char 
*Out = (char *)Buf->getBufferStart() + DyLdInfoCommand.export_off; + char *Out = Buf->getBufferStart() + DyLdInfoCommand.export_off; assert((DyLdInfoCommand.export_size == O.Exports.Trie.size()) && "Incorrect export trie size"); memcpy(Out, O.Exports.Trie.data(), O.Exports.Trie.size()); @@ -397,7 +397,7 @@ void MachOWriter::writeLinkData(std::optional<size_t> LCIndex, return; const MachO::linkedit_data_command &LinkEditDataCommand = O.LoadCommands[*LCIndex].MachOLoadCommand.linkedit_data_command_data; - char *Out = (char *)Buf->getBufferStart() + LinkEditDataCommand.dataoff; + char *Out = Buf->getBufferStart() + LinkEditDataCommand.dataoff; assert((LinkEditDataCommand.datasize == LD.Data.size()) && "Incorrect data size"); memcpy(Out, LD.Data.data(), LD.Data.size()); @@ -574,7 +574,7 @@ void MachOWriter::writeExportsTrieData() { const MachO::linkedit_data_command &ExportsTrieCmd = O.LoadCommands[*O.ExportsTrieCommandIndex] .MachOLoadCommand.linkedit_data_command_data; - char *Out = (char *)Buf->getBufferStart() + ExportsTrieCmd.dataoff; + char *Out = Buf->getBufferStart() + ExportsTrieCmd.dataoff; assert((ExportsTrieCmd.datasize == O.Exports.Trie.size()) && "Incorrect export trie size"); memcpy(Out, O.Exports.Trie.data(), O.Exports.Trie.size()); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 5db2642..e09dc94 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -3115,7 +3115,7 @@ void ExportEntry::pushNode(uint64_t offset) { } State.ChildCount = *Children; if (State.ChildCount != 0 && Children + 1 >= Trie.end()) { - *E = malformedError("byte for count of childern in export trie data at " + *E = malformedError("byte for count of children in export trie data at " "node: 0x" + Twine::utohexstr(offset) + " extends past end of trie data"); @@ -3157,7 +3157,7 @@ void ExportEntry::pushDownUntilBottom() { } for (const NodeState &node : nodes()) { if (node.Start == Trie.begin() + childNodeIndex){ - *E = malformedError("loop in childern in export trie data at node: 0x" + + *E = malformedError("loop in children in export trie data at node: 0x" + Twine::utohexstr(Top.Start - Trie.begin()) + " back to node: 0x" + Twine::utohexstr(childNodeIndex)); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 46084c5..3d688a1 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -4949,6 +4949,21 @@ DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) { return *this; } +// Returns a result such that: +// 1. abs(Lo) <= ulp(Hi)/2 +// 2. Hi == RTNE(Hi + Lo) +// 3. Hi + Lo == X + Y +// +// Requires that log2(X) >= log2(Y). +static std::pair<APFloat, APFloat> fastTwoSum(APFloat X, APFloat Y) { + if (!X.isFinite()) + return {X, APFloat::getZero(X.getSemantics(), /*Negative=*/false)}; + APFloat Hi = X + Y; + APFloat Delta = Hi - X; + APFloat Lo = Y - Delta; + return {Hi, Lo}; +} + // Implement addition, subtraction, multiplication and division based on: // "Software for Doubled-Precision Floating-Point Computations", // by Seppo Linnainmaa, ACM TOMS vol 7 no 3, September 1981, pages 272-283. 
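The new fastTwoSum helper is the classic error-free transformation: when |X| >= |Y|, the rounding error of Hi = X + Y is itself representable, and the two subtractions recover it exactly as Lo, so Hi + Lo == X + Y. A standalone sketch with plain doubles rather than APFloat:

#include <cstdio>

// Fast2Sum: requires |X| >= |Y| (or X == 0). Hi is the rounded sum, Lo the
// exact rounding error, so Hi + Lo == X + Y with no loss.
static void fastTwoSum(double X, double Y, double &Hi, double &Lo) {
  Hi = X + Y;
  double Delta = Hi - X;  // the part of Y that made it into Hi
  Lo = Y - Delta;         // the part of Y that was rounded away
}

int main() {
  double Hi, Lo;
  fastTwoSum(1.0, 1e-20, Hi, Lo);  // 1e-20 is far below one ulp of 1.0
  std::printf("Hi=%.17g Lo=%.17g\n", Hi, Lo);  // Hi is 1, Lo recovers Y exactly
}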
@@ -5218,10 +5233,78 @@ DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand, APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) { assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = Tmp.roundToIntegral(RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); - return Ret; + const APFloat &Hi = getFirst(); + const APFloat &Lo = getSecond(); + + APFloat RoundedHi = Hi; + const opStatus HiStatus = RoundedHi.roundToIntegral(RM); + + // We can reduce the problem to just the high part if the input: + // 1. Represents a non-finite value. + // 2. Has a component which is zero. + if (!Hi.isFiniteNonZero() || Lo.isZero()) { + Floats[0] = std::move(RoundedHi); + Floats[1].makeZero(/*Neg=*/false); + return HiStatus; + } + + // Adjust `Rounded` in the direction of `TieBreaker` if `ToRound` was at a + // halfway point. + auto RoundToNearestHelper = [](APFloat ToRound, APFloat Rounded, + APFloat TieBreaker) { + // RoundingError tells us which direction we rounded: + // - RoundingError > 0: we rounded up. + // - RoundingError < 0: we rounded down. + // Sterbenz' lemma ensures that RoundingError is exact. + const APFloat RoundingError = Rounded - ToRound; + if (TieBreaker.isNonZero() && + TieBreaker.isNegative() != RoundingError.isNegative() && + abs(RoundingError).isExactlyValue(0.5)) + Rounded.add( + APFloat::getOne(Rounded.getSemantics(), TieBreaker.isNegative()), + rmNearestTiesToEven); + return Rounded; + }; + + // Case 1: Hi is not an integer. + // Special cases are for rounding modes that are sensitive to ties. + if (RoundedHi != Hi) { + // We need to consider the case where Hi was between two integers and the + // rounding mode broke the tie when, in fact, Lo may have had a different + // sign than Hi. + if (RM == rmNearestTiesToAway || RM == rmNearestTiesToEven) + RoundedHi = RoundToNearestHelper(Hi, RoundedHi, Lo); + + Floats[0] = std::move(RoundedHi); + Floats[1].makeZero(/*Neg=*/false); + return HiStatus; + } + + // Case 2: Hi is an integer. + // Special cases are for rounding modes which are rounding towards or away from zero. + RoundingMode LoRoundingMode; + if (RM == rmTowardZero) + // When our input is positive, we want the Lo component rounded toward + // negative infinity to get the smallest result magnitude. Likewise, + // negative inputs want the Lo component rounded toward positive infinity. + LoRoundingMode = isNegative() ? rmTowardPositive : rmTowardNegative; + else + LoRoundingMode = RM; + + APFloat RoundedLo = Lo; + const opStatus LoStatus = RoundedLo.roundToIntegral(LoRoundingMode); + if (LoRoundingMode == rmNearestTiesToAway) + // We need to consider the case where Lo was between two integers and the + // rounding mode broke the tie when, in fact, Hi may have had a different + // sign than Lo. + RoundedLo = RoundToNearestHelper(Lo, RoundedLo, Hi); + + // We must ensure that the final result has no overlap between the two APFloat values. 
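RoundToNearestHelper above handles the case where the high half sits exactly on a rounding tie while the low half says the full double-double value is actually off the tie. A standalone illustration with plain doubles (std::nearbyint in the default ties-to-even mode stands in for roundToIntegral):

#include <cmath>
#include <cstdio>

int main() {
  double Hi = 2.5, Lo = 1e-20;            // full value is just above 2.5
  double RoundedHi = std::nearbyint(Hi);  // ties-to-even picks 2.0
  double RoundingError = RoundedHi - Hi;  // -0.5, so the tie broke downward
  // Lo is nonzero, has the opposite sign of the rounding error, and the error
  // magnitude is exactly 0.5: the true value was never on the tie, so nudge
  // the result one unit toward Lo.
  if (Lo != 0.0 && std::signbit(Lo) != std::signbit(RoundingError) &&
      std::fabs(RoundingError) == 0.5)
    RoundedHi += std::signbit(Lo) ? -1.0 : 1.0;
  std::printf("round(2.5 + 1e-20) = %g\n", RoundedHi);  // 3, not 2
}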
+ std::tie(RoundedHi, RoundedLo) = fastTwoSum(RoundedHi, RoundedLo); + + Floats[0] = std::move(RoundedHi); + Floats[1] = std::move(RoundedLo); + return LoStatus; } void DoubleAPFloat::changeSign() { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 018c16d..a40de86b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8537,7 +8537,7 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, if (IsCalleeWin64) { UseVarArgCC = true; } else { - UseVarArgCC = !Outs[i].IsFixed; + UseVarArgCC = ArgFlags.isVarArg(); } } @@ -8982,7 +8982,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) + if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); } @@ -11390,13 +11390,18 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc // The second forms can be matched into subs+cneg. + // NOTE: Drop poison generating flags from the negated operand to avoid + // inadvertently propagating poison after the canonicalisation. if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && - FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) { + TVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags); FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); - else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && - FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) { + FVal->dropFlags(SDNodeFlags::PoisonGeneratingFlags); TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } } unsigned Opcode = AArch64ISD::CSEL; @@ -13477,7 +13482,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // Look for the first non-undef element. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); - // Benefit form APInt to handle overflow when calculating expected element. + // Benefit from APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false, @@ -13485,7 +13490,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // The following shuffle indices must be the successive elements after the // first real element. 
bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { - return Elt != ExpectedElt++ && Elt != -1; + return Elt != ExpectedElt++ && Elt >= 0; }); if (FoundWrongElt) return false; @@ -15772,6 +15777,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { isREVMask(M, EltSize, NumElts, 32) || isREVMask(M, EltSize, NumElts, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || + isSingletonEXTMask(M, VT, DummyUnsigned) || isTRNMask(M, NumElts, DummyUnsigned) || isUZPMask(M, NumElts, DummyUnsigned) || isZIPMask(M, NumElts, DummyUnsigned) || @@ -16284,9 +16290,8 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size); if (Align) - SP = - DAG.getNode(ISD::AND, DL, VT, SP.getValue(0), - DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT)); + SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0), + DAG.getSignedConstant(-Align->value(), DL, VT)); Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, DL); @@ -16323,7 +16328,7 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0), - DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT)); + DAG.getSignedConstant(-Align->value(), DL, VT)); Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL); @@ -16351,7 +16356,7 @@ AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0), - DAG.getSignedConstant(-(uint64_t)Align->value(), DL, VT)); + DAG.getSignedConstant(-Align->value(), DL, VT)); // Set the real SP to the new value with a probing loop. Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP); @@ -25450,6 +25455,29 @@ static SDValue performCSELCombine(SDNode *N, } } + // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't + // use overflow flags, to avoid the comparison with zero. In case of success, + // this also replaces the original SUB(x,y) with the newly created SUBS(x,y). + // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB + // nodes with their SUBS equivalent as is already done for other flag-setting + // operators, in which case doing the replacement here becomes redundant. 
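The new CSEL combine above restricts itself to EQ/NE/MI/PL because those condition codes read only the N and Z flags, and SUBS(x, y) sets N and Z identically to SUBS(x - y, 0): both are derived from the same wrapped difference, so removing the comparison with zero cannot change which value the CSEL selects (only C and V can differ). A quick standalone check (plain C++, flags modelled by hand):

#include <cstdint>
#include <cstdio>

// N and Z as a SUBS-style instruction would set them for Lhs - Rhs.
static void nzFlags(uint32_t Lhs, uint32_t Rhs, bool &N, bool &Z) {
  uint32_t Diff = Lhs - Rhs;  // wrapped difference, the same value SUB yields
  N = (Diff >> 31) != 0;
  Z = Diff == 0;
}

int main() {
  const uint32_t Samples[] = {0u, 1u, 5u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples) {
      bool N1, Z1, N2, Z2;
      nzFlags(X, Y, N1, Z1);      // SUBS x, y
      nzFlags(X - Y, 0, N2, Z2);  // SUBS (sub x, y), 0
      if (N1 != N2 || Z1 != Z2)
        std::printf("mismatch at %u, %u\n", X, Y);
    }
  std::printf("N/Z agree for all sampled pairs\n");
}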
+ if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) && + isNullConstant(Cond.getOperand(1))) { + SDValue Sub = Cond.getOperand(0); + AArch64CC::CondCode CC = + static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2)); + if (Sub.getOpcode() == ISD::SUB && + (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI || + CC == AArch64CC::PL)) { + SDLoc DL(N); + SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(), + Sub.getOperand(0), Sub.getOperand(1)); + DCI.CombineTo(Sub.getNode(), Subs); + DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1)); + return SDValue(N, 0); + } + } + // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z if (SDValue CondLast = foldCSELofLASTB(N, DAG)) return CondLast; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 5a537f2..d068a12 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12564,7 +12564,7 @@ multiclass STOPregister<string asm, string instr> { let Predicates = [HasLSUI] in class BaseSTOPregisterLSUI<string asm, RegisterClass OP, Register Reg, Instruction inst> : - InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn), 0>; + InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>; multiclass STOPregisterLSUI<string asm, string instr> { def : BaseSTOPregisterLSUI<asm # "l", GPR32, WZR, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 59d4fd2..fb59c9f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5861,33 +5861,41 @@ void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( } } -// Convenience function to create a DWARF expression for -// Expr + NumBytes + NumVGScaledBytes * AArch64::VG -static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, - int NumVGScaledBytes, unsigned VG, - llvm::raw_string_ostream &Comment) { - uint8_t buffer[16]; - - if (NumBytes) { +// Convenience function to create a DWARF expression for: Constant `Operation`. +// This helper emits compact sequences for common cases. For example, for`-15 +// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus. +static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant, + dwarf::LocationAtom Operation) { + if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) { + // -Constant (1 to 31) + Expr.push_back(dwarf::DW_OP_lit0 - Constant); + Operation = dwarf::DW_OP_minus; + } else if (Constant >= 0 && Constant <= 31) { + // Literal value 0 to 31 + Expr.push_back(dwarf::DW_OP_lit0 + Constant); + } else { + // Signed constant Expr.push_back(dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); - Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + appendLEB128<LEB128Sign::Signed>(Expr, Constant); } + return Expr.push_back(Operation); +} - if (NumVGScaledBytes) { - Expr.push_back((uint8_t)dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); - - Expr.push_back((uint8_t)dwarf::DW_OP_bregx); - Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); - Expr.push_back(0); - - Expr.push_back((uint8_t)dwarf::DW_OP_mul); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); +// Convenience function to create a DWARF expression for a register. 
+static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) { + Expr.push_back((char)dwarf::DW_OP_bregx); + appendLEB128<LEB128Sign::Unsigned>(Expr, RegNum); + Expr.push_back(0); +} - Comment << (NumVGScaledBytes < 0 ? " - " : " + ") - << std::abs(NumVGScaledBytes) << " * VG"; +// Convenience function to create a comment for +// (+/-) NumBytes (* RegScale)? +static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, + StringRef RegScale = {}) { + if (NumBytes) { + Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + if (!RegScale.empty()) + Comment << ' ' << RegScale; } } @@ -5909,19 +5917,26 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, else Comment << printReg(Reg, &TRI); - // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) + // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes) SmallString<64> Expr; unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); - Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); - Expr.push_back(0); - appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); + assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); + // Reg + NumBytes + Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg); + appendLEB128<LEB128Sign::Signed>(Expr, NumBytes); + appendOffsetComment(NumBytes, Comment); + if (NumVGScaledBytes) { + // + VG * NumVGScaledBytes + appendOffsetComment(NumVGScaledBytes, Comment, "* VG"); + appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true)); + appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul); + Expr.push_back(dwarf::DW_OP_plus); + } // Wrap this into DW_CFA_def_cfa. SmallString<64> DefCfaExpr; DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); - uint8_t buffer[16]; - DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); + appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size()); DefCfaExpr.append(Expr.str()); return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), Comment.str()); @@ -5958,17 +5973,25 @@ MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, llvm::raw_string_ostream Comment(CommentBuffer); Comment << printReg(Reg, &TRI) << " @ cfa"; - // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) + // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes) + assert(NumVGScaledBytes && "Expected scalable offset"); SmallString<64> OffsetExpr; - appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); + // + VG * NumVGScaledBytes + appendOffsetComment(NumVGScaledBytes, Comment, "* VG"); + appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true)); + appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul); + OffsetExpr.push_back(dwarf::DW_OP_plus); + if (NumBytes) { + // + NumBytes + appendOffsetComment(NumBytes, Comment); + appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus); + } // Wrap this into DW_CFA_expression SmallString<64> CfaExpr; CfaExpr.push_back(dwarf::DW_CFA_expression); - uint8_t buffer[16]; - CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); - CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); + appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg); + appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size()); CfaExpr.append(OffsetExpr.str()); return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 
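The rewritten CFI helpers above hand-assemble raw DWARF expression bytes, so the encoding details are the whole point: operands are LEB128-encoded, and constants in [0, 31] (or [-31, -1] folded into a following DW_OP_minus) fit in a single DW_OP_lit byte, which is what appendConstantExpr exploits. A standalone sketch of those pieces (plain C++; the DW_OP_* values come from the DWARF 5 specification, the helpers are illustrative rather than the LLVM API, and 46 is assumed as the AArch64 DWARF number for VG):

#include <cstdint>
#include <cstdio>
#include <vector>

// DWARF opcode values (DWARF 5, section 7.7.1).
enum : uint8_t {
  DW_OP_consts = 0x11, DW_OP_minus = 0x1c, DW_OP_mul = 0x1e,
  DW_OP_plus = 0x22, DW_OP_lit0 = 0x30, DW_OP_bregx = 0x92
};

static void appendULEB128(std::vector<uint8_t> &Out, uint64_t V) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V)
      Byte |= 0x80;  // more bytes follow
    Out.push_back(Byte);
  } while (V);
}

static void appendSLEB128(std::vector<uint8_t> &Out, int64_t V) {
  bool More = true;
  while (More) {
    uint8_t Byte = V & 0x7f;
    V >>= 7;  // arithmetic shift keeps the sign
    if ((V == 0 && !(Byte & 0x40)) || (V == -1 && (Byte & 0x40)))
      More = false;
    else
      Byte |= 0x80;
    Out.push_back(Byte);
  }
}

// Compact "Constant Operation" emission, mirroring appendConstantExpr: small
// magnitudes become one DW_OP_lit byte, -1..-31 with DW_OP_plus become
// "litN, DW_OP_minus", and everything else falls back to DW_OP_consts.
static void appendConstantOp(std::vector<uint8_t> &Expr, int64_t C, uint8_t Op) {
  if (Op == DW_OP_plus && C < 0 && -C <= 31) {
    Expr.push_back(uint8_t(DW_OP_lit0 - C));
    Op = DW_OP_minus;
  } else if (C >= 0 && C <= 31) {
    Expr.push_back(uint8_t(DW_OP_lit0 + C));
  } else {
    Expr.push_back(DW_OP_consts);
    appendSLEB128(Expr, C);
  }
  Expr.push_back(Op);
}

int main() {
  std::vector<uint8_t> Expr;
  appendConstantOp(Expr, -15, DW_OP_plus);  // DW_OP_lit15, DW_OP_minus
  appendConstantOp(Expr, 8, DW_OP_mul);     // DW_OP_lit8, DW_OP_mul
  appendConstantOp(Expr, 200, DW_OP_plus);  // DW_OP_consts, sleb(200), DW_OP_plus
  // Register read: DW_OP_bregx takes a ULEB128 register number and an SLEB128
  // offset, as in appendReadRegExpr.
  Expr.push_back(DW_OP_bregx);
  appendULEB128(Expr, 46);                  // assumed DWARF number for VG
  appendSLEB128(Expr, 0);
  std::printf("%zu expression bytes\n", Expr.size());
}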
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index b97d622..fd4ef2a 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri -// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri +// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri +// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ADDXri + ADDXri @@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { // Strategy used to split logical immediate bitmasks. enum class SplitStrategy { Intersect, + Disjoint, }; template <typename T> bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, @@ -163,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); + assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!"); // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -191,18 +193,47 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> +static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, + T &Imm2Enc) { + assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!"); + + // Try to split a bitmask of the form 0b00000000011000000000011110000000 into + // two disjoint masks such as 0b00000000011000000000000000000000 and + // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the + // new masks match the original mask. + unsigned LowestBitSet = llvm::countr_zero(Imm); + unsigned LowestGapBitUnset = + LowestBitSet + llvm::countr_one(Imm >> LowestBitSet); + + // Create a mask for the least significant group of consecutive ones. + assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!"); + T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) - + (static_cast<T>(1) << LowestBitSet); + // Create a disjoint mask for the remaining ones. + T NewImm2 = Imm & ~NewImm1; + + // Do not split if NewImm2 is not a valid bitmask immediate. + if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) + return false; + + Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize); + Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize); + return true; +} + +template <typename T> bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, SplitStrategy Strategy, unsigned OtherOpc) { - // Try below transformation. + // Try below transformations. // - // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri - // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri + // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri + // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. Let's try to split the constant operand of mov instruction into two - // bitmask immediates. It makes only two AND instructions instead of multiple - // mov + and instructions. + // bitmask immediates based on the given split strategy. 
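splitDisjointBitmaskImm above is easier to follow on the concrete value from its comment: peel off the lowest run of contiguous ones, and whatever remains is the second, disjoint mask. A standalone sketch of just that mask arithmetic (plain C++20 <bit>; the real pass additionally requires both halves to encode as AArch64 logical immediates, which is not checked here):

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Imm = 0b00000000011000000000011110000000;
  unsigned LowestBitSet = std::countr_zero(Imm);             // 7
  unsigned LowestGapBitUnset =
      LowestBitSet + std::countr_one(Imm >> LowestBitSet);   // 11
  // Mask covering just the lowest run of ones...
  uint64_t Imm1 =
      (uint64_t(1) << LowestGapBitUnset) - (uint64_t(1) << LowestBitSet);
  // ...and a disjoint mask covering the rest.
  uint64_t Imm2 = Imm & ~Imm1;
  std::printf("imm1=%#llx imm2=%#llx disjoint=%d recombine=%d\n",
              (unsigned long long)Imm1, (unsigned long long)Imm2,
              (Imm1 & Imm2) == 0, (Imm1 | Imm2) == Imm);
  // For disjoint masks, Imm1 | Imm2 == Imm and Imm1 ^ Imm2 == Imm, so
  // (x | Imm1) | Imm2 == x | Imm and (x ^ Imm1) ^ Imm2 == x ^ Imm, which is
  // why two ORR/EOR-immediate instructions can replace the mov plus the
  // register form.
}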
It makes only two + // logical instructions instead of multiple mov + logic instructions. return splitTwoPartImm<T>( MI, @@ -224,6 +255,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, case SplitStrategy::Intersect: SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); break; + case SplitStrategy::Disjoint: + SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; } if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); @@ -889,6 +923,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= trySplitLogicalImm<uint64_t>( AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; + case AArch64::EORWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::EORXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRWrr: + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI, + SplitStrategy::Disjoint); + break; + case AArch64::ORRXrr: + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI, + SplitStrategy::Disjoint); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index adc984a..1bc1d98 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -22,7 +22,8 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ @@ -45,7 +46,8 @@ def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", @@ -53,7 +55,8 @@ def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureUseWzrToVecMove]>; + FeatureUseWzrToVecMove, + FeatureUseFixedOverScalableIfEqualCost]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ @@ -756,7 +759,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -766,7 +768,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd, FeatureFPAC]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 010d0aaa..2155ace 100644 --- 
a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -125,7 +125,7 @@ struct AArch64OutgoingValueAssigner bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg(); bool Res; - if (Info.IsFixed && !UseVarArgsCCForFixed) { + if (!Flags.isVarArg() && !UseVarArgsCCForFixed) { if (!IsReturn) applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT); Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); @@ -361,7 +361,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { unsigned MaxSize = MemTy.getSizeInBytes() * 8; // For varargs, we always want to extend them to 8 bytes, in which case // we disable setting a max. - if (!Arg.IsFixed) + if (Arg.Flags[0].isVarArg()) MaxSize = 0; Register ValVReg = Arg.Regs[RegIndex]; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index d905692..f359731 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1697,7 +1697,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, Pred); AArch64CC::CondCode CC1, CC2; - changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); + changeFCMPPredToAArch64CC(Pred, CC1, CC2); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); if (CC2 != AArch64CC::AL) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d84f512..f266398 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1013,6 +1013,14 @@ def FeatureAgentScopeFineGrainedRemoteMemoryAtomics "device memory." >; +def FeatureEmulatedSystemScopeAtomics + : SubtargetFeature<"emulated-system-scope-atomics", + "HasEmulatedSystemScopeAtomics", + "true", + "System scope atomics unsupported by the PCI-e are emulated in HW via CAS " + "loop and functional." +>; + def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", "HasDefaultComponentZero", "true", @@ -2062,6 +2070,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureAtomicFMinFMaxF64FlatInsts, FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureEmulatedSystemScopeAtomics, FeatureGloballyAddressableScratch, FeatureKernargPreload, FeatureVmemPrefInsts, @@ -2603,6 +2612,10 @@ def HasPkMinMax3Insts : Predicate<"Subtarget->hasPkMinMax3Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasSGetShaderCyclesInst : + Predicate<"Subtarget->hasSGetShaderCyclesInst()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 6681393..2a324e5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -486,12 +486,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Pad with s_code_end to help tools and guard against instruction prefetch // causing stale data in caches. Arguably this should be done by the linker, // which is why this isn't done for Mesa. + // Don't do it if there is no code. 
const MCSubtargetInfo &STI = *getGlobalSTI(); if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { - OutStreamer->switchSection(getObjFileLowering().getTextSection()); - getTargetStreamer()->EmitCodeEnd(STI); + MCSection *TextSect = getObjFileLowering().getTextSection(); + if (TextSect->hasInstructions()) { + OutStreamer->switchSection(TextSect); + getTargetStreamer()->EmitCodeEnd(STI); + } } // Assign expressions which can only be resolved when all other functions are diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a0c99b0..846a0b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { return true; if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - if (Intrinsic->getIntrinsicID() == Intrinsic::read_register) + Intrinsic::ID IID = Intrinsic->getIntrinsicID(); + switch (IID) { + case Intrinsic::read_register: return isReadRegisterSourceOfDivergence(Intrinsic); - - return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); + case Intrinsic::amdgcn_addrspacecast_nonnull: { + unsigned SrcAS = + Intrinsic->getOperand(0)->getType()->getPointerAddressSpace(); + unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace(); + return SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + DstAS == AMDGPUAS::FLAT_ADDRESS && + ST->hasGloballyAddressableScratch(); + } + default: + return AMDGPU::isIntrinsicSourceOfDivergence(IID); + } } // Assume all function calls are a source of divergence. @@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (isa<InvokeInst>(V)) return true; + // If the target supports globally addressable scratch, the mapping from + // scratch memory to the flat aperture changes therefore an address space cast + // is no longer uniform. + if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) { + return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS && + ST->hasGloballyAddressableScratch(); + } + return false; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5530886..f47ddf5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -187,6 +187,7 @@ protected: bool HasFlatBufferGlobalAtomicFaddF64Inst = false; bool HasDefaultComponentZero = false; bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false; + bool HasEmulatedSystemScopeAtomics = false; bool HasDefaultComponentBroadcast = false; bool HasXF32Insts = false; /// The maximum number of instructions that may be placed within an S_CLAUSE, @@ -950,6 +951,12 @@ public: return HasAgentScopeFineGrainedRemoteMemoryAtomics; } + /// \return true is HW emulates system scope atomics unsupported by the PCI-e + /// via CAS loop. 
+ bool hasEmulatedSystemScopeAtomics() const { + return HasEmulatedSystemScopeAtomics; + } + bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } bool hasDefaultComponentBroadcast() const { @@ -1081,7 +1088,7 @@ public: } bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } - bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } + bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } @@ -1555,12 +1562,16 @@ public: // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. bool hasPkMinMax3Insts() const { return GFX1250Insts; } + // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction. + bool hasSGetShaderCyclesInst() const { return GFX1250Insts; } + // \returns true if target has S_SETPRIO_INC_WG instruction. bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead - // of sign-extending. - bool hasGetPCZeroExtension() const { return GFX12Insts; } + // of sign-extending. Note that GFX1250 has not only fixed the bug but also + // extended VA to 57 bits. + bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index deadb7a..2d0102f 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -536,6 +536,10 @@ enum Id { // HwRegCode, (6) [5:0] ID_SQ_PERF_SNAPSHOT_DATA1 = 22, ID_SQ_PERF_SNAPSHOT_PC_LO = 23, ID_SQ_PERF_SNAPSHOT_PC_HI = 24, + + // GFX1250 + ID_XNACK_STATE_PRIV = 33, + ID_XNACK_MASK_gfx1250 = 34, }; enum Offset : unsigned { // Offset, (5) [10:6] diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 63826b7..8f44c03 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17695,6 +17695,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && RMW->hasMetadata("amdgpu.no.remote.memory")) return true; + if (Subtarget.hasEmulatedSystemScopeAtomics()) + return true; } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) return true; @@ -17942,8 +17944,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::UMax: { if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // Always expand system scope min/max atomics. - if (HasSystemScope) + if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) return AtomicExpansionKind::CmpXChg; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3f61bbd..f20b22d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6122,10 +6122,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, !Op.isIdenticalTo(*MO)) return false; - // Do not fold a frame index into an instruction that already has a frame - // index. The frame index handling code doesn't handle fixing up operand - // constraints if there are multiple indexes. 
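For context on the atomic changes above: AtomicExpansionKind::CmpXChg lowers an atomicrmw to a compare-exchange retry loop, and the new FeatureEmulatedSystemScopeAtomics description says the hardware performs an equivalent CAS loop itself, which is why shouldExpandAtomicRMWInIR can now skip the software expansion of system-scope min/max on that subtarget. A minimal standalone sketch of what such an expansion computes (plain C++ std::atomic, not the AMDGPU lowering):

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>

// fetch_max via a compare-and-swap retry loop, the shape of the
// AtomicExpansionKind::CmpXChg expansion for atomicrmw max.
static uint32_t atomicMax(std::atomic<uint32_t> &A, uint32_t V) {
  uint32_t Old = A.load();
  while (!A.compare_exchange_weak(Old, std::max(Old, V)))
    ;  // Old was refreshed with the current value; recompute max and retry.
  return Old;  // value observed before the update
}

int main() {
  std::atomic<uint32_t> A{10};
  uint32_t Prev = atomicMax(A, 42);
  std::printf("prev=%u now=%u\n", Prev, A.load());  // prev=10 now=42
}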
- if (Op.isFI() && MO->isFI()) + // Do not fold a non-inlineable and non-register operand into an + // instruction that already has a frame index. The frame index handling + // code could not handle well when a frame index co-exists with another + // non-register operand, unless that operand is an inlineable immediate. + if (Op.isFI()) return false; } } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && @@ -10073,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); unsigned opcode = MI.getOpcode(); + + auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg() + : MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + unsigned DstAS = DstTy.getAddressSpace(); + unsigned SrcAS = SrcTy.getAddressSpace(); + return SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + DstAS == AMDGPUAS::FLAT_ADDRESS && + ST.hasGloballyAddressableScratch() + ? InstructionUniformity::NeverUniform + : InstructionUniformity::Default; + }; + + // If the target supports globally addressable scratch, the mapping from + // scratch memory to the flat aperture changes therefore an address space cast + // is no longer uniform. + if (opcode == TargetOpcode::G_ADDRSPACE_CAST) + return HandleAddrSpaceCast(MI); + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { auto IID = GI->getIntrinsicID(); if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) @@ -10082,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::AlwaysUniform; switch (IID) { + case Intrinsic::amdgcn_addrspacecast_nonnull: + return HandleAddrSpaceCast(MI); case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: // FIXME: Uniform if second result diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 8303410..431d73b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1653,6 +1653,12 @@ def S_SETPRIO_INC_WG : SOPP_Pseudo <"s_setprio_inc_wg", (ins i16imm:$simm16), "$ let SubtargetPredicate = HasSetPrioIncWgInst; } +def S_GET_SHADER_CYCLES_U64 : SOP1_64_0 <"s_get_shader_cycles_u64", + [(set i64:$sdst, (readcyclecounter))]> { + let SubtargetPredicate = HasSGetShaderCyclesInst; + let hasSideEffects = 1; +} + let Uses = [EXEC, M0] in { def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { @@ -2145,6 +2151,7 @@ defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; // GFX1250 +defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>; defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index e433b85..3d9455f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -223,6 +223,10 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, + // GFX1250 + 
{{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, + // Aliases {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, }; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 74c7c97..ea99cc4 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20350,7 +20350,8 @@ static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) { if (PR == 0 || VT == MVT::Other) return false; return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) || - (ARM::DPRRegClass.contains(PR) && VT != MVT::f64); + (ARM::DPRRegClass.contains(PR) && VT != MVT::f64 && + !VT.is64BitVector()); } using RCPair = std::pair<unsigned, const TargetRegisterClass *>; @@ -20783,9 +20784,8 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = DAG.getNode( - ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getSignedConstant(-(uint64_t)Align->value(), DL, MVT::i32)); + SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getSignedConstant(-Align->value(), DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 3955f2a..25ad9ec 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, default: { // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows // us to fold the constant into the cmp instruction. - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT); CC = ISD::SETGE; break; } @@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to // fold the constant into the cmp instruction. if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { - RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT); + // Doing a "icmp ugt i16 65535, %0" comparison should have been converted + // already to something else. Assert to make sure this assumption holds. + assert((!C->isAllOnes()) && "integer overflow in comparison transform"); + RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT); CC = ISD::SETUGE; break; } diff --git a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp index 73abfe7..306db6a 100644 --- a/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp +++ b/llvm/lib/Target/DirectX/DXILForwardHandleAccesses.cpp @@ -87,17 +87,50 @@ static bool forwardHandleAccesses(Function &F, DominatorTree &DT) { for (LoadInst *LI : LoadsToProcess) { Value *V = LI->getPointerOperand(); - auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()); + auto *GV = dyn_cast<GlobalVariable>(V); // If we didn't find the global, we may need to walk through a level of // indirection. This generally happens at -O0. 
- if (!GV) + if (!GV) { if (auto *NestedLI = dyn_cast<LoadInst>(V)) { BasicBlock::iterator BBI(NestedLI); Value *Loaded = FindAvailableLoadedValue( NestedLI, NestedLI->getParent(), BBI, 0, nullptr, nullptr); GV = dyn_cast_or_null<GlobalVariable>(Loaded); + } else if (auto *NestedAlloca = dyn_cast<AllocaInst>(V)) { + for (auto &Use : NestedAlloca->uses()) { + auto *Store = dyn_cast<StoreInst>(Use.getUser()); + if (!Store) + continue; + + Value *StoredVal = Store->getValueOperand(); + if (!StoredVal) + continue; + + // Try direct global match + GV = dyn_cast<GlobalVariable>(StoredVal); + if (GV) + break; + + // If it's a load, check its source + if (auto *Load = dyn_cast<LoadInst>(StoredVal)) { + GV = dyn_cast<GlobalVariable>(Load->getPointerOperand()); + if (GV) + break; + + // If loading from an unmodified stack copy of the global, reuse the + // global's value. Note: we are just repeating what we are doing for + // the load case for the alloca store pattern. + BasicBlock::iterator BBI(Load); + Value *Loaded = FindAvailableLoadedValue(Load, Load->getParent(), + BBI, 0, nullptr, nullptr); + GV = dyn_cast<GlobalVariable>(Loaded); + if (GV) + break; + } + } } + } auto It = HandleMap.find(GV); if (It == HandleMap.end()) { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index a5bf0e5..6583a0f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6729,8 +6729,7 @@ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, - Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits(); assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen"); MVT GRLenVT = GRLen == 32 ? MVT::i32 : MVT::i64; @@ -6752,7 +6751,7 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, case LoongArchABI::ABI_LP64F: case LoongArchABI::ABI_ILP32D: case LoongArchABI::ABI_LP64D: - UseGPRForFloat = !IsFixed; + UseGPRForFloat = ArgFlags.isVarArg(); break; case LoongArchABI::ABI_ILP32S: case LoongArchABI::ABI_LP64S: @@ -6766,7 +6765,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, // will not be passed by registers if the original type is larger than // 2*GRLen, so the register alignment rule does not apply. unsigned TwoGRLenInBytes = (2 * GRLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && + if (ArgFlags.isVarArg() && + ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); // Skip 'odd' register if necessary. 
@@ -6916,7 +6916,7 @@ void LoongArchTargetLowering::analyzeInputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags, - CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) { + CCInfo, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(""); @@ -6934,7 +6934,7 @@ void LoongArchTargetLowering::analyzeOutputArgs( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags, - CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { + CCInfo, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(""); @@ -7647,8 +7647,7 @@ bool LoongArchTargetLowering::CanLowerReturn( LoongArchABI::ABI ABI = MF.getSubtarget<LoongArchSubtarget>().getTargetABI(); if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full, - Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, - nullptr)) + Outs[i].Flags, CCInfo, /*IsRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6b49a98f..f79ba74 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -330,7 +330,7 @@ private: unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index 9e8cd2e..13237c5 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -128,12 +128,10 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); } -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, - const char *Func) { +void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); - CallOperandIsFixed.push_back(IsFixed); } /// Identify lowered values that originated from f128, float and sret to vXfXX @@ -148,7 +146,6 @@ void MipsCCState::PreAnalyzeCallOperands( OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); - CallOperandIsFixed.push_back(Outs[i].IsFixed); } } diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 4229da5..30b68e8 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -36,7 +36,7 @@ public: static bool originalEVTTypeIsVectorFloat(EVT Ty); static bool originalTypeIsVectorFloat(const Type *Ty); - void PreAnalyzeCallOperand(const Type *ArgTy, bool IsFixed, const char *Func); + void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); void PreAnalyzeReturnValue(EVT ArgVT); @@ -86,10 +86,6 @@ private: /// vector. 
SmallVector<bool, 4> OriginalRetWasFloatVector; - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed, - SmallVector<bool, 4> CallOperandIsFixed; - // Used to handle MIPS16-specific calling convention tweaks. // FIXME: This should probably be a fully fledged calling convention. SpecialCallingConvType SpecialCallingConv; @@ -106,7 +102,6 @@ public: OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); OriginalArgWasFloatVector.clear(); - CallOperandIsFixed.clear(); PreAnalyzeCallOperands(Outs, FuncArgs, Func); } @@ -213,7 +208,6 @@ public: bool WasOriginalRetVectorFloat(unsigned ValNo) const { return OriginalRetWasFloatVector[ValNo]; } - bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; } SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; } }; } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index 555773a..fa49108 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -47,7 +47,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { if (IsReturn) State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); else - State.PreAnalyzeCallOperand(Info.Ty, Info.IsFixed, Func); + State.PreAnalyzeCallOperand(Info.Ty, Func); return CallLowering::OutgoingValueAssigner::assignArg( ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 39e184a..0e5c16c 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -29,12 +29,6 @@ class CCIfOrigArgWasFloat<CCAction A> class CCIfOrigArgWasF128<CCAction A> : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>; -/// Match if this specific argument is a vararg. -/// This is slightly different fro CCIfIsVarArg which matches if any argument is -/// a vararg. -class CCIfArgIsVarArg<CCAction A> - : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>; - /// Match if the return was a floating point vector. class CCIfOrigArgWasNotVectorFloat<CCAction A> : CCIf<"!static_cast<MipsCCState *>(&State)" @@ -344,7 +338,7 @@ def CC_Mips_VarArg : CallingConv<[ ]>; def CC_Mips : CallingConv<[ - CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>, + CCIfVarArg<CCIfArgVarArg<CCDelegateTo<CC_Mips_VarArg>>>, CCDelegateTo<CC_Mips_FixedArg> ]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 15f45a1..d4f0cc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -900,6 +900,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); + // Vector reduction operations. These may be turned into shuffle or tree + // reductions depending on what instructions are available for each type. + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + MVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM}, + VT, Custom); + } + } + // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. 
The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -1143,6 +1154,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) MAKE_CASE(NVPTXISD::FCOPYSIGN) + MAKE_CASE(NVPTXISD::FMAXNUM3) + MAKE_CASE(NVPTXISD::FMINNUM3) + MAKE_CASE(NVPTXISD::FMAXIMUM3) + MAKE_CASE(NVPTXISD::FMINIMUM3) MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) MAKE_CASE(NVPTXISD::STACKRESTORE) MAKE_CASE(NVPTXISD::STACKSAVE) @@ -1929,6 +1944,124 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); } +/// Reduces the elements using the scalar operations provided. The operations +/// are sorted descending in number of inputs they take. The flags on the +/// original reduction operation will be propagated to each scalar operation. +/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction +/// used in ExpandReductions and SelectionDAG. +static SDValue buildTreeReduction( + const SmallVector<SDValue> &Elements, EVT EltTy, + ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops, + const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) { + // Build the reduction tree at each level, starting with all the elements. + SmallVector<SDValue> Level = Elements; + + unsigned OpIdx = 0; + while (Level.size() > 1) { + // Try to reduce this level using the current operator. + const auto [Op, NumInputs] = Ops[OpIdx]; + + // Build the next level by partially reducing all elements. + SmallVector<SDValue> ReducedLevel; + unsigned I = 0, E = Level.size(); + for (; I + NumInputs <= E; I += NumInputs) { + // Reduce elements in groups of [NumInputs], as much as possible. + ReducedLevel.push_back(DAG.getNode( + Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags)); + } + + if (I < E) { + // Handle leftover elements. + + if (ReducedLevel.empty()) { + // We didn't reduce anything at this level. We need to pick a smaller + // operator. + ++OpIdx; + assert(OpIdx < Ops.size() && "no smaller operators for reduction"); + continue; + } + + // We reduced some things but there's still more left, meaning the + // operator's number of inputs doesn't evenly divide this level size. Move + // these elements to the next level. + for (; I < E; ++I) + ReducedLevel.push_back(Level[I]); + } + + // Process the next level. + Level = ReducedLevel; + } + + return *Level.begin(); +} + +// Get scalar reduction opcode +static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return ISD::FMAXNUM; + case ISD::VECREDUCE_FMIN: + return ISD::FMINNUM; + case ISD::VECREDUCE_FMAXIMUM: + return ISD::FMAXIMUM; + case ISD::VECREDUCE_FMINIMUM: + return ISD::FMINIMUM; + default: + llvm_unreachable("unhandled reduction opcode"); + } +} + +/// Get 3-input scalar reduction opcode +static std::optional<NVPTXISD::NodeType> +getScalar3OpcodeForReduction(unsigned ReductionOpcode) { + switch (ReductionOpcode) { + case ISD::VECREDUCE_FMAX: + return NVPTXISD::FMAXNUM3; + case ISD::VECREDUCE_FMIN: + return NVPTXISD::FMINNUM3; + case ISD::VECREDUCE_FMAXIMUM: + return NVPTXISD::FMAXIMUM3; + case ISD::VECREDUCE_FMINIMUM: + return NVPTXISD::FMINIMUM3; + default: + return std::nullopt; + } +} + +/// Lower reductions to either a sequence of operations or a tree if +/// reassociations are allowed. 
This method will use larger operations like +/// max3/min3 when the target supports them. +SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const SDNodeFlags Flags = Op->getFlags(); + SDValue Vector = Op.getOperand(0); + + const unsigned Opcode = Op->getOpcode(); + const EVT EltTy = Vector.getValueType().getVectorElementType(); + + // Whether we can use 3-input min/max when expanding the reduction. + const bool CanUseMinMax3 = + EltTy == MVT::f32 && STI.getSmVersion() >= 100 && + STI.getPTXVersion() >= 88 && + (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN || + Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM); + + // A list of SDNode opcodes with equivalent semantics, sorted descending by + // number of inputs they take. + SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps; + + if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode); + CanUseMinMax3 && Opcode3Elem) + ScalarOps.push_back({*Opcode3Elem, 3}); + ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2}); + + SmallVector<SDValue> Elements; + DAG.ExtractVectorElements(Vector, Elements); + + return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. @@ -2808,6 +2941,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: + return LowerVECREDUCE(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: @@ -3908,6 +4046,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } + case Intrinsic::nvvm_prefetch_tensormap: { + auto &DL = I.getDataLayout(); + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = getPointerTy(DL); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; + Info.align.reset(); + return true; + } + case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index cf72a1e..43e721a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -64,6 +64,11 @@ enum NodeType : unsigned { UNPACK_VECTOR, FCOPYSIGN, + FMAXNUM3, + FMINNUM3, + FMAXIMUM3, + FMINIMUM3, + DYNAMIC_STACKALLOC, STACKRESTORE, STACKSAVE, @@ -286,6 +291,7 @@ private: SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index aac611d..1ab41bf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -347,6 +347,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { Requires<[hasBF16Math, hasSM<80>, 
hasPTX<70>]>; } +// Template for 3-input minimum/maximum instructions +// (sm_100+/PTX 8.8 and f32 only) +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32 functions. +multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> { + defvar nan_str = !if(NaN, ".NaN", ""); + def f32rrr : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, B32:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rri : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rii : + BasicFlagsNVPTXInst<(outs B32:$dst), + (ins B32:$a, f32imm:$b, f32imm:$c), + (ins FTZFlag:$ftz), + OpcStr # "$ftz" # nan_str # ".f32", + [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; +} + // Template for instructions which take three FP args. The // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). // @@ -900,6 +930,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>; defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>; +def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; + +defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>; +defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>; +defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>; +defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>; + defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; defm FABS_H: F2_Support_Half<"abs", fabs>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d337192..d4a0ca7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -39,6 +39,12 @@ def AS_match { code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; + code const = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST); + }]; + code param = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM); + }]; } @@ -950,33 +956,47 @@ foreach dim = 3...5 in { defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", [hasTMACTAGroupSupport]>; -//Prefetch and Prefetchu - -let Predicates = [hasPTX<80>, hasSM<90>] in { - class PREFETCH_INTRS<string InstName> : - BasicNVPTXInst<(outs), (ins ADDR:$addr), - InstName, - [(!cast<Intrinsic>(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) addr:$addr)]>; +//Prefetchu and Prefetch - def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">; - def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">; - def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">; - def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; - def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; - def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; +defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr); - def 
PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_normal", - [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>; +multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> { + def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>; +} - def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), - "prefetch.global.L2::evict_last", - [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>; +defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>; - def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; +multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> { + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch" # addrspace_name # ".tensormap", + [(pattern_frag addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; } +defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>; +defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>; +defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>; + +class PREFETCH_INTRS<string InstName, Intrinsic Intr> : + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, + [(Intr addr:$addr)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>; +def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>; +def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>; +def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; + //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS<string addrspace> : BasicNVPTXInst<(outs), (ins ADDR:$addr, B64:$size), diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d..f4f8961 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + } } return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h 
b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 9a6e261..b32d931b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -87,6 +87,13 @@ public: } unsigned getMinVectorRegisterBitWidth() const override { return 32; } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + // Turn off ExpandReductions pass for NVPTX, which doesn't have advanced + // swizzling operations. Our backend/Selection DAG can expand these + // reductions with less movs. + return false; + } + // We don't want to prevent inlining because of target-cpu and -features // attributes that were added to newer versions of LLVM/Clang: There are // no incompatible functions in PTX, ptxas will throw errors in such cases. diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h index b0e50b2..feab9c5 100644 --- a/llvm/lib/Target/PowerPC/PPCCCState.h +++ b/llvm/lib/Target/PowerPC/PPCCCState.h @@ -38,36 +38,6 @@ public: void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); } }; -class AIXCCState : public CCState { -private: - BitVector IsFixed; - -public: - AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, - SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C) - : CCState(CC, IsVarArg, MF, Locs, C) {} - - void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn) { - // All formal arguments are fixed. - IsFixed.resize(Ins.size(), true); - CCState::AnalyzeFormalArguments(Ins, Fn); - } - - void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, - CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - IsFixed.resize(Outs.size(), false); - for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo) - if (Outs[ValNo].IsFixed) - IsFixed.set(ValNo); - - CCState::AnalyzeCallOperands(Outs, Fn); - } - - bool isFixed(unsigned ValNo) const { return IsFixed.test(ValNo); } -}; - } // end namespace llvm #endif diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 30b5fd6..2698bd6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3925,9 +3925,6 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget.isAIXABI()) - report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX."); - return Op.getOperand(0); } @@ -3984,9 +3981,6 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget.isAIXABI()) - report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX."); - SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -3994,6 +3988,65 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Subtarget.isAIXABI()) { + // On AIX we create a trampoline descriptor by combining the + // entry point and TOC from the global descriptor (FPtr) with the + // nest argument as the environment pointer. + uint64_t PointerSize = Subtarget.isPPC64() ? 8 : 4; + MaybeAlign PointerAlign(PointerSize); + auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() + ? 
(MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant) + : MachineMemOperand::MONone; + + uint64_t TOCPointerOffset = 1 * PointerSize; + uint64_t EnvPointerOffset = 2 * PointerSize; + SDValue SDTOCPtrOffset = DAG.getConstant(TOCPointerOffset, dl, PtrVT); + SDValue SDEnvPtrOffset = DAG.getConstant(EnvPointerOffset, dl, PtrVT); + + const Value *TrampolineAddr = + cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + + SDValue OutChains[3]; + + // Copy the entry point address from the global descriptor to the + // trampoline buffer. + SDValue LoadEntryPoint = + DAG.getLoad(PtrVT, dl, Chain, FPtr, MachinePointerInfo(Func, 0), + PointerAlign, MMOFlags); + SDValue EPLoadChain = LoadEntryPoint.getValue(1); + OutChains[0] = DAG.getStore(EPLoadChain, dl, LoadEntryPoint, Trmp, + MachinePointerInfo(TrampolineAddr, 0)); + + // Copy the TOC pointer from the global descriptor to the trampoline + // buffer. + SDValue TOCFromDescriptorPtr = + DAG.getNode(ISD::ADD, dl, PtrVT, FPtr, SDTOCPtrOffset); + SDValue TOCReg = DAG.getLoad(PtrVT, dl, Chain, TOCFromDescriptorPtr, + MachinePointerInfo(Func, TOCPointerOffset), + PointerAlign, MMOFlags); + SDValue TrampolineTOCPointer = + DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDTOCPtrOffset); + SDValue TOCLoadChain = TOCReg.getValue(1); + OutChains[1] = + DAG.getStore(TOCLoadChain, dl, TOCReg, TrampolineTOCPointer, + MachinePointerInfo(TrampolineAddr, TOCPointerOffset)); + + // Store the nest argument into the environment pointer in the trampoline + // buffer. + SDValue EnvPointer = DAG.getNode(ISD::ADD, dl, PtrVT, Trmp, SDEnvPtrOffset); + OutChains[2] = + DAG.getStore(Chain, dl, Nest, EnvPointer, + MachinePointerInfo(TrampolineAddr, EnvPointerOffset)); + + SDValue TokenFactor = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + return TokenFactor; + } + bool isPPC64 = (PtrVT == MVT::i64); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); @@ -6036,7 +6089,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; - if (Outs[i].IsFixed) { + if (!ArgFlags.isVarArg()) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { @@ -6852,8 +6905,7 @@ static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) { static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &S) { - AIXCCState &State = static_cast<AIXCCState &>(S); + CCState &State) { const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( State.getMachineFunction().getSubtarget()); const bool IsPPC64 = Subtarget.isPPC64(); @@ -6865,9 +6917,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, if (ValVT == MVT::f128) report_fatal_error("f128 is unimplemented on AIX."); - if (ArgFlags.isNest()) - report_fatal_error("Nest arguments are unimplemented."); - static const MCPhysReg GPR_32[] = {// 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10}; @@ -6882,6 +6931,14 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32; + if (ArgFlags.isNest()) { + MCRegister EnvReg = State.AllocateReg(IsPPC64 ? 
PPC::X11 : PPC::R11);
+    if (!EnvReg)
+      report_fatal_error("More than one nest argument.");
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, EnvReg, RegVT, LocInfo));
+    return false;
+  }
+
   if (ArgFlags.isByVal()) {
     const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
     if (ByValAlign > StackAlign)
@@ -7032,7 +7089,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
     // They are passed in VRs if any are available (unlike arguments passed
     // through ellipses) and shadow GPRs (unlike arguments to non-vaarg
     // functions)
-    if (State.isFixed(ValNo)) {
+    if (!ArgFlags.isVarArg()) {
       if (MCRegister VReg = State.AllocateReg(VR)) {
         State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
         // Shadow allocate GPRs and stack space even though we pass in a VR.
@@ -7220,7 +7277,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
-  AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   const EVT PtrVT = getPointerTy(MF.getDataLayout());

   // Reserve space for the linkage area on the stack.
@@ -7567,8 +7624,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
   MachineFunction &MF = DAG.getMachineFunction();

   SmallVector<CCValAssign, 16> ArgLocs;
-  AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
-                    *DAG.getContext());
+  CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+                 *DAG.getContext());

   // Reserve space for the linkage save area (LSA) on the stack.
   // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -9593,12 +9650,14 @@ static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
   return false;
 }

-bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
+bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
+                     bool IsLittleEndian) {
   assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");

   BitMask.clearAllBits();
   EVT VT = BVN.getValueType(0);
-  APInt ConstValue(VT.getSizeInBits(), 0);
+  unsigned VTSize = VT.getSizeInBits();
+  APInt ConstValue(VTSize, 0);

   unsigned EltWidth = VT.getScalarSizeInBits();

@@ -9608,8 +9667,10 @@ bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {

     if (!CN)
       return false;
-
-    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+    // The elements in a vector register are ordered in reverse byte order
+    // between little-endian and big-endian modes.
+    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth),
+                          IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
     BitPos += EltWidth;
   }

@@ -9640,7 +9701,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
       // we do not convert it to MTVSRBMI.
       // The xxleqv instruction sets a vector with all ones.
      // The xxlxor instruction sets a vector with all zeros.
- if (isValidMtVsrBmi(BitMask, *BVN) && BitMask != 0 && BitMask != 0xffff) { + if (isValidMtVsrBmi(BitMask, *BVN, Subtarget.isLittleEndian()) && + BitMask != 0 && BitMask != 0xffff) { SDValue SDConstant = DAG.getTargetConstant(BitMask, dl, MVT::i32); MachineSDNode *MSDNode = DAG.getMachineNode(PPC::MTVSRBMI, dl, MVT::v16i8, SDConstant); diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 1dc485d..98dd846 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2175,10 +2175,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { // - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC // ============================================================================= -class XXEvalPattern<dag pattern, bits<8> imm> - : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} - -class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm> +class XXEvalPattern<ValueType Vt, dag InputPattern, bits<8> Imm> : Pat<(Vt InputPattern), !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)), // VSRC path: direct XXEVAL for v4i32 and v2i64 @@ -2246,26 +2243,26 @@ def VEqv // ============================================================================= multiclass XXEvalTernarySelectAnd<ValueType Vt> { // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22 - def : XXEvalPatterns< + def : XXEvalPattern< Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 22>; // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24 - def : XXEvalPatterns< + def : XXEvalPattern< Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 24>; // Pattern: A ? EQV(B,C) : AND(B,C) XXEVAL immediate value: 25 - def : XXEvalPatterns< + def : XXEvalPattern< Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 25>; // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26 - def : XXEvalPatterns< + def : XXEvalPattern< Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>; // Pattern: A ? 
NOT(B) : AND(B,C) XXEVAL immediate value: 28 - def : XXEvalPatterns< + def : XXEvalPattern< Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; } @@ -2299,83 +2296,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // Anonymous patterns for XXEVAL // AND // and(A, B, C) - def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; // and(A, xor(B, C)) - def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; // and(A, or(B, C)) - def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; // and(A, nor(B, C)) - def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; // and(A, eqv(B, C)) - def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; // and(A, nand(B, C)) - def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; + def : XXEvalPattern<v4i32, (and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; // NAND // nand(A, B, C) - def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), !sub(255, 1)>; // nand(A, xor(B, C)) - def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), !sub(255, 6)>; // nand(A, or(B, C)) - def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<v4i32, (vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), !sub(255, 7)>; // nand(A, nor(B, C)) - def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), !sub(255, 8)>; // nand(A, eqv(B, C)) - def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), !sub(255, 9)>; // nand(A, nand(B, C)) - def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<v4i32, (or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), !sub(255, 14)>; // EQV // (eqv A, B, C) - def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<v4i32, (or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))), 150>; // (eqv A, (and B, C)) - def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; + def : XXEvalPattern<v4i32, (vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; // (eqv A, (or B, C)) - def : XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; + def : XXEvalPattern<v4i32, (vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; // NOR // (nor A, B, C) - def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; + def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; // (nor A, (and B, C)) - def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; + def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; // (nor A, (eqv B, C)) - def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 
96>; + def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>; // (nor A, (nand B, C)) - def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; + def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; // (nor A, (nor B, C)) - def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; + def : XXEvalPattern<v4i32, (and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; // (nor A, (xor B, C)) - def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; + def : XXEvalPattern<v4i32, (vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; // OR // (or A, B, C) - def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; // (or A, (and B, C)) - def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; // (or A, (eqv B, C)) - def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; // (or A, (nand B, C)) - def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; // (or A, (nor B, C)) - def : XXEvalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>; // (or A, (xor B, C)) - def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; + def : XXEvalPattern<v4i32, (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; // XOR // (xor A, B, C) - def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; + def : XXEvalPattern<v4i32, (xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; // (xor A, (and B, C)) - def : XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; + def : XXEvalPattern<v4i32, (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; // (xor A, (or B, C)) - def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + def : XXEvalPattern<v4i32, (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; // XXEval Patterns for ternary Operations. 
foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index d2b75a6..34026ed 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -45,8 +45,8 @@ public: CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, CCState &State) override { - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, - IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; StackSize = State.getStackSize(); @@ -196,8 +196,8 @@ public: if (LocVT.isScalableVector()) MF.getInfo<RISCVMachineFunctionInfo>()->setIsVectorCall(); - if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, - /*IsFixed=*/true, IsRet, Info.Ty)) + if (RISCVAssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State, IsRet, + Info.Ty)) return true; StackSize = State.getStackSize(); @@ -454,7 +454,7 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, - /*IsFixed=*/true, /*isRet=*/true, nullptr)) + /*isRet=*/true, nullptr)) return false; } return true; diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index cb6117e..70127e3 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -324,7 +324,7 @@ static MCRegister allocateRVVReg(MVT ValVT, unsigned ValNo, CCState &State, // Implements the RISC-V calling convention. Returns true upon failure. bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { + CCState &State, bool IsRet, Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); @@ -379,12 +379,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, break; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: - UseGPRForF16_F32 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); break; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: - UseGPRForF16_F32 = !IsFixed; - UseGPRForF64 = !IsFixed; + UseGPRForF16_F32 = ArgFlags.isVarArg(); + UseGPRForF64 = ArgFlags.isVarArg(); break; } @@ -465,7 +465,7 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // currently if we are using ILP32E calling convention. This behavior may be // changed when RV32E/ILP32E is ratified. unsigned TwoXLenInBytes = (2 * XLen) / 8; - if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && + if (ArgFlags.isVarArg() && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes && ABI != RISCVABI::ABI_ILP32E) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); @@ -620,8 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, // benchmark. But theoretically, it may have benefit for some cases. 
bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy) { + ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsRet, + Type *OrigTy) { const MachineFunction &MF = State.getMachineFunction(); const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); const RISCVTargetLowering &TLI = *Subtarget.getTargetLowering(); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.h b/llvm/lib/Target/RISCV/RISCVCallingConv.h index bf823b7..2030ce1 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.h +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.h @@ -21,15 +21,15 @@ namespace llvm { typedef bool RISCVCCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, - bool IsFixed, bool IsRet, Type *OrigTy); + bool IsRet, Type *OrigTy); bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy); + CCState &State, bool IsRet, Type *OrigTy); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 171940e..a7329d2 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1700,6 +1700,18 @@ def TuneNLogNVRGather def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; +def TuneDisableMISchedLoadClustering : SubtargetFeature<"disable-misched-load-clustering", + "EnableMISchedLoadClustering", "false", "Disable load clustering in the machine scheduler">; + +def TuneDisableMISchedStoreClustering : SubtargetFeature<"disable-misched-store-clustering", + "EnableMISchedStoreClustering", "false", "Disable store clustering in the machine scheduler">; + +def TuneDisablePostMISchedLoadClustering : SubtargetFeature<"disable-postmisched-load-clustering", + "EnablePostMISchedLoadClustering", "false", "Disable PostRA load clustering in the machine scheduler">; + +def TuneDisablePostMISchedStoreClustering : SubtargetFeature<"disable-postmisched-store-clustering", + "EnablePostMISchedStoreClustering", "false", "Disable PostRA store clustering in the machine scheduler">; + def TuneDisableLatencySchedHeuristic : SubtargetFeature<"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0077ecf..e4aa8b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22282,8 +22282,8 @@ void RISCVTargetLowering::analyzeInputArgs( else if (In.isOrigArg()) ArgTy = FType->getParamType(In.getOrigArgIndex()); - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, IsRet, ArgTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << Idx << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -22300,8 +22300,8 @@ void 
RISCVTargetLowering::analyzeOutputArgs( ISD::ArgFlagsTy ArgFlags = Out.Flags; Type *OrigTy = CLI ? CLI->getArgs()[Out.OrigArgIndex].Ty : nullptr; - if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Out.IsFixed, - IsRet, OrigTy)) { + if (Fn(Idx, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, IsRet, + OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -23083,7 +23083,7 @@ bool RISCVTargetLowering::CanLowerReturn( MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, - /*IsFixed=*/true, /*IsRet=*/true, nullptr)) + /*IsRet=*/true, nullptr)) return false; } return true; @@ -24691,7 +24691,7 @@ SDValue RISCVTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); + DAG.getSignedConstant(-Align->value(), dl, VT)); // Set the real SP to the new value with a probing loop. Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 04ffb05..413ad8b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -629,9 +629,6 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), (zexti8 (XLenVT GPR:$rs1))), @@ -642,11 +639,15 @@ let Predicates = [HasStdExtZbkb, IsRV32] in { def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), + (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), + (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. 
-def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
                   (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
               (zexti16 (XLenVT GPR:$rs1))),
           (PACK (XLenVT GPR:$rs1),
@@ -657,12 +658,40 @@ let Predicates = [HasStdExtZbkb, IsRV64] in {
 def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)),
                    (shl GPR:$rs2, (i64 32)))),
           (PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
+              (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
+                               (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+          (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+
 def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
                                (zexti16 (i64 GPR:$rs1))),
           (PACKW GPR:$rs1, GPR:$rs2)>;
 def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
                    (zexti16 (i64 GPR:$rs1)))),
           (PACKW GPR:$rs1, GPR:$rs2)>;
+
+// Match a pattern of 2 bytes being inserted into bits [31:16], with
+// bits [15:0] coming from a zero extended value, and bits [63:32] being
+// ignored. We can use packw with packh for bits [31:16]. If bits [15:0] can
+// also be a packh, it can be matched separately.
+def : Pat<(binop_allwusers<or>
+              (or (shl GPR:$op1rs2, (XLenVT 24)),
+                  (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+              (zexti16 (XLenVT GPR:$rs1))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+// We need to manually reassociate the patterns because of the binop_allwusers.
+def : Pat<(binop_allwusers<or>
+              (or (zexti16 (XLenVT GPR:$rs1)),
+                  (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+              (shl GPR:$op1rs2, (XLenVT 24))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(binop_allwusers<or>
+              (or (zexti16 (XLenVT GPR:$rs1)),
+                  (shl GPR:$op1rs1, (XLenVT 24))),
+              (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+          (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
 } // Predicates = [HasStdExtZbkb, IsRV64]

 let Predicates = [HasStdExtZbb, IsRV32] in
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.td b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
index 875a93d..39e099b 100644
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.td
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.td
@@ -91,3 +91,59 @@ def TuneLDADDFusion
                       CheckIsImmOperand<2>,
                       CheckImmOperand<2, 0>
                     ]>>;
+
+defvar Load = [LB, LH, LW, LD, LBU, LHU, LWU];
+
+// Fuse add(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu):
+// add(.uw) rd, rs1, rs2
+// load rd, imm12(rd)
+def TuneADDLoadFusion
+    : SimpleFusion<"add-load-fusion", "HasADDLoadFusion", "Enable ADD(.UW) + load macrofusion",
+                   CheckOpcode<[ADD, ADD_UW]>,
+                   CheckOpcode<Load>>;
+
+// Fuse AUIPC followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// auipc rd, imm20
+// load rd, imm12(rd)
+def TuneAUIPCLoadFusion
+    : SimpleFusion<"auipc-load-fusion", "HasAUIPCLoadFusion",
+                   "Enable AUIPC + load macrofusion",
+                   CheckOpcode<[AUIPC]>,
+                   CheckOpcode<Load>>;
+
+// Fuse LUI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu)
+// lui rd, imm[31:12]
+// load rd, imm12(rd)
+def TuneLUILoadFusion
+    : SimpleFusion<"lui-load-fusion", "HasLUILoadFusion",
+                   "Enable LUI + load macrofusion",
+                   CheckOpcode<[LUI]>,
+                   CheckOpcode<Load>>;
+
+// Bitfield extract fusion: similar to TuneShiftedZExtWFusion
+// but without the immediate restriction
+// slli rd, rs1, imm12
+// srli rd, rd, imm12
+def TuneBFExtFusion
+    : SimpleFusion<"bfext-fusion", "HasBFExtFusion",
+                   "Enable SLLI+SRLI (bitfield extract) 
macrofusion", + CheckOpcode<[SLLI]>, + CheckOpcode<[SRLI]>>; + +// Fuse ADDI followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// addi rd, rs1, imm12 +// load rd, imm12(rd) +def TuneADDILoadFusion + : SimpleFusion<"addi-load-fusion", "HasADDILoadFusion", + "Enable ADDI + load macrofusion", + CheckOpcode<[ADDI]>, + CheckOpcode<Load>>; + +// Fuse shXadd(.uw) followed by a load (lb, lh, lw, ld, lbu, lhu, lwu) +// shXadd(.uw) rd, rs1, rs2 +// load rd, imm12(rd) +def TuneSHXADDLoadFusion + : SimpleFusion<"shxadd-load-fusion", "HasSHXADDLoadFusion", + "Enable SH(1|2|3)ADD(.UW) + load macrofusion", + CheckOpcode<[SH1ADD, SH2ADD, SH3ADD, SH1ADD_UW, SH2ADD_UW, SH3ADD_UW]>, + CheckOpcode<Load>>; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 838edf6..31d2b3a 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -590,12 +590,17 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", FeatureStdExtZicboz, FeatureVendorXVentanaCondOps], [TuneVentanaVeyron, + TuneDisableMISchedLoadClustering, + TuneDisablePostMISchedLoadClustering, + TuneDisablePostMISchedStoreClustering, TuneLUIADDIFusion, TuneAUIPCADDIFusion, TuneZExtHFusion, TuneZExtWFusion, TuneShiftedZExtWFusion, - TuneLDADDFusion]> { + TuneADDLoadFusion, + TuneAUIPCLoadFusion, + TuneLUILoadFusion]> { let MVendorID = 0x61f; let MArchID = 0x8000000000010000; let MImpID = 0x111; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 3f2a83f..66ce134 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -94,16 +94,6 @@ static cl::opt<bool> cl::desc("Enable the loop data prefetch pass"), cl::init(true)); -static cl::opt<bool> EnableMISchedLoadStoreClustering( - "riscv-misched-load-store-clustering", cl::Hidden, - cl::desc("Enable load and store clustering in the machine scheduler"), - cl::init(true)); - -static cl::opt<bool> EnablePostMISchedLoadStoreClustering( - "riscv-postmisched-load-store-clustering", cl::Hidden, - cl::desc("Enable PostRA load and store clustering in the machine scheduler"), - cl::init(true)); - static cl::opt<bool> DisableVectorMaskMutation( "riscv-disable-vector-mask-mutation", cl::desc("Disable the vector mask scheduling mutation"), cl::init(false), @@ -294,15 +284,17 @@ bool RISCVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, ScheduleDAGInstrs * RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const { + const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); ScheduleDAGMILive *DAG = createSchedLive(C); - if (EnableMISchedLoadStoreClustering) { + + if (ST.enableMISchedLoadClustering()) DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); + + if (ST.enableMISchedStoreClustering()) DAG->addMutation(createStoreClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); - } - const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); if (!DisableVectorMaskMutation && ST.hasVInstructions()) DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI)); @@ -311,13 +303,16 @@ RISCVTargetMachine::createMachineScheduler(MachineSchedContext *C) const { ScheduleDAGInstrs * RISCVTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { + const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>(); ScheduleDAGMI *DAG = createSchedPostRA(C); - if (EnablePostMISchedLoadStoreClustering) { + + if 
(ST.enablePostMISchedLoadClustering()) DAG->addMutation(createLoadClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); + + if (ST.enablePostMISchedStoreClustering()) DAG->addMutation(createStoreClusterDAGMutation( DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true)); - } return DAG; } diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h index 78a066b..ed0a1e1 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h @@ -73,7 +73,11 @@ public: Entry(std::move(CR.Entry)), Exits(std::move(CR.Exits)), Blocks(std::move(CR.Blocks)) {} + ~ConvergenceRegion() { releaseMemory(); } + + ConvergenceRegion &operator=(ConvergenceRegion &&CR) = delete; ConvergenceRegion(const ConvergenceRegion &other) = delete; + ConvergenceRegion &operator=(const ConvergenceRegion &other) = delete; // Returns true if the given basic block belongs to this region, or to one of // its subregion. @@ -101,6 +105,9 @@ public: ~ConvergenceRegionInfo() { releaseMemory(); } + ConvergenceRegionInfo(const ConvergenceRegionInfo &LHS) = delete; + ConvergenceRegionInfo &operator=(const ConvergenceRegionInfo &LHS) = delete; + ConvergenceRegionInfo(ConvergenceRegionInfo &&LHS) : TopLevelRegion(LHS.TopLevelRegion) { if (TopLevelRegion != LHS.TopLevelRegion) { diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index ba09451..6660de9 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_target(SPIRVCodeGen SPIRVGlobalRegistry.cpp SPIRVInstrInfo.cpp SPIRVInstructionSelector.cpp + SPIRVLegalizeImplicitBinding.cpp SPIRVStripConvergentIntrinsics.cpp SPIRVLegalizePointerCast.cpp SPIRVMergeRegionExitTargets.cpp diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 64d301e..4ec31bf 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -96,7 +96,7 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI, void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) { MCRegister Reg = MI->getOperand(0).getReg(); auto Name = getSPIRVStringOperand(*MI, 1); - auto Set = getExtInstSetFromString(Name); + auto Set = getExtInstSetFromString(std::move(Name)); ExtInstSetIDs.insert({Reg, Set}); } @@ -210,6 +210,7 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, case SPIRV::OpConstantF: // The last fixed operand along with any variadic operands that follow // are part of the variable value. 
+ assert(NumFixedOps > 0 && "Expected at least one fixed operand"); printOpConstantVarOps(MI, NumFixedOps - 1, OS); break; case SPIRV::OpCooperativeMatrixMulAddKHR: { diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index 1688fa3..1934e98 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -23,6 +23,7 @@ ModulePass *createSPIRVPrepareFunctionsPass(const SPIRVTargetMachine &TM); FunctionPass *createSPIRVStructurizerPass(); FunctionPass *createSPIRVMergeRegionExitTargetsPass(); FunctionPass *createSPIRVStripConvergenceIntrinsicsPass(); +ModulePass *createSPIRVLegalizeImplicitBindingPass(); FunctionPass *createSPIRVLegalizePointerCastPass(SPIRVTargetMachine *TM); FunctionPass *createSPIRVRegularizerPass(); FunctionPass *createSPIRVPreLegalizerCombiner(); @@ -49,6 +50,7 @@ void initializeSPIRVRegularizerPass(PassRegistry &); void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); void initializeSPIRVPrepareFunctionsPass(PassRegistry &); void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); +void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp index cfe7ef4..d6581b2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp @@ -156,7 +156,7 @@ SPIRVTranslateModule(Module *M, std::string &SpirvObj, std::string &ErrMsg, } } return SPIRVTranslate(M, SpirvObj, ErrMsg, AllowExtNames, OLevel, - TargetTriple); + std::move(TargetTriple)); } } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 1ebfde2..c2a6e51 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -50,7 +50,8 @@ class SPIRVAsmPrinter : public AsmPrinter { public: explicit SPIRVAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer), ID), ST(nullptr), TII(nullptr) {} + : AsmPrinter(TM, std::move(Streamer), ID), ModuleSectionsEmitted(false), + ST(nullptr), TII(nullptr), MAI(nullptr) {} static char ID; bool ModuleSectionsEmitted; const SPIRVSubtarget *ST; @@ -591,7 +592,9 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) { cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts()); StringRef AnnotationString; - getConstantStringInfo(GV, AnnotationString); + [[maybe_unused]] bool Success = + getConstantStringInfo(GV, AnnotationString); + assert(Success && "Failed to get annotation string"); MCInst Inst; Inst.setOpcode(SPIRV::OpDecorate); Inst.addOperand(MCOperand::createReg(Reg)); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 25cdf72..e6e86b7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -51,7 +51,7 @@ struct IncomingCall { IncomingCall(const std::string BuiltinName, const DemangledBuiltin *Builtin, const Register ReturnRegister, const SPIRVType *ReturnType, const SmallVectorImpl<Register> &Arguments) - : BuiltinName(BuiltinName), Builtin(Builtin), + : BuiltinName(std::move(BuiltinName)), Builtin(Builtin), ReturnRegister(ReturnRegister), ReturnType(ReturnType), Arguments(Arguments) {} @@ -2619,6 +2619,7 @@ static bool generateConvertInst(const StringRef DemangledCall, GR->getSPIRVTypeID(Call->ReturnType)); } + assert(Builtin && "Conversion builtin not found."); if (Builtin->IsSaturated) 
buildOpDecorate(Call->ReturnRegister, MIRBuilder, SPIRV::Decoration::SaturatedConversion, {}); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 2c3e087..f5a49e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -499,7 +499,7 @@ void SPIRVEmitIntrinsics::propagateElemTypeRec( std::unordered_set<Value *> Visited; DenseMap<Function *, CallInst *> Ptrcasts; propagateElemTypeRec(Op, PtrElemTy, CastElemTy, VisitedSubst, Visited, - Ptrcasts); + std::move(Ptrcasts)); } void SPIRVEmitIntrinsics::propagateElemTypeRec( @@ -897,17 +897,16 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper( bool Change = false; for (unsigned i = 0; i < U->getNumOperands(); ++i) { Value *Op = U->getOperand(i); + assert(Op && "Operands should not be null."); Type *OpTy = Op->getType(); Type *Ty = OpTy; - if (Op) { - if (auto *PtrTy = dyn_cast<PointerType>(OpTy)) { - if (Type *NestedTy = - deduceElementTypeHelper(Op, Visited, UnknownElemTypeI8)) - Ty = getTypedPointerWrapper(NestedTy, PtrTy->getAddressSpace()); - } else { - Ty = deduceNestedTypeHelper(dyn_cast<User>(Op), OpTy, Visited, - UnknownElemTypeI8); - } + if (auto *PtrTy = dyn_cast<PointerType>(OpTy)) { + if (Type *NestedTy = + deduceElementTypeHelper(Op, Visited, UnknownElemTypeI8)) + Ty = getTypedPointerWrapper(NestedTy, PtrTy->getAddressSpace()); + } else { + Ty = deduceNestedTypeHelper(dyn_cast<User>(Op), OpTy, Visited, + UnknownElemTypeI8); } Tys.push_back(Ty); Change |= Ty != OpTy; diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp index 7f0d636..275463e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -116,6 +116,7 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) { } } const NamedMDNode *ModuleFlags = M->getNamedMetadata("llvm.module.flags"); + assert(ModuleFlags && "Expected llvm.module.flags metadata to be present"); for (const auto *Op : ModuleFlags->operands()) { const MDOperand &MaybeStrOp = Op->getOperand(1); if (MaybeStrOp.equalsStr("Dwarf Version")) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index f1436d5..cfe24c8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -87,7 +87,7 @@ storageClassRequiresExplictLayout(SPIRV::StorageClass::StorageClass SC) { } SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize) - : PointerSize(PointerSize), Bound(0) {} + : PointerSize(PointerSize), Bound(0), CurMF(nullptr) {} SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth, Register VReg, @@ -474,8 +474,8 @@ Register SPIRVGlobalRegistry::getOrCreateBaseRegister( } if (Type->getOpcode() == SPIRV::OpTypeFloat) { SPIRVType *SpvBaseType = getOrCreateSPIRVFloatType(BitWidth, I, TII); - return getOrCreateConstFP(dyn_cast<ConstantFP>(Val)->getValue(), I, - SpvBaseType, TII, ZeroAsNull); + return getOrCreateConstFP(cast<ConstantFP>(Val)->getValue(), I, SpvBaseType, + TII, ZeroAsNull); } assert(Type->getOpcode() == SPIRV::OpTypeInt); SPIRVType *SpvBaseType = getOrCreateSPIRVIntegerType(BitWidth, I, TII); @@ -1069,7 +1069,8 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( MIRBuilder); }; } - return getOpTypeStruct(SType, MIRBuilder, AccQual, Decorator, EmitIR); + return getOpTypeStruct(SType, MIRBuilder, AccQual, std::move(Decorator), + EmitIR); } 
if (auto FType = dyn_cast<FunctionType>(Ty)) { SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder, @@ -1406,8 +1407,9 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateLayoutType( // We need a new OpTypeStruct instruction because decorations will be // different from a struct with an explicit layout created from a different // entry point. - SPIRVType *SPIRVStructType = getOpTypeStruct( - ST, MIRBuilder, SPIRV::AccessQualifier::None, Decorator, EmitIr); + SPIRVType *SPIRVStructType = + getOpTypeStruct(ST, MIRBuilder, SPIRV::AccessQualifier::None, + std::move(Decorator), EmitIr); add(Key, SPIRVStructType); return SPIRVStructType; } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index e9f5ffa..5259db1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -362,6 +362,7 @@ SPIRVInstructionSelector::SPIRVInstructionSelector(const SPIRVTargetMachine &TM, const RegisterBankInfo &RBI) : InstructionSelector(), STI(ST), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), RBI(RBI), GR(*ST.getSPIRVGlobalRegistry()), + MRI(nullptr), #define GET_GLOBALISEL_PREDICATES_INIT #include "SPIRVGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT @@ -3574,7 +3575,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow( // Join all the resulting registers back into the return type in order // (ie i32x2, i32x2, i32x1 -> i32x5) - return selectOpWithSrcs(ResVReg, ResType, I, PartialRegs, + return selectOpWithSrcs(ResVReg, ResType, I, std::move(PartialRegs), SPIRV::OpCompositeConstruct); } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp new file mode 100644 index 0000000..0398e52 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -0,0 +1,159 @@ +//===- SPIRVLegalizeImplicitBinding.cpp - Legalize implicit bindings ----*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass legalizes the @llvm.spv.resource.handlefromimplicitbinding +// intrinsic by replacing it with a call to +// @llvm.spv.resource.handlefrombinding. +// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include <algorithm> +#include <vector> + +using namespace llvm; + +namespace { +class SPIRVLegalizeImplicitBinding : public ModulePass { +public: + static char ID; + SPIRVLegalizeImplicitBinding() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + +private: + void collectBindingInfo(Module &M); + uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); + void replaceImplicitBindingCalls(Module &M); + + // A map from descriptor set to a bit vector of used binding numbers. + std::vector<BitVector> UsedBindings; + // A list of all implicit binding calls, to be sorted by order ID. 
+ SmallVector<CallInst *, 16> ImplicitBindingCalls; +}; + +struct BindingInfoCollector : public InstVisitor<BindingInfoCollector> { + std::vector<BitVector> &UsedBindings; + SmallVector<CallInst *, 16> &ImplicitBindingCalls; + + BindingInfoCollector(std::vector<BitVector> &UsedBindings, + SmallVector<CallInst *, 16> &ImplicitBindingCalls) + : UsedBindings(UsedBindings), ImplicitBindingCalls(ImplicitBindingCalls) { + } + + void visitCallInst(CallInst &CI) { + if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefrombinding) { + const uint32_t DescSet = + cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); + const uint32_t Binding = + cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); + + if (UsedBindings.size() <= DescSet) { + UsedBindings.resize(DescSet + 1); + UsedBindings[DescSet].resize(64); + } + if (UsedBindings[DescSet].size() <= Binding) { + UsedBindings[DescSet].resize(2 * Binding + 1); + } + UsedBindings[DescSet].set(Binding); + } else if (CI.getIntrinsicID() == + Intrinsic::spv_resource_handlefromimplicitbinding) { + ImplicitBindingCalls.push_back(&CI); + } + } +}; + +void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { + BindingInfoCollector InfoCollector(UsedBindings, ImplicitBindingCalls); + InfoCollector.visit(M); + + // Sort the collected calls by their order ID. + std::sort( + ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(), + [](const CallInst *A, const CallInst *B) { + const uint32_t OrderIdArgIdx = 0; + const uint32_t OrderA = + cast<ConstantInt>(A->getArgOperand(OrderIdArgIdx))->getZExtValue(); + const uint32_t OrderB = + cast<ConstantInt>(B->getArgOperand(OrderIdArgIdx))->getZExtValue(); + return OrderA < OrderB; + }); +} + +uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( + uint32_t DescSet) { + if (UsedBindings.size() <= DescSet) { + UsedBindings.resize(DescSet + 1); + UsedBindings[DescSet].resize(64); + } + + int NewBinding = UsedBindings[DescSet].find_first_unset(); + if (NewBinding == -1) { + NewBinding = UsedBindings[DescSet].size(); + UsedBindings[DescSet].resize(2 * NewBinding + 1); + } + + UsedBindings[DescSet].set(NewBinding); + return NewBinding; +} + +void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { + for (CallInst *OldCI : ImplicitBindingCalls) { + IRBuilder<> Builder(OldCI); + const uint32_t DescSet = + cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); + const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet); + + SmallVector<Value *, 8> Args; + Args.push_back(Builder.getInt32(DescSet)); + Args.push_back(Builder.getInt32(NewBinding)); + + // Copy the remaining arguments from the old call. 
+ for (uint32_t i = 2; i < OldCI->arg_size(); ++i) { + Args.push_back(OldCI->getArgOperand(i)); + } + + Function *NewFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType()); + CallInst *NewCI = Builder.CreateCall(NewFunc, Args); + NewCI->setCallingConv(OldCI->getCallingConv()); + + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); + } +} + +bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) { + collectBindingInfo(M); + if (ImplicitBindingCalls.empty()) { + return false; + } + + replaceImplicitBindingCalls(M); + return true; +} +} // namespace + +char SPIRVLegalizeImplicitBinding::ID = 0; + +INITIALIZE_PASS(SPIRVLegalizeImplicitBinding, "legalize-spirv-implicit-binding", + "Legalize SPIR-V implicit bindings", false, false) + +ModulePass *llvm::createSPIRVLegalizeImplicitBindingPass() { + return new SPIRVLegalizeImplicitBinding(); +}
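The core of the new SPIRVLegalizeImplicitBinding pass is the binding reservation shown above. A minimal standalone model of that logic (plain std::vector<bool> standing in for llvm::BitVector; this is not the pass itself): each descriptor set tracks which binding numbers are already taken, and every implicit binding receives the first free slot, growing the set on demand.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Reserve and return the first unused binding number in the given descriptor set.
static uint32_t reserveFirstUnused(std::vector<std::vector<bool>> &Used,
                                   uint32_t DescSet) {
  if (Used.size() <= DescSet)
    Used.resize(DescSet + 1);
  std::vector<bool> &Bits = Used[DescSet];
  for (uint32_t B = 0; B < Bits.size(); ++B) {
    if (!Bits[B]) {
      Bits[B] = true;
      return B;
    }
  }
  uint32_t B = static_cast<uint32_t>(Bits.size()); // no gap: append a new binding
  Bits.push_back(true);
  return B;
}

int main() {
  std::vector<std::vector<bool>> Used(1);
  Used[0] = {true, false, true};            // explicit bindings 0 and 2 already used
  assert(reserveFirstUnused(Used, 0) == 1); // fills the gap first
  assert(reserveFirstUnused(Used, 0) == 3); // then grows past the end
  assert(reserveFirstUnused(Used, 5) == 0); // an unseen descriptor set starts at 0
  return 0;
}
```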
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index ab06fc0..8039cf0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -93,7 +93,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category, if (Reqs.isCapabilityAvailable(Cap)) { ReqExts.append(getSymbolicOperandExtensions( SPIRV::OperandCategory::CapabilityOperand, Cap)); - return {true, {Cap}, ReqExts, ReqMinVer, ReqMaxVer}; + return {true, {Cap}, std::move(ReqExts), ReqMinVer, ReqMaxVer}; } } else { // By SPIR-V specification: "If an instruction, enumerant, or other @@ -111,7 +111,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category, if (i == Sz - 1 || !AvoidCaps.S.contains(Cap)) { ReqExts.append(getSymbolicOperandExtensions( SPIRV::OperandCategory::CapabilityOperand, Cap)); - return {true, {Cap}, ReqExts, ReqMinVer, ReqMaxVer}; + return {true, {Cap}, std::move(ReqExts), ReqMinVer, ReqMaxVer}; } } } @@ -558,7 +558,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, bool Append = true) { MAI.setSkipEmission(&MI); InstrSignature MISign = instrToSignature(MI, MAI, true); - auto FoundMI = IS.insert(MISign); + auto FoundMI = IS.insert(std::move(MISign)); if (!FoundMI.second) return; // insert failed, so we found a duplicate; don't add it to MAI.MS // No duplicates, so add it. diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index a0d47cb..41c792a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -54,8 +54,8 @@ struct Requirements { std::optional<Capability::Capability> Cap = {}, ExtensionList Exts = {}, VersionTuple MinVer = VersionTuple(), VersionTuple MaxVer = VersionTuple()) - : IsSatisfiable(IsSatisfiable), Cap(Cap), Exts(Exts), MinVer(MinVer), - MaxVer(MaxVer) {} + : IsSatisfiable(IsSatisfiable), Cap(Cap), Exts(std::move(Exts)), + MinVer(MinVer), MaxVer(MaxVer) {} Requirements(Capability::Capability Cap) : Requirements(true, {Cap}) {} }; @@ -217,7 +217,8 @@ struct SPIRVModuleAnalysis : public ModulePass { static char ID; public: - SPIRVModuleAnalysis() : ModulePass(ID) {} + SPIRVModuleAnalysis() + : ModulePass(ID), ST(nullptr), GR(nullptr), TII(nullptr), MMI(nullptr) {} bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override; diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index 1d38244..d17528d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -147,7 +147,7 @@ void visit(MachineFunction &MF, MachineBasicBlock &Start, // Do a preorder traversal of the CFG starting from the given function's entry // point. Calls |op| on each basic block encountered during the traversal. 
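For reference, a preorder walk of the kind described by the comment above can be written as a small worklist loop; this is only a sketch of the general shape (it is not the body of visit() in SPIRVPostLegalizer.cpp), using standard LLVM ADTs and MachineBasicBlock::successors().

```cpp
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

// Visit every reachable block exactly once, always before its unvisited successors.
static void visitPreorder(MachineFunction &MF,
                          function_ref<void(MachineBasicBlock *)> Op) {
  SmallPtrSet<MachineBasicBlock *, 16> Seen;
  SmallVector<MachineBasicBlock *, 16> Worklist;
  Worklist.push_back(&*MF.begin());
  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    if (!Seen.insert(MBB).second)
      continue; // already visited
    Op(MBB);    // preorder: handle the block before queueing its successors
    for (MachineBasicBlock *Succ : MBB->successors())
      Worklist.push_back(Succ);
  }
}
```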
void visit(MachineFunction &MF, std::function<void(MachineBasicBlock *)> op) { - visit(MF, *MF.begin(), op); + visit(MF, *MF.begin(), std::move(op)); } bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index f4b4846..b62db7f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -99,6 +99,7 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, SPIRVType *ExtType = GR->getOrCreateSPIRVType( Const->getType(), MIB, SPIRV::AccessQualifier::ReadWrite, true); + assert(SrcMI && "Expected source instruction to be valid"); SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull)); SrcMI->addOperand(MachineOperand::CreateReg( GR->getSPIRVTypeID(ExtType), false)); diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 595424b..74aec4f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -234,7 +234,7 @@ static SmallVector<Metadata *> parseAnnotation(Value *I, return SmallVector<Metadata *>{}; MDs.push_back(MDNode::get(Ctx, MDsItem)); } - return Pos == static_cast<int>(Anno.length()) ? MDs + return Pos == static_cast<int>(Anno.length()) ? std::move(MDs) : SmallVector<Metadata *>{}; } diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index d7cf211..e0bfb77 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -226,6 +226,7 @@ void SPIRVPassConfig::addIRPasses() { } void SPIRVPassConfig::addISelPrepare() { + addPass(createSPIRVLegalizeImplicitBindingPass()); addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>())); if (TM.getSubtargetImpl()->isLogicalSPIRV()) addPass(createSPIRVLegalizePointerCastPass(&getTM<SPIRVTargetMachine>())); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 416d811..820e56b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -463,8 +463,10 @@ std::string getOclOrSpirvBuiltinDemangledName(StringRef Name) { DemangledNameLenStart = NameSpaceStart + 11; } Start = Name.find_first_not_of("0123456789", DemangledNameLenStart); - Name.substr(DemangledNameLenStart, Start - DemangledNameLenStart) - .getAsInteger(10, Len); + [[maybe_unused]] bool Error = + Name.substr(DemangledNameLenStart, Start - DemangledNameLenStart) + .getAsInteger(10, Len); + assert(!Error && "Failed to parse demangled name length"); return Name.substr(Start, Len).str(); } @@ -756,7 +758,7 @@ bool getVacantFunctionName(Module &M, std::string &Name) { for (unsigned I = 0; I < MaxIters; ++I) { std::string OrdName = Name + Twine(I).str(); if (!M.getFunction(OrdName)) { - Name = OrdName; + Name = std::move(OrdName); return true; } } diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 1aa8efe..c0fc3a6 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1179,7 +1179,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs, if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128)) continue; // The fixed arguments to a varargs function still go in FP registers. 
- if (Outs[VA.getValNo()].IsFixed) + if (!Outs[VA.getValNo()].Flags.isVarArg()) continue; // This floating point argument should be reassigned. diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 25f4aac..fbb98ff 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -31,10 +31,6 @@ namespace SystemZ { class SystemZCCState : public CCState { private: - /// Records whether the value was a fixed argument. - /// See ISD::OutputArg::IsFixed. - SmallVector<bool, 4> ArgIsFixed; - /// Records whether the value was widened from a short vector type. SmallVector<bool, 4> ArgIsShortVector; @@ -50,10 +46,6 @@ public: void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, CCAssignFn Fn) { - // Formal arguments are always fixed. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Ins.size(); ++i) - ArgIsFixed.push_back(true); // Record whether the call operand was a short vector. ArgIsShortVector.clear(); for (unsigned i = 0; i < Ins.size(); ++i) @@ -64,10 +56,6 @@ public: void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { - // Record whether the call operand was a fixed argument. - ArgIsFixed.clear(); - for (unsigned i = 0; i < Outs.size(); ++i) - ArgIsFixed.push_back(Outs[i].IsFixed); // Record whether the call operand was a short vector. ArgIsShortVector.clear(); for (unsigned i = 0; i < Outs.size(); ++i) @@ -77,12 +65,11 @@ public: } // This version of AnalyzeCallOperands in the base class is not usable - // since we must provide a means of accessing ISD::OutputArg::IsFixed. + // since we must provide a means of accessing ISD::OutputArg::IsShortVector. void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs, SmallVectorImpl<ISD::ArgFlagsTy> &Flags, CCAssignFn Fn) = delete; - bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } }; diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 0ad872b..059f31f 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -16,14 +16,6 @@ class CCIfSubtarget<string F, CCAction A> "getSubtarget<SystemZSubtarget>().", F), A>; -// Match if this specific argument is a fixed (i.e. named) argument. -class CCIfFixed<CCAction A> - : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>; - -// Match if this specific argument is not a fixed (i.e. vararg) argument. -class CCIfNotFixed<CCAction A> - : CCIf<"!(static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))", A>; - // Match if this specific argument was widened from a short vector type. class CCIfShortVector<CCAction A> : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>; @@ -79,7 +71,7 @@ def CC_SystemZ_GHC : CallingConv<[ // Pass in STG registers: XMM1, ..., XMM6 CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, + CCIfArgFixed<CCAssignToReg<[V16, V17, V18, V19, V20, V21]>>>>, // Fail otherwise CCCustom<"CC_SystemZ_GHC_Error"> @@ -125,8 +117,8 @@ def CC_SystemZ_ELF : CallingConv<[ // during type legalization. 
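Several hunks in this patch (SparcISelLowering, SystemZCallingConv, WebAssemblyISelLowering, X86CallLowering) replace the removed per-argument IsFixed bool with a vararg bit carried in the argument flags, so the old predicate is simply inverted: IsFixed == !Flags.isVarArg(). A tiny standalone sketch of that relationship (the structs are illustrative stand-ins, not the real ISD types):

```cpp
#include <cassert>

// Stand-in for ISD::ArgFlagsTy with only the flag relevant here.
struct ArgFlags {
  bool VarArg = false;
  bool isVarArg() const { return VarArg; }
};

// Stand-in for ISD::OutputArg after the IsFixed member was removed.
struct OutputArg {
  ArgFlags Flags;
  bool isFixed() const { return !Flags.isVarArg(); } // derived, not stored
};

int main() {
  OutputArg Named, Variadic;
  Variadic.Flags.VarArg = true;
  assert(Named.isFixed());     // named (fixed) argument
  assert(!Variadic.isFixed()); // argument passed through "..."
  return 0;
}
```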
CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V24, V26, V28, V30, - V25, V27, V29, V31]>>>>, + CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30, + V25, V27, V29, V31]>>>>, // However, sub-128 vectors which need to go on the stack occupy just a // single 8-byte-aligned 8-byte stack slot. Pass as i64. @@ -227,17 +219,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs. // Although we assign the f32 vararg to be bitcast, it will first be promoted // to an f64 within convertValVTToLocVT(). - CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>, + CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>>, // Pointers are always passed in full 64-bit registers. CCIfPtr<CCCustom<"CC_XPLINK64_Pointer">>, // long double, can only be passed in GPR2 and GPR3, if available, // hence R2Q - CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, + CCIfType<[f128], CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, // Non fixed vector arguments are treated in the same way as long // doubles. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, + CCIfArgVarArg<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>, // A SwiftSelf is passed in callee-saved R10. CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>, @@ -260,22 +252,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, + CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, - V28, V29, V30, V31], 16, 8>>>>, + CCIfArgFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, + V28, V29, V30, V31], 16, 8>>>>, // The first 4 named float and double arguments are passed in registers // FPR0-FPR6. The rest will be passed in the user area. - CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, - CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, + CCIfType<[f32, f64], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f32], + CCIfArgFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, + CCIfType<[f64], + CCIfArgFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, // The first 2 long double arguments are passed in register FPR0/FPR2 // and FPR4/FPR6. The rest will be passed in the user area. - CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, + CCIfType<[f128], CCIfArgFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, + CCIfType<[f128], CCIfArgFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. 
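The XPLINK64 rule CCIfType<[f32, f64], CCIfArgVarArg<CCBitConvertToType<i64>>> means a vararg f64 (or an f32 after promotion to f64) travels in a GPR as its raw bit pattern. A standalone model of that round trip (plain C++; the helper names are invented for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// "Bitcast to i64": the vararg double is carried as its raw 64-bit pattern.
static uint64_t toGPRBits(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

static double fromGPRBits(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  double X = 3.5;
  assert(fromGPRBits(toGPRBits(X)) == 3.5); // the value round-trips exactly
  return 0;
}
```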
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 6297916..5ee66e3 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -574,13 +574,11 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF, // Call mcount (Regmask from CC AnyReg since mcount preserves all normal // argument registers). - FunctionCallee FC = MF.getFunction().getParent()->getOrInsertFunction( - "mcount", Type::getVoidTy(MF.getFunction().getContext())); const uint32_t *Mask = MF.getSubtarget<SystemZSubtarget>() .getSpecialRegisters() ->getCallPreservedMask(MF, CallingConv::AnyReg); BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::CallBRASL)) - .addGlobalAddress(dyn_cast<Function>(FC.getCallee())) + .addExternalSymbol("mcount") .addRegMask(Mask); // Reload return address from 8 bytes above stack pointer. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 3f80b2a..f9eba4b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1309,7 +1309,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, OutVal = FINode; } // Count the number of fixed args *after* legalization. - NumFixedArgs += Out.IsFixed; + NumFixedArgs += !Out.Flags.isVarArg(); } bool IsVarArg = CLI.IsVarArg; @@ -1503,7 +1503,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( for (const ISD::OutputArg &Out : Outs) { assert(!Out.Flags.isByVal() && "byval is not valid for return values"); assert(!Out.Flags.isNest() && "nest is not valid for return values"); - assert(Out.IsFixed && "non-fixed return value is not valid"); + assert(!Out.Flags.isVarArg() && "non-fixed return value is not valid"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); if (Out.Flags.isInConsecutiveRegs()) diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0a6035..d9f4405 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -75,7 +75,7 @@ public: static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7}; - if (!Info.IsFixed) + if (Flags.isVarArg()) NumXMMRegs = State.getFirstUnallocated(XMMArgRegs); return Res; @@ -363,7 +363,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Info.CallConv, Info.IsVarArg)) return false; - bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + bool IsFixed = + Info.OrigArgs.empty() ? 
true : !Info.OrigArgs.back().Flags[0].isVarArg(); if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp index 2f92f86..39bec47 100644 --- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp +++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp @@ -145,39 +145,40 @@ struct DecodeRegister { }; const DecodeRegister SRDecoderTable[] = { - {Xtensa::LBEG, 0}, {Xtensa::LEND, 1}, - {Xtensa::LCOUNT, 2}, {Xtensa::SAR, 3}, - {Xtensa::BREG, 4}, {Xtensa::LITBASE, 5}, - {Xtensa::ACCLO, 16}, {Xtensa::ACCHI, 17}, - {Xtensa::M0, 32}, {Xtensa::M1, 33}, - {Xtensa::M2, 34}, {Xtensa::M3, 35}, - {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73}, - {Xtensa::IBREAKENABLE, 96}, {Xtensa::MEMCTL, 97}, - {Xtensa::DDR, 104}, {Xtensa::IBREAKA0, 128}, - {Xtensa::IBREAKA1, 129}, {Xtensa::DBREAKA0, 144}, - {Xtensa::DBREAKA1, 145}, {Xtensa::DBREAKC0, 160}, - {Xtensa::DBREAKC1, 161}, {Xtensa::CONFIGID0, 176}, - {Xtensa::EPC1, 177}, {Xtensa::EPC2, 178}, - {Xtensa::EPC3, 179}, {Xtensa::EPC4, 180}, - {Xtensa::EPC5, 181}, {Xtensa::EPC6, 182}, - {Xtensa::EPC7, 183}, {Xtensa::DEPC, 192}, - {Xtensa::EPS2, 194}, {Xtensa::EPS3, 195}, - {Xtensa::EPS4, 196}, {Xtensa::EPS5, 197}, - {Xtensa::EPS6, 198}, {Xtensa::EPS7, 199}, - {Xtensa::CONFIGID1, 208}, {Xtensa::EXCSAVE1, 209}, - {Xtensa::EXCSAVE2, 210}, {Xtensa::EXCSAVE3, 211}, - {Xtensa::EXCSAVE4, 212}, {Xtensa::EXCSAVE5, 213}, - {Xtensa::EXCSAVE6, 214}, {Xtensa::EXCSAVE7, 215}, - {Xtensa::CPENABLE, 224}, {Xtensa::INTERRUPT, 226}, - {Xtensa::INTCLEAR, 227}, {Xtensa::INTENABLE, 228}, - {Xtensa::PS, 230}, {Xtensa::VECBASE, 231}, - {Xtensa::EXCCAUSE, 232}, {Xtensa::DEBUGCAUSE, 233}, - {Xtensa::CCOUNT, 234}, {Xtensa::PRID, 235}, - {Xtensa::ICOUNT, 236}, {Xtensa::ICOUNTLEVEL, 237}, - {Xtensa::EXCVADDR, 238}, {Xtensa::CCOMPARE0, 240}, - {Xtensa::CCOMPARE1, 241}, {Xtensa::CCOMPARE2, 242}, - {Xtensa::MISC0, 244}, {Xtensa::MISC1, 245}, - {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}}; + {Xtensa::LBEG, 0}, {Xtensa::LEND, 1}, + {Xtensa::LCOUNT, 2}, {Xtensa::SAR, 3}, + {Xtensa::BREG, 4}, {Xtensa::LITBASE, 5}, + {Xtensa::SCOMPARE1, 12}, {Xtensa::ACCLO, 16}, + {Xtensa::ACCHI, 17}, {Xtensa::M0, 32}, + {Xtensa::M1, 33}, {Xtensa::M2, 34}, + {Xtensa::M3, 35}, {Xtensa::WINDOWBASE, 72}, + {Xtensa::WINDOWSTART, 73}, {Xtensa::IBREAKENABLE, 96}, + {Xtensa::MEMCTL, 97}, {Xtensa::ATOMCTL, 99}, + {Xtensa::DDR, 104}, {Xtensa::IBREAKA0, 128}, + {Xtensa::IBREAKA1, 129}, {Xtensa::DBREAKA0, 144}, + {Xtensa::DBREAKA1, 145}, {Xtensa::DBREAKC0, 160}, + {Xtensa::DBREAKC1, 161}, {Xtensa::CONFIGID0, 176}, + {Xtensa::EPC1, 177}, {Xtensa::EPC2, 178}, + {Xtensa::EPC3, 179}, {Xtensa::EPC4, 180}, + {Xtensa::EPC5, 181}, {Xtensa::EPC6, 182}, + {Xtensa::EPC7, 183}, {Xtensa::DEPC, 192}, + {Xtensa::EPS2, 194}, {Xtensa::EPS3, 195}, + {Xtensa::EPS4, 196}, {Xtensa::EPS5, 197}, + {Xtensa::EPS6, 198}, {Xtensa::EPS7, 199}, + {Xtensa::CONFIGID1, 208}, {Xtensa::EXCSAVE1, 209}, + {Xtensa::EXCSAVE2, 210}, {Xtensa::EXCSAVE3, 211}, + {Xtensa::EXCSAVE4, 212}, {Xtensa::EXCSAVE5, 213}, + {Xtensa::EXCSAVE6, 214}, {Xtensa::EXCSAVE7, 215}, + {Xtensa::CPENABLE, 224}, {Xtensa::INTERRUPT, 226}, + {Xtensa::INTCLEAR, 227}, {Xtensa::INTENABLE, 228}, + {Xtensa::PS, 230}, {Xtensa::VECBASE, 231}, + {Xtensa::EXCCAUSE, 232}, {Xtensa::DEBUGCAUSE, 233}, + {Xtensa::CCOUNT, 234}, {Xtensa::PRID, 235}, + 
{Xtensa::ICOUNT, 236}, {Xtensa::ICOUNTLEVEL, 237}, + {Xtensa::EXCVADDR, 238}, {Xtensa::CCOMPARE0, 240}, + {Xtensa::CCOMPARE1, 241}, {Xtensa::CCOMPARE2, 242}, + {Xtensa::MISC0, 244}, {Xtensa::MISC1, 245}, + {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}}; static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 821cba0..080a9c0 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -200,6 +200,9 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, case Xtensa::WINDOWBASE: case Xtensa::WINDOWSTART: return FeatureBits[Xtensa::FeatureWindowed]; + case Xtensa::ATOMCTL: + case Xtensa::SCOMPARE1: + return FeatureBits[Xtensa::FeatureWindowed]; case Xtensa::NoRegister: return false; } diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td index 97d5472..d6f3ef0 100644 --- a/llvm/lib/Target/Xtensa/XtensaFeatures.td +++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td @@ -73,6 +73,22 @@ def FeatureDiv32 : SubtargetFeature<"div32", "HasDiv32", "true", def HasDiv32 : Predicate<"Subtarget->hasDiv32()">, AssemblerPredicate<(all_of FeatureDiv32)>; +def FeatureS32C1I : SubtargetFeature<"s32c1i", "HasS32C1I", "true", + "Enable Xtensa S32C1I option">; +def HasS32C1I : Predicate<"Subtarget->hasS32C1I()">, + AssemblerPredicate<(all_of FeatureS32C1I)>; + +// Assume that lock-free native-width atomics are available, even if the target +// and operating system combination would not usually provide them. The user +// is responsible for providing any necessary __sync implementations. Code +// built with this feature is not ABI-compatible with code built without this +// feature, if atomic variables are exposed across the ABI boundary. +def FeatureForcedAtomics : SubtargetFeature<"forced-atomics", "HasForcedAtomics", "true", + "Assume that lock-free native-width atomics are available">; +def HasForcedAtomics : Predicate<"Subtarget->hasForcedAtomics()">, + AssemblerPredicate<(all_of FeatureForcedAtomics)>; +def HasAtomicLdSt : Predicate<"Subtarget->hasS32C1I() || Subtarget->hasForcedAtomics()">; + def FeatureRegionProtection : SubtargetFeature<"regprotect", "HasRegionProtection", "true", "Enable Xtensa Region Protection option">; def HasRegionProtection : Predicate<"Subtarget->hasRegionProtection()">, diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index fd42fd2..6a07bd8 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -250,6 +250,15 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); + if (Subtarget.hasS32C1I()) { + setMaxAtomicSizeInBitsSupported(32); + setMinCmpXchgSizeInBits(32); + } else if (Subtarget.hasForcedAtomics()) { + setMaxAtomicSizeInBitsSupported(32); + } else { + setMaxAtomicSizeInBitsSupported(0); + } + // Compute derived properties from the register classes computeRegisterProperties(STI.getRegisterInfo()); } @@ -1548,6 +1557,11 @@ const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } +TargetLowering::AtomicExpansionKind +XtensaTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + return AtomicExpansionKind::CmpXChg; +} + //===----------------------------------------------------------------------===// // Custom insertion //===----------------------------------------------------------------------===// @@ -1696,6 +1710,23 @@ MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter( return MBB; } + case Xtensa::ATOMIC_CMP_SWAP_32_P: { + MachineOperand &R = MI.getOperand(0); + MachineOperand &Addr = MI.getOperand(1); + MachineOperand &Cmp = MI.getOperand(2); + MachineOperand &Swap = MI.getOperand(3); + + BuildMI(*MBB, MI, DL, TII.get(Xtensa::WSR), Xtensa::SCOMPARE1) + .addReg(Cmp.getReg()); + + BuildMI(*MBB, MI, DL, TII.get(Xtensa::S32C1I), R.getReg()) + .addReg(Swap.getReg()) + .addReg(Addr.getReg()) + .addImm(0); + + MI.eraseFromParent(); + return MBB; + } default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h index e6ddf98..d84cbdb 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h @@ -145,6 +145,12 @@ public: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + bool shouldInsertFencesForAtomic(const Instruction *I) const override { + return true; + } + + AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index 31608f4..edcf247 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -496,6 +496,8 @@ def EXTW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), let hasSideEffects = 1; } +def : Pat<(atomic_fence timm, timm), (MEMW)>; + //===----------------------------------------------------------------------===// // Illegal instructions //===----------------------------------------------------------------------===// @@ -1499,6 +1501,46 @@ def RFI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm), } //===----------------------------------------------------------------------===// +// S32C1I +//===----------------------------------------------------------------------===// + +let mayStore = 1, mayLoad = 1, Predicates = [HasS32C1I] in { + def S32C1I : RRI8_Inst<0x02, (outs AR:$a), (ins AR:$t, mem32:$addr), + "s32c1i\t$t, $addr", []> { + bits<12> addr; + + let r = 0x0e; + let Uses = [SCOMPARE1]; + let Constraints = "$a = $t"; + let imm8{7-0} = addr{11-4}; + let s{3-0} = addr{3-0}; + } +} + +//===----------------------------------------------------------------------===// +// Atomic patterns +//===----------------------------------------------------------------------===// + +// Atomic load/store are available under both +s32c1i and +force-atomics. 
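To make the new Xtensa atomic lowering concrete: shouldExpandAtomicRMWInIR() returns CmpXChg, so the AtomicExpand pass rewrites every 32-bit atomic RMW into a compare-exchange loop, and each compare-exchange is then emitted as a wsr to SCOMPARE1 followed by s32c1i. The standalone model below (plain C++, single-threaded, helper names invented) illustrates the intended semantics rather than actual compiler output: s32c1i stores only if memory still holds the expected value and always returns the observed value.

```cpp
#include <cassert>
#include <cstdint>

// Software model of s32c1i with SCOMPARE1 already holding Expected.
static uint32_t s32c1i_model(uint32_t *P, uint32_t Expected, uint32_t Swap) {
  uint32_t Observed = *P;
  if (Observed == Expected)
    *P = Swap;
  return Observed; // the old memory value comes back in the register
}

// What an expanded atomic fetch-add loop boils down to.
static uint32_t atomic_fetch_add_model(uint32_t *P, uint32_t V) {
  uint32_t Old = *P;
  for (;;) {
    uint32_t Observed = s32c1i_model(P, Old, Old + V);
    if (Observed == Old)
      return Old;   // the exchange succeeded
    Old = Observed; // another writer won the race; retry with the new value
  }
}

int main() {
  uint32_t X = 40;
  assert(atomic_fetch_add_model(&X, 2) == 40 && X == 42);
  return 0;
}
```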
+// Fences will be inserted for atomic load/stores according to the logic in +// XtensaTargetLowering. +let Predicates = [HasAtomicLdSt] in { + def : Pat<(i32 (atomic_load_8 addr_ish1:$addr)), (L8UI addr_ish1:$addr)>; + def : Pat<(i32 (atomic_load_16 addr_ish2:$addr)), (L16UI addr_ish2:$addr)>; + def : Pat<(i32 (atomic_load_32 addr_ish4:$addr)), (L32I addr_ish4:$addr)>; + + def : Pat<(atomic_store_8 AR:$t, addr_ish1:$addr), (S8I AR:$t, addr_ish1:$addr)>; + def : Pat<(atomic_store_16 AR:$t, addr_ish2:$addr), (S16I AR:$t, addr_ish2:$addr)>; + def : Pat<(atomic_store_32 AR:$t, addr_ish4:$addr), (S32I AR:$t, addr_ish4:$addr)>; +} + +let usesCustomInserter = 1, Predicates = [HasS32C1I] in { + def ATOMIC_CMP_SWAP_32_P : Pseudo<(outs AR:$dst), (ins AR:$ptr, AR:$cmp, AR:$swap), + "!atomic_cmp_swap_32_p, $dst, $ptr, $cmp, $swap", + [(set AR:$dst, (atomic_cmp_swap_i32 AR:$ptr, AR:$cmp, AR:$swap))]>; +} + +//===----------------------------------------------------------------------===// // DSP Instructions //===----------------------------------------------------------------------===// include "XtensaDSPInstrInfo.td" diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td index 596c410..d1f2c6b 100644 --- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td @@ -84,6 +84,9 @@ def SAR : SRReg<3, "sar", ["SAR","3"]>; // Boolean Register def BREG : SRReg<4, "br", ["BR","4"]>; +// Expected data value for S32C1I operation +def SCOMPARE1 : SRReg<12, "scompare1", ["SCOMPARE1", "12"]>; + // Literal base def LITBASE : SRReg<5, "litbase", ["LITBASE", "5"]>; @@ -97,6 +100,9 @@ def IBREAKENABLE : SRReg<96, "ibreakenable", ["IBREAKENABLE", "96"]>; // Memory Control Register def MEMCTL : SRReg<97, "memctl", ["MEMCTL", "97"]>; +// Atomic Operation Control +def ATOMCTL : SRReg<99, "atomctl", ["ATOMCTL", "99"]>; + def DDR : SRReg<104, "ddr", ["DDR", "104"]>; // Instuction break address register 0 @@ -218,8 +224,8 @@ def MR23 : RegisterClass<"Xtensa", [i32], 32, (add M2, M3)>; def MR : RegisterClass<"Xtensa", [i32], 32, (add MR01, MR23)>; def SR : RegisterClass<"Xtensa", [i32], 32, (add - LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR, - WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, DDR, IBREAKA0, IBREAKA1, + LBEG, LEND, LCOUNT, SAR, BREG, SCOMPARE1, LITBASE, ACCLO, ACCHI, MR, + WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, ATOMCTL, DDR, IBREAKA0, IBREAKA1, DBREAKA0, DBREAKA1, DBREAKC0, DBREAKC1, CONFIGID0, EPC1, EPC2, EPC3, EPC4, EPC5, EPC6, EPC7, DEPC, EPS2, EPS3, EPS4, EPS5, EPS6, EPS7, CONFIGID1, EXCSAVE1, EXCSAVE2, EXCSAVE3, EXCSAVE4, EXCSAVE5, EXCSAVE6, EXCSAVE7, CPENABLE, INTERRUPT, INTSET, INTCLEAR, INTENABLE, diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h index fd677a4..b406534 100644 --- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h +++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h @@ -77,6 +77,8 @@ public: bool hasMul32() const { return HasMul32; } bool hasMul32High() const { return HasMul32High; } bool hasDiv32() const { return HasDiv32; } + bool hasS32C1I() const { return HasS32C1I; } + bool hasForcedAtomics() const { return HasForcedAtomics; } bool hasSingleFloat() const { return HasSingleFloat; } bool hasRegionProtection() const { return HasRegionProtection; } bool hasRelocatableVector() const { return HasRelocatableVector; } diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp index 8d2dca6..c9f1ca8 
100644 --- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp +++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp @@ -107,6 +107,7 @@ public: } bool addInstSelector() override; + void addIRPasses() override; void addPreEmitPass() override; }; } // end anonymous namespace @@ -116,6 +117,11 @@ bool XtensaPassConfig::addInstSelector() { return false; } +void XtensaPassConfig::addIRPasses() { + addPass(createAtomicExpandLegacyPass()); + TargetPassConfig::addIRPasses(); +} + void XtensaPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); } TargetPassConfig *XtensaTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 7e09d30..22192e1f 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "llvm/TargetParser/Host.h" +#include "llvm/ADT/Bitfields.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -167,35 +170,10 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) { .Default(generic); } -StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { - // The cpuid register on arm is not accessible from user space. On Linux, - // it is exposed through the /proc/cpuinfo file. - - // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line - // in all cases. - SmallVector<StringRef, 32> Lines; - ProcCpuinfoContent.split(Lines, '\n'); - - // Look for the CPU implementer and hardware lines, and store the CPU part - // numbers found. - StringRef Implementer; - StringRef Hardware; - SmallVector<StringRef, 32> Parts; - for (StringRef Line : Lines) { - if (Line.consume_front("CPU implementer")) - Implementer = Line.ltrim("\t :"); - else if (Line.consume_front("Hardware")) - Hardware = Line.ltrim("\t :"); - else if (Line.consume_front("CPU part")) - Parts.emplace_back(Line.ltrim("\t :")); - } - - // Last `Part' seen, in case we don't analyse all `Parts' parsed. - StringRef Part = Parts.empty() ? StringRef() : Parts.back(); - - // Remove duplicate `Parts'. - llvm::sort(Parts); - Parts.erase(llvm::unique(Parts), Parts.end()); +StringRef +getHostCPUNameForARMFromComponents(StringRef Implementer, StringRef Hardware, + StringRef Part, ArrayRef<StringRef> Parts, + function_ref<unsigned()> GetVariant) { auto MatchBigLittle = [](auto const &Parts, StringRef Big, StringRef Little) { if (Parts.size() == 2) @@ -343,21 +321,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { if (Implementer == "0x53") { // Samsung Electronics Co., Ltd. // The Exynos chips have a convoluted ID scheme that doesn't seem to follow // any predictive pattern across variants and parts. - unsigned Variant = 0, Part = 0; // Look for the CPU variant line, whose value is a 1 digit hexadecimal // number, corresponding to the Variant bits in the CP15/C0 register. - for (auto I : Lines) - if (I.consume_front("CPU variant")) - I.ltrim("\t :").getAsInteger(0, Variant); + unsigned Variant = GetVariant(); - // Look for the CPU part line, whose value is a 3 digit hexadecimal - // number, corresponding to the PartNum bits in the CP15/C0 register. 
- for (auto I : Lines) - if (I.consume_front("CPU part")) - I.ltrim("\t :").getAsInteger(0, Part); + // Convert the CPU part line, whose value is a 3 digit hexadecimal number, + // corresponding to the PartNum bits in the CP15/C0 register. + unsigned PartAsInt; + Part.getAsInteger(0, PartAsInt); - unsigned Exynos = (Variant << 12) | Part; + unsigned Exynos = (Variant << 12) | PartAsInt; switch (Exynos) { default: // Default by falling through to Exynos M3. @@ -416,6 +390,78 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { return "generic"; } +StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { + // The cpuid register on arm is not accessible from user space. On Linux, + // it is exposed through the /proc/cpuinfo file. + + // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line + // in all cases. + SmallVector<StringRef, 32> Lines; + ProcCpuinfoContent.split(Lines, '\n'); + + // Look for the CPU implementer and hardware lines, and store the CPU part + // numbers found. + StringRef Implementer; + StringRef Hardware; + SmallVector<StringRef, 32> Parts; + for (StringRef Line : Lines) { + if (Line.consume_front("CPU implementer")) + Implementer = Line.ltrim("\t :"); + else if (Line.consume_front("Hardware")) + Hardware = Line.ltrim("\t :"); + else if (Line.consume_front("CPU part")) + Parts.emplace_back(Line.ltrim("\t :")); + } + + // Last `Part' seen, in case we don't analyse all `Parts' parsed. + StringRef Part = Parts.empty() ? StringRef() : Parts.back(); + + // Remove duplicate `Parts'. + llvm::sort(Parts); + Parts.erase(llvm::unique(Parts), Parts.end()); + + auto GetVariant = [&]() { + unsigned Variant = 0; + for (auto I : Lines) + if (I.consume_front("CPU variant")) + I.ltrim("\t :").getAsInteger(0, Variant); + return Variant; + }; + + return getHostCPUNameForARMFromComponents(Implementer, Hardware, Part, Parts, + GetVariant); +} + +StringRef sys::detail::getHostCPUNameForARM(uint64_t PrimaryCpuInfo, + ArrayRef<uint64_t> UniqueCpuInfos) { + // On Windows, the registry provides cached copied of the MIDR_EL1 register. + using PartNum = Bitfield::Element<uint16_t, 4, 12>; + using Implementer = Bitfield::Element<uint16_t, 24, 8>; + using Variant = Bitfield::Element<uint16_t, 20, 4>; + + SmallVector<std::string> PartsHolder; + PartsHolder.reserve(UniqueCpuInfos.size()); + for (auto Info : UniqueCpuInfos) + PartsHolder.push_back("0x" + utohexstr(Bitfield::get<PartNum>(Info), + /*LowerCase*/ true, + /*Width*/ 3)); + + SmallVector<StringRef> Parts; + Parts.reserve(PartsHolder.size()); + for (const auto &Part : PartsHolder) + Parts.push_back(Part); + + return getHostCPUNameForARMFromComponents( + "0x" + utohexstr(Bitfield::get<Implementer>(PrimaryCpuInfo), + /*LowerCase*/ true, + /*Width*/ 2), + /*Hardware*/ "", + "0x" + utohexstr(Bitfield::get<PartNum>(PrimaryCpuInfo), + /*LowerCase*/ true, + /*Width*/ 3), + Parts, [=]() { return Bitfield::get<Variant>(PrimaryCpuInfo); }); +} + namespace { StringRef getCPUNameFromS390Model(unsigned int Id, bool HaveVectorSupport) { switch (Id) { @@ -1450,6 +1496,75 @@ StringRef sys::getHostCPUName() { return "generic"; } +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +StringRef sys::getHostCPUName() { + constexpr char CentralProcessorKeyName[] = + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor"; + // Sub keys names are simple numbers ("0", "1", etc.) so 10 chars should be + // enough for the slash and name. 
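The new getHostCPUNameForARM overload above pulls the implementer, variant, and part number out of cached MIDR_EL1 values with Bitfield::Element. As a sanity check on that layout, here is a standalone sketch (helper names and the example value are illustrative, not code from the patch): the implementer is bits [31:24], the variant bits [23:20], and the part number bits [15:4].

```cpp
#include <cassert>
#include <cstdint>

static unsigned midrImplementer(uint64_t Midr) { return (Midr >> 24) & 0xff; }
static unsigned midrVariant(uint64_t Midr) { return (Midr >> 20) & 0xf; }
static unsigned midrPartNum(uint64_t Midr) { return (Midr >> 4) & 0xfff; }

int main() {
  // Example encoding: implementer 0x41 (Arm), variant 0x1, part 0xd0c (Neoverse N1).
  uint64_t Midr = (0x41ull << 24) | (0x1ull << 20) | (0xd0cull << 4);
  assert(midrImplementer(Midr) == 0x41);
  assert(midrVariant(Midr) == 0x1);
  assert(midrPartNum(Midr) == 0xd0c);
  return 0;
}
```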
+ constexpr size_t SubKeyNameMaxSize = ARRAYSIZE(CentralProcessorKeyName) + 10; + + SmallVector<uint64_t> Values; + uint64_t PrimaryCpuInfo; + char PrimaryPartKeyName[SubKeyNameMaxSize]; + DWORD PrimaryPartKeyNameSize = 0; + HKEY CentralProcessorKey; + if (RegOpenKeyExA(HKEY_LOCAL_MACHINE, CentralProcessorKeyName, 0, KEY_READ, + &CentralProcessorKey) == ERROR_SUCCESS) { + for (unsigned Index = 0; Index < UINT32_MAX; ++Index) { + char SubKeyName[SubKeyNameMaxSize]; + DWORD SubKeySize = SubKeyNameMaxSize; + HKEY SubKey; + if ((RegEnumKeyExA(CentralProcessorKey, Index, SubKeyName, &SubKeySize, + nullptr, nullptr, nullptr, + nullptr) == ERROR_SUCCESS) && + (RegOpenKeyExA(CentralProcessorKey, SubKeyName, 0, KEY_READ, + &SubKey) == ERROR_SUCCESS)) { + // The "CP 4000" registry key contains a cached copy of the MIDR_EL1 + // register. + uint64_t RegValue; + DWORD ActualType; + DWORD RegValueSize = sizeof(RegValue); + if ((RegQueryValueExA(SubKey, "CP 4000", nullptr, &ActualType, + (PBYTE)&RegValue, + &RegValueSize) == ERROR_SUCCESS) && + (ActualType == REG_QWORD) && RegValueSize == sizeof(RegValue)) { + // Assume that the part with the "highest" reg key name is the primary + // part (to match the way that Linux's cpuinfo is written). Win32 + // makes no guarantees about the order of sub keys, so we have to + // compare the names. + if (PrimaryPartKeyNameSize < SubKeySize || + (PrimaryPartKeyNameSize == SubKeySize && + ::memcmp(SubKeyName, PrimaryPartKeyName, SubKeySize) > 0)) { + PrimaryCpuInfo = RegValue; + ::memcpy(PrimaryPartKeyName, SubKeyName, SubKeySize + 1); + PrimaryPartKeyNameSize = SubKeySize; + } + if (!llvm::is_contained(Values, RegValue)) { + Values.push_back(RegValue); + } + } + RegCloseKey(SubKey); + } else { + // No more sub keys. + break; + } + } + RegCloseKey(CentralProcessorKey); + } + + if (Values.empty()) { + return "generic"; + } + + // Win32 makes no guarantees about the order of sub keys, so sort to ensure + // reproducibility. 
+ llvm::sort(Values); + + return detail::getHostCPUNameForARM(PrimaryCpuInfo, Values); +} + #elif defined(__APPLE__) && defined(__powerpc__) StringRef sys::getHostCPUName() { host_basic_info_data_t hostInfo; diff --git a/llvm/lib/TextAPI/Architecture.cpp b/llvm/lib/TextAPI/Architecture.cpp index 51ca91d..3b53067 100644 --- a/llvm/lib/TextAPI/Architecture.cpp +++ b/llvm/lib/TextAPI/Architecture.cpp @@ -21,7 +21,7 @@ namespace llvm { namespace MachO { Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) { -#define ARCHINFO(Arch, Type, Subtype, NumBits) \ +#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) \ if (CPUType == (Type) && \ (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) == (Subtype)) \ return AK_##Arch; @@ -33,7 +33,7 @@ Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) { Architecture getArchitectureFromName(StringRef Name) { return StringSwitch<Architecture>(Name) -#define ARCHINFO(Arch, Type, Subtype, NumBits) .Case(#Arch, AK_##Arch) +#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) .Case(#Name, AK_##Arch) #include "llvm/TextAPI/Architecture.def" #undef ARCHINFO .Default(AK_unknown); @@ -41,9 +41,9 @@ Architecture getArchitectureFromName(StringRef Name) { StringRef getArchitectureName(Architecture Arch) { switch (Arch) { -#define ARCHINFO(Arch, Type, Subtype, NumBits) \ +#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) \ case AK_##Arch: \ - return #Arch; + return #Name; #include "llvm/TextAPI/Architecture.def" #undef ARCHINFO case AK_unknown: @@ -57,7 +57,7 @@ StringRef getArchitectureName(Architecture Arch) { std::pair<uint32_t, uint32_t> getCPUTypeFromArchitecture(Architecture Arch) { switch (Arch) { -#define ARCHINFO(Arch, Type, Subtype, NumBits) \ +#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) \ case AK_##Arch: \ return std::make_pair(Type, Subtype); #include "llvm/TextAPI/Architecture.def" @@ -77,7 +77,7 @@ Architecture mapToArchitecture(const Triple &Target) { bool is64Bit(Architecture Arch) { switch (Arch) { -#define ARCHINFO(Arch, Type, Subtype, NumBits) \ +#define ARCHINFO(Arch, Name, Type, Subtype, NumBits) \ case AK_##Arch: \ return NumBits == 64; #include "llvm/TextAPI/Architecture.def" diff --git a/llvm/lib/TextAPI/TextStubCommon.cpp b/llvm/lib/TextAPI/TextStubCommon.cpp index 0b710b0..7bf1f9a 100644 --- a/llvm/lib/TextAPI/TextStubCommon.cpp +++ b/llvm/lib/TextAPI/TextStubCommon.cpp @@ -133,7 +133,7 @@ QuotingType ScalarTraits<PlatformSet>::mustQuote(StringRef) { void ScalarBitSetTraits<ArchitectureSet>::bitset(IO &IO, ArchitectureSet &Archs) { -#define ARCHINFO(arch, type, subtype, numbits) \ +#define ARCHINFO(arch, name, type, subtype, numbits) \ IO.bitSetCase(Archs, #arch, 1U << static_cast<int>(AK_##arch)); #include "llvm/TextAPI/Architecture.def" #undef ARCHINFO diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 5485998..0d48a35 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -80,6 +80,10 @@ static cl::opt<bool> ClCompoundReadBeforeWrite( "tsan-compound-read-before-write", cl::init(false), cl::desc("Emit special compound instrumentation for reads-before-writes"), cl::Hidden); +static cl::opt<bool> + ClOmitNonCaptured("tsan-omit-by-pointer-capturing", cl::init(true), + cl::desc("Omit accesses due to pointer capturing"), + cl::Hidden); STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, 
"Number of instrumented writes"); @@ -450,7 +454,8 @@ void ThreadSanitizer::chooseInstructionsToInstrument( const AllocaInst *AI = findAllocaForValue(Addr); // Instead of Addr, we should check whether its base pointer is captured. - if (AI && !PointerMayBeCaptured(AI, /*ReturnCaptures=*/true)) { + if (AI && !PointerMayBeCaptured(AI, /*ReturnCaptures=*/true) && + ClOmitNonCaptured) { // The variable is addressable but not captured, so it cannot be // referenced from a different thread and participate in a data race // (see llvm/Analysis/CaptureTracking.h for details). diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index ed3dca2..59a47a9 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2361,15 +2361,13 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // Updating the contextual profile after an inlining means, at a high level, // copying over the data of the callee, **intentionally without any value // scaling**, and copying over the callees of the inlined callee. -llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - PGOContextualProfile &CtxProf, - bool MergeAttributes, - AAResults *CalleeAAR, - bool InsertLifetime, - Function *ForwardVarArgsTo) { +llvm::InlineResult llvm::InlineFunction( + CallBase &CB, InlineFunctionInfo &IFI, PGOContextualProfile &CtxProf, + bool MergeAttributes, AAResults *CalleeAAR, bool InsertLifetime, + Function *ForwardVarArgsTo, OptimizationRemarkEmitter *ORE) { if (!CtxProf.isInSpecializedModule()) return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, - ForwardVarArgsTo); + ForwardVarArgsTo, ORE); auto &Caller = *CB.getCaller(); auto &Callee = *CB.getCalledFunction(); @@ -2387,7 +2385,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, const auto NumCalleeCallsites = CtxProf.getNumCallsites(Callee); auto Ret = InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, - ForwardVarArgsTo); + ForwardVarArgsTo, ORE); if (!Ret.isSuccess()) return Ret; @@ -2457,20 +2455,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, return Ret; } -/// This function inlines the called function into the basic block of the -/// caller. This returns false if it is not possible to inline this call. -/// The program is still in a well defined state if this occurs though. -/// -/// Note that this only does one level of inlining. For example, if the -/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now -/// exists in the instruction stream. Similarly this will inline a recursive -/// function by one level. -llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, - bool MergeAttributes, - AAResults *CalleeAAR, - bool InsertLifetime, - Function *ForwardVarArgsTo, - OptimizationRemarkEmitter *ORE) { +llvm::InlineResult llvm::CanInlineCallSite(const CallBase &CB, + InlineFunctionInfo &IFI) { assert(CB.getParent() && CB.getFunction() && "Instruction not in function!"); // FIXME: we don't inline callbr yet. @@ -2487,7 +2473,6 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // The inliner does not know how to inline through calls with operand bundles // in general ... 
- Value *ConvergenceControlToken = nullptr; if (CB.hasOperandBundles()) { for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) { auto OBUse = CB.getOperandBundleAt(i); @@ -2503,7 +2488,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (Tag == LLVMContext::OB_kcfi) continue; if (Tag == LLVMContext::OB_convergencectrl) { - ConvergenceControlToken = OBUse.Inputs[0].get(); + IFI.ConvergenceControlToken = OBUse.Inputs[0].get(); continue; } @@ -2521,28 +2506,22 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // fully implements convergence control tokens, there is no mixing of // controlled and uncontrolled convergent operations in the whole program. if (CB.isConvergent()) { - if (!ConvergenceControlToken && + if (!IFI.ConvergenceControlToken && getConvergenceEntry(CalledFunc->getEntryBlock())) { return InlineResult::failure( "convergent call needs convergencectrl operand"); } } - // If the call to the callee cannot throw, set the 'nounwind' flag on any - // calls that we inline. - bool MarkNoUnwind = CB.doesNotThrow(); - - BasicBlock *OrigBB = CB.getParent(); - Function *Caller = OrigBB->getParent(); + const BasicBlock *OrigBB = CB.getParent(); + const Function *Caller = OrigBB->getParent(); // GC poses two hazards to inlining, which only occur when the callee has GC: // 1. If the caller has no GC, then the callee's GC must be propagated to the // caller. // 2. If the caller has a differing GC, it is invalid to inline. if (CalledFunc->hasGC()) { - if (!Caller->hasGC()) - Caller->setGC(CalledFunc->getGC()); - else if (CalledFunc->getGC() != Caller->getGC()) + if (Caller->hasGC() && CalledFunc->getGC() != Caller->getGC()) return InlineResult::failure("incompatible GC"); } @@ -2560,34 +2539,31 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, ? Caller->getPersonalityFn()->stripPointerCasts() : nullptr; if (CalledPersonality) { - if (!CallerPersonality) - Caller->setPersonalityFn(CalledPersonality); // If the personality functions match, then we can perform the // inlining. Otherwise, we can't inline. // TODO: This isn't 100% true. Some personality functions are proper // supersets of others and can be used in place of the other. - else if (CalledPersonality != CallerPersonality) + if (CallerPersonality && CalledPersonality != CallerPersonality) return InlineResult::failure("incompatible personality"); } // We need to figure out which funclet the callsite was in so that we may // properly nest the callee. - Instruction *CallSiteEHPad = nullptr; if (CallerPersonality) { EHPersonality Personality = classifyEHPersonality(CallerPersonality); if (isScopedEHPersonality(Personality)) { std::optional<OperandBundleUse> ParentFunclet = CB.getOperandBundle(LLVMContext::OB_funclet); if (ParentFunclet) - CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + IFI.CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); // OK, the inlining site is legal. What about the target function? - if (CallSiteEHPad) { + if (IFI.CallSiteEHPad) { if (Personality == EHPersonality::MSVC_CXX) { // The MSVC personality cannot tolerate catches getting inlined into // cleanup funclets. - if (isa<CleanupPadInst>(CallSiteEHPad)) { + if (isa<CleanupPadInst>(IFI.CallSiteEHPad)) { // Ok, the call site is within a cleanuppad. Let's check the callee // for catchpads. 
for (const BasicBlock &CalledBB : *CalledFunc) { @@ -2607,13 +2583,34 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } } + return InlineResult::success(); +} + +/// This function inlines the called function into the basic block of the +/// caller. This returns false if it is not possible to inline this call. +/// The program is still in a well defined state if this occurs though. +/// +/// Note that this only does one level of inlining. For example, if the +/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now +/// exists in the instruction stream. Similarly this will inline a recursive +/// function by one level. +void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI, + bool MergeAttributes, AAResults *CalleeAAR, + bool InsertLifetime, Function *ForwardVarArgsTo, + OptimizationRemarkEmitter *ORE) { + BasicBlock *OrigBB = CB.getParent(); + Function *Caller = OrigBB->getParent(); + Function *CalledFunc = CB.getCalledFunction(); + assert(CalledFunc && !CalledFunc->isDeclaration() && + "CanInlineCallSite should have verified direct call to definition"); + // Determine if we are dealing with a call in an EHPad which does not unwind // to caller. bool EHPadForCallUnwindsLocally = false; - if (CallSiteEHPad && isa<CallInst>(CB)) { + if (IFI.CallSiteEHPad && isa<CallInst>(CB)) { UnwindDestMemoTy FuncletUnwindMap; Value *CallSiteUnwindDestToken = - getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap); + getUnwindDestToken(IFI.CallSiteEHPad, FuncletUnwindMap); EHPadForCallUnwindsLocally = CallSiteUnwindDestToken && @@ -2630,6 +2627,30 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, ClonedCodeInfo InlinedFunctionInfo; Function::iterator FirstNewBlock; + // GC poses two hazards to inlining, which only occur when the callee has GC: + // 1. If the caller has no GC, then the callee's GC must be propagated to the + // caller. + // 2. If the caller has a differing GC, it is invalid to inline. + if (CalledFunc->hasGC()) { + if (!Caller->hasGC()) + Caller->setGC(CalledFunc->getGC()); + else { + assert(CalledFunc->getGC() == Caller->getGC() && + "CanInlineCallSite should have verified compatible GCs"); + } + } + + if (CalledFunc->hasPersonalityFn()) { + Constant *CalledPersonality = + CalledFunc->getPersonalityFn()->stripPointerCasts(); + if (!Caller->hasPersonalityFn()) { + Caller->setPersonalityFn(CalledPersonality); + } else + assert(Caller->getPersonalityFn()->stripPointerCasts() == + CalledPersonality && + "CanInlineCallSite should have verified compatible personality"); + } + { // Scope to destroy VMap after cloning. ValueToValueMapTy VMap; struct ByValInit { @@ -2819,10 +2840,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, IFI.GetAssumptionCache(*Caller).registerAssumption(II); } - if (ConvergenceControlToken) { + if (IFI.ConvergenceControlToken) { IntrinsicInst *IntrinsicCall = getConvergenceEntry(*FirstNewBlock); if (IntrinsicCall) { - IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken); + IntrinsicCall->replaceAllUsesWith(IFI.ConvergenceControlToken); IntrinsicCall->eraseFromParent(); } } @@ -2869,6 +2890,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } } + // If the call to the callee cannot throw, set the 'nounwind' flag on any + // calls that we inline. 
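The refactoring above splits the former InlineFunction body into a read-only legality check, CanInlineCallSite, which stashes the convergence token and the call-site EH pad in InlineFunctionInfo, and the actual transformation, InlineFunctionImpl, whose body continues below. A hedged sketch of how a caller can drive the two halves, assuming the new declarations live alongside InlineFunction in llvm/Transforms/Utils/Cloning.h; the helper name tryInlineOnce is invented:

#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

InlineResult tryInlineOnce(CallBase &CB, InlineFunctionInfo &IFI,
                           OptimizationRemarkEmitter *ORE) {
  // Phase 1: analysis only. On success the convergence token and call-site
  // EH pad (if any) are recorded in IFI; the IR is untouched.
  InlineResult Result = CanInlineCallSite(CB, IFI);
  if (!Result.isSuccess())
    return Result; // e.g. "incompatible GC", "incompatible personality"

  // Phase 2: the transformation. It asserts, rather than re-checks, the
  // conditions phase 1 established.
  InlineFunctionImpl(CB, IFI, /*MergeAttributes=*/true, /*CalleeAAR=*/nullptr,
                     /*InsertLifetime=*/true, /*ForwardVarArgsTo=*/nullptr, ORE);
  return Result;
}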
+ bool MarkNoUnwind = CB.doesNotThrow(); + SmallVector<Value*,4> VarArgsToForward; SmallVector<AttributeSet, 4> VarArgsAttrs; for (unsigned i = CalledFunc->getFunctionType()->getNumParams(); @@ -3055,12 +3080,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Update the lexical scopes of the new funclets and callsites. // Anything that had 'none' as its parent is now nested inside the callsite's // EHPad. - if (CallSiteEHPad) { + if (IFI.CallSiteEHPad) { for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { // Add bundle operands to inlined call sites. - PropagateOperandBundles(BB, CallSiteEHPad); + PropagateOperandBundles(BB, IFI.CallSiteEHPad); // It is problematic if the inlinee has a cleanupret which unwinds to // caller and we inline it into a call site which doesn't unwind but into @@ -3076,11 +3101,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) - CatchSwitch->setParentPad(CallSiteEHPad); + CatchSwitch->setParentPad(IFI.CallSiteEHPad); } else { auto *FPI = cast<FuncletPadInst>(I); if (isa<ConstantTokenNone>(FPI->getParentPad())) - FPI->setParentPad(CallSiteEHPad); + FPI->setParentPad(IFI.CallSiteEHPad); } } } @@ -3236,7 +3261,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc); // We are now done with the inlining. - return InlineResult::success(); + return; } // Otherwise, we have the normal case, of more than one block to inline or @@ -3404,6 +3429,19 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (MergeAttributes) AttributeFuncs::mergeAttributesForInlining(*Caller, *CalledFunc); +} - return InlineResult::success(); +llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, + bool MergeAttributes, + AAResults *CalleeAAR, + bool InsertLifetime, + Function *ForwardVarArgsTo, + OptimizationRemarkEmitter *ORE) { + llvm::InlineResult Result = CanInlineCallSite(CB, IFI); + if (Result.isSuccess()) { + InlineFunctionImpl(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, + ForwardVarArgsTo, ORE); + } + + return Result; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a52aa84..be00fd6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7530,9 +7530,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck = emitIterationCountCheck(LoopScalarPreHeader, false); - // Generate the induction variable. 
- EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); return LoopVectorPreHeader; } @@ -8301,7 +8298,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, VPRecipeBase *Recipe; Instruction *Instr = R->getUnderlyingInstr(); SmallVector<VPValue *, 4> Operands(R->operands()); - if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) { + if (auto *PhiR = dyn_cast<VPPhi>(R)) { VPBasicBlock *Parent = PhiR->getParent(); [[maybe_unused]] VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion(); @@ -8339,6 +8336,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, PhiRecipe->addOperand(Operands[1]); return PhiRecipe; } + assert(!R->isPhi() && "only VPPhi nodes expected at this point"); if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( cast<TruncInst>(Instr), Operands, Range))) @@ -8450,11 +8448,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, *Plan, CM.getMinimalBitwidths()); VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan); // TODO: try to put it close to addActiveLaneMask(). - // Discard the plan if it is not EVL-compatible - if (CM.foldTailWithEVL() && !HasScalarVF && - !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength, - *Plan, CM.getMaxSafeElements())) - break; + if (CM.foldTailWithEVL() && !HasScalarVF) + VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength, + *Plan, CM.getMaxSafeElements()); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } @@ -9809,7 +9805,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, - const EpilogueLoopVectorizationInfo &EPI) { + EpilogueLoopVectorizationInfo &EPI) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -9827,30 +9823,13 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // loop. using namespace llvm::PatternMatch; PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); - assert(EPResumeVal->getType() == IV->getScalarType() && - match(EPResumeVal->getIncomingValueForBlock( - EPI.MainLoopIterationCountCheck), - m_SpecificInt(0)) && - EPResumeVal == - find_singleton<PHINode>( - L->getLoopPreheader()->phis(), - [&EPI, IV](PHINode &P, bool) -> PHINode * { - if (P.getType() == IV->getScalarType() && - match(P.getIncomingValueForBlock( - EPI.MainLoopIterationCountCheck), - m_SpecificInt(0)) && - any_of(P.incoming_values(), - [&EPI](Value *Inc) { - return Inc == EPI.VectorTripCount; - }) && - all_of(P.incoming_values(), [&EPI](Value *Inc) { - return Inc == EPI.VectorTripCount || - match(Inc, m_SpecificInt(0)); - })) - return &P; - return nullptr; - }) && - "Epilogue resume phis do not match!"); + for (Value *Inc : EPResumeVal->incoming_values()) { + if (match(Inc, m_SpecificInt(0))) + continue; + assert(!EPI.VectorTripCount && + "Must only have a single non-zero incoming value"); + EPI.VectorTripCount = Inc; + } VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); assert(all_of(IV->users(), [](const VPUser *U) { @@ -10321,8 +10300,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { // TODO: Move to general VPlan pipeline once epilogue loops are also // supported. 
- VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, - BestPlan, VF.Width, IC, PSE); + VPlanTransforms::runPass( + VPlanTransforms::materializeConstantVectorTripCount, BestPlan, + VF.Width, IC, PSE); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); @@ -10393,8 +10373,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks, BestPlan); // TODO: Move to general VPlan pipeline once epilogue loops are also // supported. - VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, - BestPlan, VF.Width, IC, PSE); + VPlanTransforms::runPass( + VPlanTransforms::materializeConstantVectorTripCount, BestPlan, + VF.Width, IC, PSE); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 62ab3f52..5d0e2f9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15097,7 +15097,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, for (ExternalUser &EU : ExternalUses) { LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry " << EU.E.Idx << " in lane " << EU.Lane << "\n"); - LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n"); + LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n"; + else dbgs() << " User: nullptr\n"); LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n"); // Uses by ephemeral values are free (because the ephemeral value will be diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8052e31..73babcc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1054,12 +1054,17 @@ void VPlan::execute(VPTransformState *State) { InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { // For now only return the cost of the vector loop region, ignoring any other - // blocks, like the preheader or middle blocks. + // blocks, like the preheader or middle blocks, expect for checking them for + // recipes with invalid costs. InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx); - // If any instructions in the middle block are invalid return invalid. - // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created. - if (!getMiddleBlock()->cost(VF, Ctx).isValid()) + // If the cost of the loop region is invalid or any recipe in the skeleton + // outside loop regions are invalid return an invalid cost. 
+ if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(getEntry())), + [&VF, &Ctx](VPBasicBlock *VPBB) { + return !VPBB->cost(VF, Ctx).isValid(); + })) return InstructionCost::getInvalid(); return Cost; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 46dd114..c42cdd5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1242,12 +1242,24 @@ struct LLVM_ABI_FOR_TEST VPPhi : public VPInstruction, public VPPhiAccessors { : VPInstruction(Instruction::PHI, Operands, DL, Name) {} static inline bool classof(const VPUser *U) { - auto *R = dyn_cast<VPInstruction>(U); - return R && R->getOpcode() == Instruction::PHI; + auto *VPI = dyn_cast<VPInstruction>(U); + return VPI && VPI->getOpcode() == Instruction::PHI; + } + + static inline bool classof(const VPValue *V) { + auto *VPI = dyn_cast<VPInstruction>(V); + return VPI && VPI->getOpcode() == Instruction::PHI; + } + + static inline bool classof(const VPSingleDefRecipe *SDR) { + auto *VPI = dyn_cast<VPInstruction>(SDR); + return VPI && VPI->getOpcode() == Instruction::PHI; } VPPhi *clone() override { - return new VPPhi(operands(), getDebugLoc(), getName()); + auto *PhiR = new VPPhi(operands(), getDebugLoc(), getName()); + PhiR->setUnderlyingValue(getUnderlyingValue()); + return PhiR; } void execute(VPTransformState &State) override; diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 1b91901..7e8eff31 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -91,17 +91,15 @@ void PlainCFGBuilder::fixHeaderPhis() { for (auto *Phi : PhisToFix) { assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode."); VPValue *VPVal = IRDef2VPValue[Phi]; - assert(isa<VPWidenPHIRecipe>(VPVal) && - "Expected WidenPHIRecipe for phi node."); - auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal); - assert(VPPhi->getNumOperands() == 0 && - "Expected VPInstruction with no operands."); + assert(isa<VPPhi>(VPVal) && "Expected VPPhi for phi node."); + auto *PhiR = cast<VPPhi>(VPVal); + assert(PhiR->getNumOperands() == 0 && "Expected VPPhi with no operands."); assert(isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent())) && "Expected Phi in header block."); assert(Phi->getNumOperands() == 2 && "header phi must have exactly 2 operands"); for (BasicBlock *Pred : predecessors(Phi->getParent())) - VPPhi->addOperand( + PhiR->addOperand( getOrCreateVPOperand(Phi->getIncomingValueForBlock(Pred))); } } @@ -204,11 +202,11 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, VPSingleDefRecipe *NewR; if (auto *Phi = dyn_cast<PHINode>(Inst)) { - // Phi node's operands may have not been visited at this point. We create + // Phi node's operands may not have been visited at this point. We create // an empty VPInstruction that we will fix once the whole plain CFG has // been built. - NewR = new VPWidenPHIRecipe(Phi, nullptr, Phi->getDebugLoc(), "vec.phi"); - VPBB->appendRecipe(NewR); + NewR = VPIRBuilder.createScalarPhi({}, Phi->getDebugLoc(), "vec.phi"); + NewR->setUnderlyingValue(Phi); if (isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent()))) { // Header phis need to be fixed after the VPBB for the latch has been // created. 
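On the extra VPPhi::classof overloads added in VPlan.h above: llvm::dyn_cast resolves To::classof against the static type of its argument, so every base a cast may start from (VPUser, VPValue, VPSingleDefRecipe) needs its own overload. A minimal invented model of the idiom, with two unrelated bases standing in for VPUser and VPValue:

#include "llvm/Support/Casting.h"

struct UserBase { unsigned Opcode = 0; };
struct ValueBase { unsigned SubclassID = 0; };

struct PhiLike : UserBase, ValueBase {
  PhiLike() { Opcode = 1; SubclassID = 1; }
  // One classof per base type a dyn_cast may start from.
  static bool classof(const UserBase *U) { return U->Opcode == 1; }
  static bool classof(const ValueBase *V) { return V->SubclassID == 1; }
};

void demo(UserBase *U, ValueBase *V) { // both assumed non-null
  // Each call resolves PhiLike::classof(const <static type> *); without the
  // matching overload the corresponding dyn_cast would not compile.
  if (auto *P = llvm::dyn_cast<PhiLike>(U)) { (void)P; }
  if (auto *P = llvm::dyn_cast<PhiLike>(V)) { (void)P; }
}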
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index d133610..8818843 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -461,6 +461,66 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1); } +/// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison +/// predicate. +template <typename Op0_t, typename Op1_t> struct ICmp_match { + CmpPredicate *Predicate = nullptr; + Op0_t Op0; + Op1_t Op1; + + ICmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) + : Predicate(&Pred), Op0(Op0), Op1(Op1) {} + ICmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {} + + bool match(const VPValue *V) const { + auto *DefR = V->getDefiningRecipe(); + return DefR && match(DefR); + } + + bool match(const VPRecipeBase *V) const { + if (m_Binary<Instruction::ICmp>(Op0, Op1).match(V)) { + if (Predicate) + *Predicate = cast<VPRecipeWithIRFlags>(V)->getPredicate(); + return true; + } + return false; + } +}; + +/// SpecificICmp_match is a variant of ICmp_match that matches the comparison +/// predicate, instead of binding it. +template <typename Op0_t, typename Op1_t> struct SpecificICmp_match { + const CmpPredicate Predicate; + Op0_t Op0; + Op1_t Op1; + + SpecificICmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS) + : Predicate(Pred), Op0(LHS), Op1(RHS) {} + + bool match(const VPValue *V) const { + CmpPredicate CurrentPred; + return ICmp_match<Op0_t, Op1_t>(CurrentPred, Op0, Op1).match(V) && + CmpPredicate::getMatching(CurrentPred, Predicate); + } +}; + +template <typename Op0_t, typename Op1_t> +inline ICmp_match<Op0_t, Op1_t> m_ICmp(const Op0_t &Op0, const Op1_t &Op1) { + return ICmp_match<Op0_t, Op1_t>(Op0, Op1); +} + +template <typename Op0_t, typename Op1_t> +inline ICmp_match<Op0_t, Op1_t> m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, + const Op1_t &Op1) { + return ICmp_match<Op0_t, Op1_t>(Pred, Op0, Op1); +} + +template <typename Op0_t, typename Op1_t> +inline SpecificICmp_match<Op0_t, Op1_t> +m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) { + return SpecificICmp_match<Op0_t, Op1_t>(MatchPred, Op0, Op1); +} + template <typename Op0_t, typename Op1_t> using GEPLikeRecipe_match = BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false, diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 3b3bbc3..862b930 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -227,10 +227,10 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { } void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { - SmallVector<VPWidenPHIRecipe *> Phis; + SmallVector<VPPhi *> Phis; for (VPRecipeBase &R : VPBB->phis()) - Phis.push_back(cast<VPWidenPHIRecipe>(&R)); - for (VPWidenPHIRecipe *PhiR : Phis) { + Phis.push_back(cast<VPPhi>(&R)); + for (VPPhi *PhiR : Phis) { // The non-header Phi is converted into a Blend recipe below, // so we don't have to worry about the insertion order and we can just use // the builder. At this point we generate the predication tree. 
There may diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a7965a0..1c8bd6c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -63,17 +63,20 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue()); VPRecipeBase *NewRecipe = nullptr; - if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { - auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); + if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) { + auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); const auto *II = GetIntOrFpInductionDescriptor(Phi); - if (!II) - continue; - - VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue()); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = new VPWidenIntOrFpInductionRecipe( - Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc()); + if (!II) { + NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc()); + for (VPValue *Op : PhiR->operands()) + NewRecipe->addOperand(Op); + } else { + VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue()); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); + NewRecipe = new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc()); + } } else { assert(isa<VPInstruction>(&Ingredient) && "only VPInstructions expected here"); @@ -1161,6 +1164,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { + if (Phi->getNumOperands() == 1) + Phi->replaceAllUsesWith(Phi->getOperand(0)); + return; + } + // Some simplifications can only be applied after unrolling. Perform them // below. if (!Plan->isUnrolled()) @@ -1382,11 +1391,10 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, // Currently only handle cases where the single user is a header-mask // comparison with the backedge-taken-count. - if (!match( - *WideIV->user_begin(), - m_Binary<Instruction::ICmp>( - m_Specific(WideIV), - m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) + if (!match(*WideIV->user_begin(), + m_ICmp(m_Specific(WideIV), + m_Broadcast( + m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) continue; // Update IV operands and comparison bound to use new narrower type. @@ -1419,11 +1427,9 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, }); auto *CanIV = Plan.getCanonicalIV(); - if (!match(Cond, m_Binary<Instruction::ICmp>( - m_Specific(CanIV->getBackedgeValue()), - m_Specific(&Plan.getVectorTripCount()))) || - cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() != - CmpInst::ICMP_EQ) + if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ, + m_Specific(CanIV->getBackedgeValue()), + m_Specific(&Plan.getVectorTripCount())))) return false; // The compare checks CanIV + VFxUF == vector trip count. The vector trip @@ -1832,7 +1838,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( VPW->dropPoisonGeneratingFlags(); if (OldResSizeInBits != NewResSizeInBits && - !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) { + !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) { // Extend result to original width. 
auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); @@ -1841,9 +1847,8 @@ void VPlanTransforms::truncateToMinimalBitwidths( Ext->setOperand(0, ResultVPV); assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); } else { - assert( - match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) && - "Only ICmps should not need extending the result."); + assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) && + "Only ICmps should not need extending the result."); } assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed"); @@ -2180,6 +2185,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { "User of VF that we can't transform to EVL."); Plan.getVF().replaceAllUsesWith(&EVL); + assert(all_of(Plan.getVFxUF().users(), + [&Plan](VPUser *U) { + return match(U, m_c_Binary<Instruction::Add>( + m_Specific(Plan.getCanonicalIV()), + m_Specific(&Plan.getVFxUF()))) || + isa<VPWidenPointerInductionRecipe>(U); + }) && + "Only users of VFxUF should be VPWidenPointerInductionRecipe and the " + "increment of the canonical induction."); + Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { + // Only replace uses in VPWidenPointerInductionRecipe; The increment of the + // canonical induction must not be updated. + return isa<VPWidenPointerInductionRecipe>(U); + }); + // Defer erasing recipes till the end so that we don't invalidate the // VPTypeAnalysis cache. SmallVector<VPRecipeBase *> ToErase; @@ -2315,16 +2335,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { /// %NextAVL = sub IVSize nuw %AVL, %OpEVL /// ... /// -bool VPlanTransforms::tryAddExplicitVectorLength( +void VPlanTransforms::addExplicitVectorLength( VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) { VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - // The transform updates all users of inductions to work based on EVL, instead - // of the VF directly. At the moment, widened pointer inductions cannot be - // updated, so bail out if the plan contains any. - bool ContainsWidenPointerInductions = - any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>); - if (ContainsWidenPointerInductions) - return false; auto *CanonicalIVPHI = Plan.getCanonicalIV(); auto *CanIVTy = CanonicalIVPHI->getScalarType(); @@ -2379,7 +2392,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength( CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); // TODO: support unroll factor > 1. Plan.setUF(1); - return true; } void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { @@ -2803,13 +2815,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, R->replaceAllUsesWith(PtrAdd); // Create the backedge value for the scalar pointer phi. 
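Stepping back to addExplicitVectorLength and transformRecipestoEVLRecipes above: an invented scalar model of the loop control the transform produces. It is a simplification in that the real per-iteration EVL comes from the explicit-vector-length intrinsic and is only guaranteed to be positive and at most min(AVL, VF). The pointer-induction expansion continues right after this sketch.

#include <algorithm>
#include <cstdint>

void evlLoopModel(uint64_t TripCount, uint64_t VF) {
  for (uint64_t AVL = TripCount; AVL != 0;) {
    uint64_t EVL = std::min(AVL, VF); // %OpEVL, capped by the remaining AVL
    // ... process lanes [TripCount - AVL, TripCount - AVL + EVL) ...
    AVL -= EVL; // %NextAVL = sub nuw %AVL, %OpEVL
  }
}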
- Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi()); + VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); + Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator()); VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF), DL); VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF}); - VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); - Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator()); VPValue *InductionGEP = Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind"); ScalarPtrPhi->addOperand(InductionGEP); @@ -3222,7 +3233,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } } -void VPlanTransforms::materializeVectorTripCount( +void VPlanTransforms::materializeConstantVectorTripCount( VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); @@ -3230,19 +3241,26 @@ void VPlanTransforms::materializeVectorTripCount( VPValue *TC = Plan.getTripCount(); // Skip cases for which the trip count may be non-trivial to materialize. + // I.e., when a scalar tail is absent - due to tail folding, or when a scalar + // tail is required. if (!Plan.hasScalarTail() || Plan.getMiddleBlock()->getSingleSuccessor() == Plan.getScalarPreheader() || !TC->isLiveIn()) return; + // Materialize vector trip counts for constants early if it can simply // be computed as (Original TC / VF * UF) * VF * UF. + // TODO: Compute vector trip counts for loops requiring a scalar epilogue and + // tail-folded loops. ScalarEvolution &SE = *PSE.getSE(); auto *TCScev = SE.getSCEV(TC->getLiveInIRValue()); + if (!isa<SCEVConstant>(TCScev)) + return; const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF); auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF); - if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev)) - Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue()); + if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev)) + Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue()); } void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 5943684..cc50c75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -177,10 +177,9 @@ struct VPlanTransforms { /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. /// VPCanonicalIVPHIRecipe is only used to control the loop after /// this transformation. - /// \returns true if the transformation succeeds, or false if it doesn't. - static bool - tryAddExplicitVectorLength(VPlan &Plan, - const std::optional<unsigned> &MaxEVLSafeElements); + static void + addExplicitVectorLength(VPlan &Plan, + const std::optional<unsigned> &MaxEVLSafeElements); // For each Interleave Group in \p InterleaveGroups replace the Recipes // widening its memory instructions with a single VPInterleaveRecipe at its @@ -252,9 +251,10 @@ struct VPlanTransforms { // Materialize vector trip counts for constants early if it can simply be // computed as (Original TC / VF * UF) * VF * UF. 
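A worked instance of the formula referenced above and implemented in materializeConstantVectorTripCount, reading it as (TC udiv (VF * UF)) * (VF * UF); the concrete numbers are illustrative only. The renamed declaration this comment documents follows below.

#include <cstdint>

constexpr uint64_t vectorTripCount(uint64_t TC, uint64_t VF, uint64_t UF) {
  return TC / (VF * UF) * (VF * UF); // unsigned division, as with SCEV's udiv
}

static_assert(vectorTripCount(1000, 4, 2) == 1000, "divides evenly");
static_assert(vectorTripCount(1003, 4, 2) == 1000, "3 iterations left for the scalar tail");
static_assert(vectorTripCount(7, 4, 2) == 0, "too short to enter the vector loop");

int main() { return 0; }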
- static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF, - unsigned BestUF, - PredicatedScalarEvolution &PSE); + static void + materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE); /// Materialize the backedge-taken count to be computed explicitly using /// VPInstructions. diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 14ae4f2..3417e1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe, - VPWidenIntOrFpInductionRecipe>( + VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case<VPScalarIVStepsRecipe>([&](auto *R) { if (R->getNumOperands() != 3) { |