Diffstat (limited to 'llvm/lib')
100 files changed, 1757 insertions, 1314 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b744537..45c889c 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -329,6 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, // Look through ptr->int and ptr->ptr casts. if (CE->getOpcode() == Instruction::PtrToInt || + CE->getOpcode() == Instruction::PtrToAddr || CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL, DSOEquiv); @@ -1495,22 +1496,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, default: llvm_unreachable("Missing case"); case Instruction::PtrToAddr: - // TODO: Add some of the ptrtoint folds here as well. - break; case Instruction::PtrToInt: if (auto *CE = dyn_cast<ConstantExpr>(C)) { Constant *FoldedValue = nullptr; - // If the input is a inttoptr, eliminate the pair. This requires knowing + // If the input is an inttoptr, eliminate the pair. This requires knowing // the width of a pointer, so it can't be done in ConstantExpr::getCast. if (CE->getOpcode() == Instruction::IntToPtr) { - // zext/trunc the inttoptr to pointer size. - FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), - DL.getIntPtrType(CE->getType()), + // zext/trunc the inttoptr to pointer/address size. + Type *MidTy = Opcode == Instruction::PtrToInt + ? DL.getAddressType(CE->getType()) + : DL.getIntPtrType(CE->getType()); + FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy, /*IsSigned=*/false, DL); } else if (auto *GEP = dyn_cast<GEPOperator>(CE)) { // If we have GEP, we can perform the following folds: - // (ptrtoint (gep null, x)) -> x - // (ptrtoint (gep (gep null, x), y) -> x + y, etc. + // (ptrtoint/ptrtoaddr (gep null, x)) -> x + // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc. unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); APInt BaseOffset(BitWidth, 0); auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets( @@ -1518,7 +1519,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); } else { - // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V)) + // -> sub (ptrtoint/ptrtoaddr Ptr), V if (GEP->getNumIndices() == 1 && GEP->getSourceElementType()->isIntegerTy(8)) { auto *Ptr = cast<Constant>(GEP->getPointerOperand()); @@ -1528,12 +1530,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, Sub->getOpcode() == Instruction::Sub && Sub->getOperand(0)->isNullValue()) FoldedValue = ConstantExpr::getSub( - ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + ConstantExpr::getCast(Opcode, Ptr, IntIdxTy), + Sub->getOperand(1)); } } } if (FoldedValue) { - // Do a zext or trunc to get to the ptrtoint dest size. + // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size. return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false, DL); } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 1f0da8d1..8d20b0e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -275,7 +275,7 @@ bool Dependence::isAnti() const { // if no subscript in the source or destination mention the induction // variable associated with the loop at this level. 
// Leave this out of line, so it will serve as a virtual method anchor -bool Dependence::isScalar(unsigned level, bool isSameSD) const { return false; } +bool Dependence::isScalar(unsigned level, bool IsSameSD) const { return false; } //===----------------------------------------------------------------------===// // FullDependence methods @@ -351,38 +351,38 @@ bool FullDependence::normalize(ScalarEvolution *SE) { // getDirection - Returns the direction associated with a particular common or // SameSD level. -unsigned FullDependence::getDirection(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Direction; +unsigned FullDependence::getDirection(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Direction; } // Returns the distance (or NULL) associated with a particular common or // SameSD level. -const SCEV *FullDependence::getDistance(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Distance; +const SCEV *FullDependence::getDistance(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Distance; } // Returns true if a particular regular or SameSD level is scalar; that is, // if no subscript in the source or destination mention the induction variable // associated with the loop at this level. -bool FullDependence::isScalar(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Scalar; +bool FullDependence::isScalar(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Scalar; } // Returns true if peeling the first iteration from this regular or SameSD // loop level will break this dependence. -bool FullDependence::isPeelFirst(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).PeelFirst; +bool FullDependence::isPeelFirst(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).PeelFirst; } // Returns true if peeling the last iteration from this regular or SameSD // loop level will break this dependence. -bool FullDependence::isPeelLast(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).PeelLast; +bool FullDependence::isPeelLast(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).PeelLast; } // Returns true if splitting loop will break the dependence. -bool FullDependence::isSplitable(unsigned Level, bool isSameSD) const { - return getDVEntry(Level, isSameSD).Splitable; +bool FullDependence::isSplitable(unsigned Level, bool IsSameSD) const { + return getDVEntry(Level, IsSameSD).Splitable; } // inSameSDLoops - Returns true if this level is an SameSD level, i.e., @@ -691,7 +691,7 @@ void Dependence::dump(raw_ostream &OS) const { dumpImp(OS); unsigned SameSDLevels = getSameSDLevels(); if (SameSDLevels > 0) { - OS << "! / assuming " << SameSDLevels << " loop level(s) fused: "; + OS << " / assuming " << SameSDLevels << " loop level(s) fused: "; dumpImp(OS, true); } } @@ -706,13 +706,13 @@ void Dependence::dump(raw_ostream &OS) const { // For debugging purposes. Dumps a dependence to OS with or without considering // the SameSD levels. 
-void Dependence::dumpImp(raw_ostream &OS, bool isSameSD) const { +void Dependence::dumpImp(raw_ostream &OS, bool IsSameSD) const { bool Splitable = false; unsigned Levels = getLevels(); unsigned SameSDLevels = getSameSDLevels(); bool OnSameSD = false; unsigned LevelNum = Levels; - if (isSameSD) + if (IsSameSD) LevelNum += SameSDLevels; OS << " ["; for (unsigned II = 1; II <= LevelNum; ++II) { diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 1794a60..85b5372 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const { // Embedder and its subclasses //===----------------------------------------------------------------------===// -Embedder::Embedder(const Function &F, const Vocabulary &Vocab) - : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()), - OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight), - FuncVector(Embedding(Dimension)) {} - std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab) { switch (Mode) { @@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, return nullptr; } -const InstEmbeddingsMap &Embedder::getInstVecMap() const { - if (InstVecMap.empty()) - computeEmbeddings(); - return InstVecMap; -} - -const BBEmbeddingsMap &Embedder::getBBVecMap() const { - if (BBVecMap.empty()) - computeEmbeddings(); - return BBVecMap; -} - -const Embedding &Embedder::getBBVector(const BasicBlock &BB) const { - auto It = BBVecMap.find(&BB); - if (It != BBVecMap.end()) - return It->second; - computeEmbeddings(BB); - return BBVecMap[&BB]; -} +Embedding Embedder::computeEmbeddings() const { + Embedding FuncVector(Dimension, 0.0); -const Embedding &Embedder::getFunctionVector() const { - // Currently, we always (re)compute the embeddings for the function. - // This is cheaper than caching the vector. - computeEmbeddings(); - return FuncVector; -} - -void Embedder::computeEmbeddings() const { if (F.isDeclaration()) - return; - - FuncVector = Embedding(Dimension, 0.0); + return FuncVector; // Consider only the basic blocks that are reachable from entry - for (const BasicBlock *BB : depth_first(&F)) { - computeEmbeddings(*BB); - FuncVector += BBVecMap[BB]; - } + for (const BasicBlock *BB : depth_first(&F)) + FuncVector += computeEmbeddings(*BB); + return FuncVector; } -void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { +Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const { Embedding BBVector(Dimension, 0); // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) - ArgEmb += Vocab[*Op]; - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; - } - BBVecMap[&BB] = BBVector; -} - -void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { - Embedding BBVector(Dimension, 0); + for (const auto &I : BB.instructionsWithoutDebug()) + BBVector += computeEmbeddings(I); + return BBVector; +} + +Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const { + // Currently, we always (re)compute the embeddings for symbolic embedder. + // This is cheaper than caching the vectors. 
+ Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) + ArgEmb += Vocab[*Op]; + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + return InstVector; +} + +Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const { + // If we have already computed the embedding for this instruction, return it + auto It = InstVecMap.find(&I); + if (It != InstVecMap.end()) + return It->second; - // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - // TODO: Handle call instructions differently. - // For now, we treat them like other instructions - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) { - // If the operand is defined elsewhere, we use its embedding - if (const auto *DefInst = dyn_cast<Instruction>(Op)) { - auto DefIt = InstVecMap.find(DefInst); - // Fixme (#159171): Ideally we should never miss an instruction - // embedding here. - // But when we have cyclic dependencies (e.g., phi - // nodes), we might miss the embedding. In such cases, we fall back to - // using the vocabulary embedding. This can be fixed by iterating to a - // fixed-point, or by using a simple solver for the set of simultaneous - // equations. - // Another case when we might miss an instruction embedding is when - // the operand instruction is in a different basic block that has not - // been processed yet. This can be fixed by processing the basic blocks - // in a topological order. - if (DefIt != InstVecMap.end()) - ArgEmb += DefIt->second; - else - ArgEmb += Vocab[*Op]; - } - // If the operand is not defined by an instruction, we use the vocabulary - else { - LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " - << *Op << "=" << Vocab[*Op][0] << "\n"); + // TODO: Handle call instructions differently. + // For now, we treat them like other instructions + Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) { + // If the operand is defined elsewhere, we use its embedding + if (const auto *DefInst = dyn_cast<Instruction>(Op)) { + auto DefIt = InstVecMap.find(DefInst); + // Fixme (#159171): Ideally we should never miss an instruction + // embedding here. + // But when we have cyclic dependencies (e.g., phi + // nodes), we might miss the embedding. In such cases, we fall back to + // using the vocabulary embedding. This can be fixed by iterating to a + // fixed-point, or by using a simple solver for the set of simultaneous + // equations. + // Another case when we might miss an instruction embedding is when + // the operand instruction is in a different basic block that has not + // been processed yet. This can be fixed by processing the basic blocks + // in a topological order. 
+ if (DefIt != InstVecMap.end()) + ArgEmb += DefIt->second; + else ArgEmb += Vocab[*Op]; - } } - // Create the instruction vector by combining opcode, type, and arguments - // embeddings - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - // Add compare predicate embedding as an additional operand if applicable - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; + // If the operand is not defined by an instruction, we use the + // vocabulary + else { + LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " + << *Op << "=" << Vocab[*Op][0] << "\n"); + ArgEmb += Vocab[*Op]; + } } - BBVecMap[&BB] = BBVector; + // Create the instruction vector by combining opcode, type, and arguments + // embeddings + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + InstVecMap[&I] = InstVector; + return InstVector; } // ==----------------------------------------------------------------------===// @@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, Emb->getFunctionVector().print(OS); OS << "Basic block vectors:\n"; - const auto &BBMap = Emb->getBBVecMap(); for (const BasicBlock &BB : F) { - auto It = BBMap.find(&BB); - if (It != BBMap.end()) { - OS << "Basic block: " << BB.getName() << ":\n"; - It->second.print(OS); - } + OS << "Basic block: " << BB.getName() << ":\n"; + Emb->getBBVector(BB).print(OS); } OS << "Instruction vectors:\n"; - const auto &InstMap = Emb->getInstVecMap(); for (const BasicBlock &BB : F) { for (const Instruction &I : BB) { - auto It = InstMap.find(&I); - if (It != InstMap.end()) { - OS << "Instruction: "; - I.print(OS); - It->second.print(OS); - } + OS << "Instruction: "; + I.print(OS); + Emb->getInstVector(I).print(OS); } } } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d1977f0..4e38626 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -671,12 +671,12 @@ Value *llvm::simplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, /// This is very similar to stripAndAccumulateConstantOffsets(), except it /// normalizes the offset bitwidth to the stripped pointer type, not the /// original pointer type. -static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, - bool AllowNonInbounds = false) { +static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V) { assert(V->getType()->isPtrOrPtrVectorTy()); APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType())); - V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds); + V = V->stripAndAccumulateConstantOffsets(DL, Offset, + /*AllowNonInbounds=*/true); // As that strip may trace through `addrspacecast`, need to sext or trunc // the offset calculated. return Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(V->getType())); @@ -853,10 +853,12 @@ static Value *simplifySubInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). 
- if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) + if (match(Op0, m_PtrToIntOrAddr(m_Value(X))) && + match(Op1, m_PtrToIntOrAddr(m_Value(Y)))) { if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantFoldIntegerCast(Result, Op0->getType(), /*IsSigned*/ true, Q.DL); + } // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 4c2e1fe..54f55b2 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) { auto *User = Worklist.pop_back_val(); if (!Visited.insert(User).second) continue; - if (isa<ICmpInst, PtrToIntInst>(User)) + // FIXME: The PtrToIntInst case here is not strictly correct, as it + // changes which provenance is exposed. + if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User)) continue; if (isa<PHINode, SelectInst>(User)) Worklist.append(User->user_begin(), User->user_end()); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 6f6776c..30bcff7 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( return RewriteMap.lookup_or(S, S); }; - // Check for the SCEV expression (A /u B) * B while B is a constant, inside - // \p Expr. The check is done recuresively on \p Expr, which is assumed to - // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A - // /u B) * B was found, and return the divisor B in \p DividesBy. For - // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since - // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p - // DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo = - [&](const SCEV *Expr, const SCEV *&DividesBy) { - if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) { - if (Mul->getNumOperands() != 2) - return false; - auto *MulLHS = Mul->getOperand(0); - auto *MulRHS = Mul->getOperand(1); - if (isa<SCEVConstant>(MulLHS)) - std::swap(MulLHS, MulRHS); - if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS)) - if (Div->getOperand(1) == MulRHS) { - DividesBy = MulRHS; - return true; - } - } - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) || - HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy); - return false; - }; - - // Return true if Expr known to divide by \p DividesBy. - std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy = - [&](const SCEV *Expr, const SCEV *DividesBy) { - if (SE.getURemExpr(Expr, DividesBy)->isZero()) - return true; - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) - return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) && - IsKnownToDivideBy(MinMax->getOperand(1), DividesBy); - return false; - }; - const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); const SCEV *DividesBy = nullptr; - if (HasDivisibiltyInfo(RewrittenLHS, DividesBy)) - // Check that the whole expression is divided by DividesBy - DividesBy = - IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr; + const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS); + if (!Multiple.isOne()) + DividesBy = SE.getConstant(Multiple); // Collect rewrites for LHS and its transitive operands based on the // condition. 
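A rough standalone sketch (not part of the patch) of the pointer-difference simplification touched in InstructionSimplify.cpp above; the function and value names here are illustrative, and it exercises the pre-existing ptrtoint form that the hunk generalizes to ptrtoaddr via m_PtrToIntOrAddr:
#include "llvm/AsmParser/Parser.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
int main() {
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  // Illustrative IR: the difference of the two casts is a known constant.
  std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(R"IR(
    define i64 @diff(ptr %p) {
      %q = getelementptr inbounds i8, ptr %p, i64 4
      %a = ptrtoint ptr %q to i64
      %b = ptrtoint ptr %p to i64
      %d = sub i64 %a, %b
      ret i64 %d
    }
  )IR", Err, Ctx);
  if (!M)
    return 1;
  llvm::Function &F = *M->getFunction("diff");
  llvm::SimplifyQuery Q(M->getDataLayout());
  for (llvm::Instruction &I : F.front())
    if (I.getName() == "d")
      // The sub of the two casts simplifies to the constant offset (i64 4).
      if (llvm::Value *V = llvm::simplifyInstruction(&I, Q))
        V->print(llvm::outs());
  return 0;
}
With the matcher change above, the same constant difference is expected to be recognized when the casts are ptrtoaddr rather than ptrtoint.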
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 6e92766..813632c 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -740,11 +740,6 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailable(LibFunc_fgets_unlocked); } - if (T.isAndroid() && T.isAndroidVersionLT(21)) { - TLI.setUnavailable(LibFunc_stpcpy); - TLI.setUnavailable(LibFunc_stpncpy); - } - if (T.isPS()) { // PS4/PS5 do have memalign. TLI.setAvailable(LibFunc_memalign); diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index f6937d3..50d1d47 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -982,6 +982,7 @@ lltok::Kind LLLexer::LexIdentifier() { DWKEYWORD(ATE, DwarfAttEncoding); DWKEYWORD(VIRTUALITY, DwarfVirtuality); DWKEYWORD(LANG, DwarfLang); + DWKEYWORD(LNAME, DwarfSourceLangName); DWKEYWORD(CC, DwarfCC); DWKEYWORD(OP, DwarfOp); DWKEYWORD(MACINFO, DwarfMacinfo); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5589966..380b192 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4740,6 +4740,10 @@ struct DwarfLangField : public MDUnsignedField { DwarfLangField() : MDUnsignedField(0, dwarf::DW_LANG_hi_user) {} }; +struct DwarfSourceLangNameField : public MDUnsignedField { + DwarfSourceLangNameField() : MDUnsignedField(0, UINT32_MAX) {} +}; + struct DwarfCCField : public MDUnsignedField { DwarfCCField() : MDUnsignedField(0, dwarf::DW_CC_hi_user) {} }; @@ -4998,6 +5002,25 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfLangField &Result) { } template <> +bool LLParser::parseMDField(LocTy Loc, StringRef Name, + DwarfSourceLangNameField &Result) { + if (Lex.getKind() == lltok::APSInt) + return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); + + if (Lex.getKind() != lltok::DwarfSourceLangName) + return tokError("expected DWARF source language name"); + + unsigned Lang = dwarf::getSourceLanguageName(Lex.getStrVal()); + if (!Lang) + return tokError("invalid DWARF source language name" + Twine(" '") + + Lex.getStrVal() + "'"); + assert(Lang <= Result.Max && "Expected valid DWARF source language name"); + Result.assign(Lang); + Lex.Lex(); + return false; +} + +template <> bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) { if (Lex.getKind() == lltok::APSInt) return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); @@ -5836,9 +5859,12 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { if (!IsDistinct) return tokError("missing 'distinct', required for !DICompileUnit"); + LocTy Loc = Lex.getLoc(); + #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ - REQUIRED(language, DwarfLangField, ); \ REQUIRED(file, MDField, (/* AllowNull */ false)); \ + OPTIONAL(language, DwarfLangField, ); \ + OPTIONAL(sourceLanguageName, DwarfSourceLangNameField, ); \ OPTIONAL(producer, MDStringField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ OPTIONAL(flags, MDStringField, ); \ @@ -5860,12 +5886,23 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS + if (!language.Seen && !sourceLanguageName.Seen) + return error(Loc, "missing one of 'language' or 'sourceLanguageName', " + "required for !DICompileUnit"); + + if (language.Seen && sourceLanguageName.Seen) + return error(Loc, "can only specify one of 'language' and " + "'sourceLanguageName' on 
!DICompileUnit"); + Result = DICompileUnit::getDistinct( - Context, DISourceLanguageName(language.Val), file.Val, producer.Val, - isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val, - emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val, - macros.Val, dwoId.Val, splitDebugInlining.Val, debugInfoForProfiling.Val, - nameTableKind.Val, rangesBaseAddress.Val, sysroot.Val, sdk.Val); + Context, + language.Seen ? DISourceLanguageName(language.Val) + : DISourceLanguageName(sourceLanguageName.Val, 0), + file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val, + splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val, + globals.Val, imports.Val, macros.Val, dwoId.Val, splitDebugInlining.Val, + debugInfoForProfiling.Val, nameTableKind.Val, rangesBaseAddress.Val, + sysroot.Val, sdk.Val); return false; } diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 969047a..55fa2df 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -893,6 +893,8 @@ StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) { return DefaultedMemberString(Val); case DW_AT_APPLE_enum_kind: return EnumKindString(Val); + case DW_AT_language_name: + return SourceLanguageNameString(static_cast<SourceLanguageName>(Val)); } return StringRef(); diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index a4d1b83..cdcf7a8 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1867,12 +1867,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // distinct. It's always distinct. IsDistinct = true; + const auto LangVersionMask = (uint64_t(1) << 63); + const bool HasVersionedLanguage = Record[1] & LangVersionMask; + auto *CU = DICompileUnit::getDistinct( - Context, DISourceLanguageName(Record[1]), getMDOrNull(Record[2]), - getMDString(Record[3]), Record[4], getMDString(Record[5]), Record[6], - getMDString(Record[7]), Record[8], getMDOrNull(Record[9]), - getMDOrNull(Record[10]), getMDOrNull(Record[12]), - getMDOrNull(Record[13]), + Context, + HasVersionedLanguage + ? DISourceLanguageName(Record[1] & ~LangVersionMask, 0) + : DISourceLanguageName(Record[1]), + getMDOrNull(Record[2]), getMDString(Record[3]), Record[4], + getMDString(Record[5]), Record[6], getMDString(Record[7]), Record[8], + getMDOrNull(Record[9]), getMDOrNull(Record[10]), + getMDOrNull(Record[12]), getMDOrNull(Record[13]), Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]), Record.size() <= 14 ? 0 : Record[14], Record.size() <= 16 ? true : Record[16], diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7ed140d..0ca55a26 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2108,7 +2108,13 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N, assert(N->isDistinct() && "Expected distinct compile units"); Record.push_back(/* IsDistinct */ true); - Record.push_back(N->getSourceLanguage().getUnversionedName()); + auto Lang = N->getSourceLanguage(); + Record.push_back(Lang.getName()); + // Set bit so the MetadataLoader can distniguish between versioned and + // unversioned names. 
+ if (Lang.hasVersionedName()) + Record.back() ^= (uint64_t(1) << 63); + Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(VE.getMetadataOrNullID(N->getRawProducer())); Record.push_back(N->isOptimized()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index bc0bb34..f0f0861 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -587,10 +587,12 @@ bool DwarfExpression::addExpression( emitUnsigned(LeftShift); emitOp(dwarf::DW_OP_shl); } - emitOp(dwarf::DW_OP_constu); - emitUnsigned(RightShift); - emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra - : dwarf::DW_OP_shr); + if (RightShift) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(RightShift); + emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra + : dwarf::DW_OP_shr); + } // The value is now at the top of the stack, so set the location to // implicit so that we get a stack_value at the end. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index eb73d01b..4320b1d 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3194,7 +3194,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode { case ScaledRegField: return ScaledReg; case BaseOffsField: - return ConstantInt::get(IntPtrTy, BaseOffs); + return ConstantInt::getSigned(IntPtrTy, BaseOffs); } } @@ -6100,7 +6100,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Add in the Base Offset if present. if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs); if (ResultIndex) { // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. @@ -6226,7 +6226,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Add in the Base Offset if present. 
if (AddrMode.BaseOffs) { - Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index fa0ccd6..b425b95 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1215,7 +1215,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const { LLT MemTy = LdSt.getMMO().getMemoryType(); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), - AtomicOrdering::NotAtomic}}); + AtomicOrdering::NotAtomic, AtomicOrdering::NotAtomic}}); unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode()); SmallVector<LLT> OpTys; if (IndexedOpc == TargetOpcode::G_INDEXED_STORE) @@ -1728,6 +1728,7 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI, Result.clearSign(); return Result; } + case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: { bool Unused; LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 3f6813e..90c60d4 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -344,6 +344,22 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known = KnownBits::mul(Known, Known2); break; } + case TargetOpcode::G_UMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhu(Known, Known2); + break; + } + case TargetOpcode::G_SMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhs(Known, Known2); + break; + } case TargetOpcode::G_SELECT: { computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(), Known, DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index b2f8435..cdc1f64 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -958,7 +958,8 @@ void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) { for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) { LLT Ty = LLT::scalar(Size); SmallVector<LegalityQuery::MemDesc, 2> MemDescrs( - {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}}); + {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic, + AtomicOrdering::NotAtomic}}); SmallVector<LLT> StoreTys({Ty, PtrTy}); LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs); LegalizeActionStep ActionStep = LI.getAction(Q); diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp index 87565c0..e859765 100644 --- a/llvm/lib/CodeGen/MIR2Vec.cpp +++ b/llvm/lib/CodeGen/MIR2Vec.cpp @@ -49,14 +49,8 @@ cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0), //===----------------------------------------------------------------------===// MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, - const TargetInstrInfo *TII) - : TII(*TII) { - // Fixme: Use static factory methods for creating vocabularies instead of - // public constructors - // 
Early return for invalid inputs - creates empty/invalid vocabulary - if (!TII || OpcodeEntries.empty()) - return; - + const TargetInstrInfo &TII) + : TII(TII) { buildCanonicalOpcodeMapping(); unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size(); @@ -67,6 +61,15 @@ MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries, Layout.TotalEntries = Storage.size(); } +Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries, + const TargetInstrInfo &TII) { + if (Entries.empty()) + return createStringError(errc::invalid_argument, + "Empty vocabulary entries provided"); + + return MIRVocabulary(std::move(Entries), TII); +} + std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) { // Extract base instruction name using regex to capture letters and // underscores Examples: "ADD32rr" -> "ADD", "ARITH_FENCE" -> "ARITH_FENCE" @@ -107,13 +110,11 @@ unsigned MIRVocabulary::getCanonicalIndexForBaseName(StringRef BaseName) const { } unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); auto BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode)); return getCanonicalIndexForBaseName(BaseOpcode); } std::string MIRVocabulary::getStringKey(unsigned Pos) const { - assert(isValid() && "MIR2Vec Vocabulary is invalid"); assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary"); // For now, all entries are opcodes since we only have one section @@ -232,16 +233,11 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() { return Error::success(); } -void MIR2VecVocabLegacyAnalysis::emitError(Error Err, LLVMContext &Ctx) { - Ctx.emitError(toString(std::move(Err))); -} - -mir2vec::MIRVocabulary +Expected<mir2vec::MIRVocabulary> MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (StrVocabMap.empty()) { if (Error Err = readVocabulary()) { - emitError(std::move(Err), M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + return std::move(Err); } } @@ -255,15 +251,13 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) { if (auto *MF = MMI.getMachineFunction(F)) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), TII); + return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII); } } - // No machine functions available - return invalid vocabulary - emitError(make_error<StringError>("No machine functions found in module", - inconvertibleErrorCode()), - M.getContext()); - return mir2vec::MIRVocabulary(std::move(StrVocabMap), nullptr); + // No machine functions available - return error + return createStringError(errc::invalid_argument, + "No machine functions found in module"); } //===----------------------------------------------------------------------===// @@ -284,13 +278,15 @@ bool MIR2VecVocabPrinterLegacyPass::runOnMachineFunction(MachineFunction &MF) { bool MIR2VecVocabPrinterLegacyPass::doFinalization(Module &M) { auto &Analysis = getAnalysis<MIR2VecVocabLegacyAnalysis>(); - auto MIR2VecVocab = Analysis.getMIR2VecVocabulary(M); + auto MIR2VecVocabOrErr = Analysis.getMIR2VecVocabulary(M); - if (!MIR2VecVocab.isValid()) { - OS << "MIR2Vec Vocabulary Printer: Invalid vocabulary\n"; + if (!MIR2VecVocabOrErr) { + OS << "MIR2Vec Vocabulary Printer: Failed to get vocabulary - " + << toString(MIR2VecVocabOrErr.takeError()) << "\n"; return false; } + auto &MIR2VecVocab = *MIR2VecVocabOrErr; unsigned Pos = 0; for (const auto &Entry : MIR2VecVocab) { OS << "Key: " << 
MIR2VecVocab.getStringKey(Pos++) << ": "; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 3a9651c..89ed4da 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII"); STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found"); STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage"); STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages"); +STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores"); /// A command line option to turn software pipelining on or off. static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), @@ -193,6 +194,13 @@ static cl::opt<bool> MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining")); +/// A command line argument to limit the number of store instructions in the +/// target basic block. +static cl::opt<unsigned> SwpMaxNumStores( + "pipeliner-max-num-stores", + cl::desc("Maximum number of stores allowed in the target loop."), cl::Hidden, + cl::init(200)); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { return false; } + unsigned NumStores = 0; + for (MachineInstr &MI : *L.getHeader()) + if (MI.mayStore()) + ++NumStores; + if (NumStores > SwpMaxNumStores) { + LLVM_DEBUG(dbgs() << "Too many stores\n"); + NumFailTooManyStores++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Too many store instructions in the loop: " + << ore::NV("NumStores", NumStores) << " > " + << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << "."; + }); + return false; + } + // Remove any subregisters from inputs to phi nodes. preprocessPhiNodes(*L.getHeader()); return true; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index ebfea8e..e17a214 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -2051,6 +2051,12 @@ bool RegisterCoalescer::joinCopy( } if (CP.getNewRC()) { + if (RegClassInfo.getNumAllocatableRegs(CP.getNewRC()) == 0) { + LLVM_DEBUG(dbgs() << "\tNo " << TRI->getRegClassName(CP.getNewRC()) + << " are available for allocation\n"); + return false; + } + auto SrcRC = MRI->getRegClass(CP.getSrcReg()); auto DstRC = MRI->getRegClass(CP.getDstReg()); unsigned SrcIdx = CP.getSrcIdx(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c5c3866..5ffdc4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19340,8 +19340,10 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { EVT VT = N->getValueType(0); const SDNodeFlags Flags = N->getFlags(); unsigned Opc = N->getOpcode(); - bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; - bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; + bool PropAllNaNsToQNaNs = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; + bool PropOnlySNaNsToQNaNs = Opc == ISD::FMINNUM || Opc == ISD::FMAXNUM; + bool IsMin = + Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM || Opc == ISD::FMINIMUMNUM; SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold.
@@ -19356,34 +19358,53 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) { const APFloat &AF = N1CFP->getValueAPF(); - // minnum(X, nan) -> X - // maxnum(X, nan) -> X - // minimum(X, nan) -> nan - // maximum(X, nan) -> nan - if (AF.isNaN()) - return PropagatesNaN ? N->getOperand(1) : N->getOperand(0); + // minnum(X, qnan) -> X + // maxnum(X, qnan) -> X + // minnum(X, snan) -> qnan + // maxnum(X, snan) -> qnan + // minimum(X, nan) -> qnan + // maximum(X, nan) -> qnan + // minimumnum(X, nan) -> X + // maximumnum(X, nan) -> X + if (AF.isNaN()) { + if (PropAllNaNsToQNaNs || (AF.isSignaling() && PropOnlySNaNsToQNaNs)) { + if (AF.isSignaling()) + return DAG.getConstantFP(AF.makeQuiet(), SDLoc(N), VT); + return N->getOperand(1); + } + return N->getOperand(0); + } // In the following folds, inf can be replaced with the largest finite // float, if the ninf flag is set. if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { - // minnum(X, -inf) -> -inf - // maxnum(X, +inf) -> +inf + // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation) + // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation) // minimum(X, -inf) -> -inf if nnan // maximum(X, +inf) -> +inf if nnan - if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) + // minimumnum(X, -inf) -> -inf + // maximumnum(X, +inf) -> +inf + if (IsMin == AF.isNegative() && + (!PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(1); // minnum(X, +inf) -> X if nnan // maxnum(X, -inf) -> X if nnan - // minimum(X, +inf) -> X - // maximum(X, -inf) -> X - if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) + // minimum(X, +inf) -> X (ignoring quieting of sNaNs) + // maximum(X, -inf) -> X (ignoring quieting of sNaNs) + // minimumnum(X, +inf) -> X if nnan + // maximumnum(X, -inf) -> X if nnan + if (IsMin != AF.isNegative() && (PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(0); } } + // There are no VECREDUCE variants of FMINIMUMNUM or FMAXIMUMNUM + if (Opc == ISD::FMINIMUMNUM || Opc == ISD::FMAXIMUMNUM) + return SDValue(); + if (SDValue SD = reassociateReduction( - PropagatesNaN + PropAllNaNsToQNaNs ? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM) : (IsMin ? 
ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX), Opc, SDLoc(N), VT, N0, N1, Flags)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 87d5453..3b5f83f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3416,7 +3416,7 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue Input2 = N->getOperand(2); SDValue AccLo, AccHi; - std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL); + GetSplitVector(Acc, AccLo, AccHi); unsigned Opcode = N->getOpcode(); // If the input types don't need splitting, just accumulate into the @@ -3429,8 +3429,8 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue Input1Lo, Input1Hi; SDValue Input2Lo, Input2Hi; - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); - std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(Input2, DL); + GetSplitVector(Input1, Input1Lo, Input1Hi); + GetSplitVector(Input2, Input2Lo, Input2Hi); EVT ResultVT = AccLo.getValueType(); Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo); @@ -4761,8 +4761,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { SDLoc DL(N); SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi; - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(N->getOperand(1), DL); - std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); + GetSplitVector(N->getOperand(1), Input1Lo, Input1Hi); + GetSplitVector(N->getOperand(2), Input2Lo, Input2Hi); unsigned Opcode = N->getOpcode(); EVT ResultVT = Acc.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 175753f..6c11c5b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -234,6 +234,19 @@ static bool dontUseFastISelFor(const Function &Fn) { }); } +static bool maintainPGOProfile(const TargetMachine &TM, + CodeGenOptLevel OptLevel) { + if (OptLevel != CodeGenOptLevel::None) + return true; + if (TM.getPGOOption()) { + const PGOOptions &Options = *TM.getPGOOption(); + return Options.Action == PGOOptions::PGOAction::IRUse || + Options.Action == PGOOptions::PGOAction::SampleUse || + Options.CSAction == PGOOptions::CSPGOAction::CSIRUse; + } + return false; +} + namespace llvm { //===--------------------------------------------------------------------===// @@ -395,6 +408,7 @@ SelectionDAGISel::~SelectionDAGISel() { delete CurDAG; } void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { CodeGenOptLevel OptLevel = Selector->OptLevel; + bool RegisterPGOPasses = maintainPGOProfile(Selector->TM, Selector->OptLevel); if (OptLevel != CodeGenOptLevel::None) AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); @@ -403,15 +417,15 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); - if (UseMBPI && OptLevel != CodeGenOptLevel::None) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); + if (UseMBPI && RegisterPGOPasses) + AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.addRequired<ProfileSummaryInfoWrapperPass>(); // AssignmentTrackingAnalysis only runs if assignment tracking is enabled for // the module. 
AU.addRequired<AssignmentTrackingAnalysis>(); AU.addPreserved<AssignmentTrackingAnalysis>(); - if (OptLevel != CodeGenOptLevel::None) - LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); + if (RegisterPGOPasses) + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -464,6 +478,7 @@ void SelectionDAGISel::initializeAnalysisResults( (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -474,7 +489,7 @@ void SelectionDAGISel::initializeAnalysisResults( auto *PSI = MAMP.getCachedResult<ProfileSummaryAnalysis>(*Fn.getParent()); BlockFrequencyInfo *BFI = nullptr; FAM.getResult<BlockFrequencyAnalysis>(Fn); - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &FAM.getResult<BlockFrequencyAnalysis>(Fn); FunctionVarLocs const *FnVarLocs = nullptr; @@ -492,7 +507,7 @@ void SelectionDAGISel::initializeAnalysisResults( // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. - if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &FAM.getResult<BranchProbabilityAnalysis>(Fn); else FuncInfo->BPI = nullptr; @@ -518,6 +533,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -528,7 +544,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { AC = &MFP.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn); auto *PSI = &MFP.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); BlockFrequencyInfo *BFI = nullptr; - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &MFP.getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); FunctionVarLocs const *FnVarLocs = nullptr; @@ -549,7 +565,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. 
- if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &MFP.getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index edc69a3..212a0c0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -149,7 +149,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, if (!Name.empty()) WithColor(OS, Color) << Name; else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column || - Attr == DW_AT_call_line || Attr == DW_AT_call_column) { + Attr == DW_AT_call_line || Attr == DW_AT_call_column || + Attr == DW_AT_language_version) { if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) OS << *Val; else diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5980ee3..286ed03 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3623,7 +3623,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // 1. Build a list of reduction variables. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; auto Size = ReductionInfos.size(); - Type *PtrTy = PointerType::getUnqual(Ctx); + Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS()); + Type *FuncPtrTy = + Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace()); Type *RedArrayTy = ArrayType::get(PtrTy, Size); CodeGenIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); @@ -3667,9 +3669,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt64(MaxDataSize * ReductionInfos.size()); if (!IsTeamsReduction) { Value *SarFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy); Value *WcFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy); Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( @@ -10072,13 +10074,14 @@ void OpenMPIRBuilder::initializeTypes(Module &M) { LLVMContext &Ctx = M.getContext(); StructType *T; unsigned DefaultTargetAS = Config.getDefaultTargetAS(); + unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace(); #define OMP_TYPE(VarName, InitValue) VarName = InitValue; #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS); #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ - VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS); + VarName##Ptr = PointerType::get(Ctx, ProgramAS); #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) 
\ T = StructType::getTypeByName(Ctx, StructName); \ if (!T) \ diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ae086bcd..0bc877d 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2370,10 +2370,16 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N, Out << "!DICompileUnit("; MDFieldPrinter Printer(Out, WriterCtx); - Printer.printDwarfEnum("language", - N->getSourceLanguage().getUnversionedName(), - dwarf::LanguageString, - /* ShouldSkipZero */ false); + auto Lang = N->getSourceLanguage(); + if (Lang.hasVersionedName()) + Printer.printDwarfEnum( + "sourceLanguageName", + static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()), + dwarf::SourceLanguageNameString, + /* ShouldSkipZero */ false); + else + Printer.printDwarfEnum("language", Lang.getName(), dwarf::LanguageString, + /* ShouldSkipZero */ false); Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false); Printer.printString("producer", N->getProducer()); diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp index 7509188..fba6942 100644 --- a/llvm/lib/IR/ConstantFPRange.cpp +++ b/llvm/lib/IR/ConstantFPRange.cpp @@ -391,3 +391,23 @@ ConstantFPRange ConstantFPRange::unionWith(const ConstantFPRange &CR) const { return ConstantFPRange(minnum(Lower, CR.Lower), maxnum(Upper, CR.Upper), MayBeQNaN | CR.MayBeQNaN, MayBeSNaN | CR.MayBeSNaN); } + +ConstantFPRange ConstantFPRange::abs() const { + if (isNaNOnly()) + return *this; + // Check if the range is all non-negative or all non-positive. + if (Lower.isNegative() == Upper.isNegative()) { + if (Lower.isNegative()) + return negate(); + return *this; + } + // The range contains both positive and negative values. + APFloat NewLower = APFloat::getZero(getSemantics()); + APFloat NewUpper = maxnum(-Lower, Upper); + return ConstantFPRange(std::move(NewLower), std::move(NewUpper), MayBeQNaN, + MayBeSNaN); +} + +ConstantFPRange ConstantFPRange::negate() const { + return ConstantFPRange(-Upper, -Lower, MayBeQNaN, MayBeSNaN); +} diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1ae20a9f..07a870f 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType( DICompositeType *DIBuilder::createVectorType(uint64_t Size, uint32_t AlignInBits, DIType *Ty, - DINodeArray Subscripts) { - auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "", - nullptr, 0, nullptr, Ty, Size, AlignInBits, 0, - DINode::FlagVector, Subscripts, 0, - /*EnumKind=*/std::nullopt, nullptr); + DINodeArray Subscripts, + Metadata *BitStride) { + auto *R = DICompositeType::get( + VMContext, dwarf::DW_TAG_array_type, /*Name=*/"", + /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty, + /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0, + /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts, + /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr, + /*TemplateParams=*/nullptr, /*Identifier=*/"", + /*Discriminator=*/nullptr, /*DataLocation=*/nullptr, + /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr, + /*Annotations=*/nullptr, /*Specification=*/nullptr, + /*NumExtraInhabitants=*/0, + /*BitStride=*/BitStride); trackIfUnresolved(R); return R; } diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 4f37624..8e6d654 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -273,6 +273,13 @@ 
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, C.print(OS); } +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + BranchProbability P) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + P.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 47860c0..708e79d 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -20,7 +20,7 @@ using namespace llvm::mustache; namespace { -using Accessor = SmallVector<std::string>; +using Accessor = ArrayRef<StringRef>; static bool isFalsey(const json::Value &V) { return V.getAsNull() || (V.getAsBoolean() && !V.getAsBoolean().value()) || @@ -34,23 +34,32 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } -static Accessor splitMustacheString(StringRef Str) { +static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: // "a.b.c" would be split into {"a", "b", "c"} // We make an exception for a single dot which // refers to the current context. - Accessor Tokens; + SmallVector<StringRef> Tokens; if (Str == ".") { - Tokens.emplace_back(Str); - return Tokens; - } - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split("."); - Tokens.emplace_back(Part.trim()); + // "." is a special accessor that refers to the current context. + // It's a literal, so it doesn't need to be saved. + Tokens.push_back("."); + } else { + while (!Str.empty()) { + StringRef Part; + std::tie(Part, Str) = Str.split('.'); + // Each part of the accessor needs to be saved to the arena + // to ensure it has a stable address. + Tokens.push_back(Ctx.Saver.save(Part.trim())); + } } - return Tokens; + // Now, allocate memory for the array of StringRefs in the arena. + StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); + // Copy the StringRefs from the stack vector to the arena. + std::copy(Tokens.begin(), Tokens.end(), ArenaTokens); + // Return an ArrayRef pointing to the stable arena memory. + return ArrayRef<StringRef>(ArenaTokens, Tokens.size()); } } // namespace @@ -97,23 +106,23 @@ public: SetDelimiter, }; - Token(std::string Str) - : TokenType(Type::Text), RawBody(std::move(Str)), TokenBody(RawBody), + Token(StringRef Str) + : TokenType(Type::Text), RawBody(Str), TokenBody(RawBody), AccessorValue({}), Indentation(0) {}; - Token(std::string RawBody, std::string TokenBody, char Identifier) - : RawBody(std::move(RawBody)), TokenBody(std::move(TokenBody)), - Indentation(0) { + Token(StringRef RawBody, StringRef TokenBody, char Identifier, + MustacheContext &Ctx) + : RawBody(RawBody), TokenBody(TokenBody), Indentation(0) { TokenType = getTokenType(Identifier); if (TokenType == Type::Comment) return; StringRef AccessorStr(this->TokenBody); if (TokenType != Type::Variable) AccessorStr = AccessorStr.substr(1); - AccessorValue = splitMustacheString(StringRef(AccessorStr).trim()); + AccessorValue = splitMustacheString(StringRef(AccessorStr).trim(), Ctx); } - Accessor getAccessor() const { return AccessorValue; } + ArrayRef<StringRef> getAccessor() const { return AccessorValue; } Type getType() const { return TokenType; } @@ -144,16 +153,16 @@ public: Type TokenType; // RawBody is the original string that was tokenized. 
- std::string RawBody; + StringRef RawBody; // TokenBody is the original string with the identifier removed. - std::string TokenBody; - Accessor AccessorValue; + StringRef TokenBody; + ArrayRef<StringRef> AccessorValue; size_t Indentation; }; using EscapeMap = DenseMap<char, std::string>; -class ASTNode { +class ASTNode : public ilist_node<ASTNode> { public: enum Type { Root, @@ -168,18 +177,19 @@ public: ASTNode(MustacheContext &Ctx) : Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {} - ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent) - : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), + ASTNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) + : Ctx(Ctx), Ty(Type::Text), Body(Body), Parent(Parent), ParentContext(nullptr) {} // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes - ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent) - : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)), + ASTNode(MustacheContext &Ctx, Type Ty, ArrayRef<StringRef> Accessor, + ASTNode *Parent) + : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(Accessor), ParentContext(nullptr) {} - void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); }; + void addChild(AstPtr Child) { Children.push_back(Child); }; - void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); }; + void setRawBody(StringRef NewBody) { RawBody = NewBody; }; void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }; @@ -212,28 +222,27 @@ private: MustacheContext &Ctx; Type Ty; size_t Indentation = 0; - std::string RawBody; - std::string Body; + StringRef RawBody; + StringRef Body; ASTNode *Parent; - // TODO: switch implementation to SmallVector<T> - std::vector<AstPtr> Children; - const Accessor AccessorValue; + ASTNodeList Children; + const ArrayRef<StringRef> AccessorValue; const llvm::json::Value *ParentContext; }; // A wrapper for arena allocator for ASTNodes static AstPtr createRootNode(MustacheContext &Ctx) { - return std::make_unique<ASTNode>(Ctx); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx); } -static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A, - ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, T, std::move(A), Parent); +static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, + ArrayRef<StringRef> A, ASTNode *Parent) { + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, T, A, Parent); } -static AstPtr createTextNode(MustacheContext &Ctx, std::string Body, +static AstPtr createTextNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, Body, Parent); } // Function to check if there is meaningful text behind. @@ -295,9 +304,9 @@ static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) { StringRef NextTokenBody = NextToken.TokenBody; // Cut off the leading newline which could be \n or \r\n. if (NextTokenBody.starts_with("\r\n")) - NextToken.TokenBody = NextTokenBody.substr(2).str(); + NextToken.TokenBody = NextTokenBody.substr(2); else if (NextTokenBody.starts_with("\n")) - NextToken.TokenBody = NextTokenBody.substr(1).str(); + NextToken.TokenBody = NextTokenBody.substr(1); } // Adjust previous token body if there no text behind. 
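A quick illustration of the arena pattern used by splitMustacheString above: each dotted piece of the accessor is copied into the context's StringSaver, and the array of pieces into its BumpPtrAllocator, so the returned ArrayRef<StringRef> stays valid for as long as the MustacheContext lives. A minimal standalone sketch of the same idea (splitAccessor is a hypothetical name; only StringRef, StringSaver and BumpPtrAllocator from LLVM's support library are assumed):

#include <algorithm>
#include <tuple>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/StringSaver.h"

static llvm::ArrayRef<llvm::StringRef>
splitAccessor(llvm::StringRef Str, llvm::BumpPtrAllocator &Alloc,
              llvm::StringSaver &Saver) {
  llvm::SmallVector<llvm::StringRef> Parts;
  if (Str == ".") {
    // "." is a literal that refers to the current context; no copy needed.
    Parts.push_back(".");
  } else {
    while (!Str.empty()) {
      llvm::StringRef Part;
      std::tie(Part, Str) = Str.split('.');
      // Copy each piece into the arena so it outlives this function.
      Parts.push_back(Saver.save(Part.trim()));
    }
  }
  // Give the array of pieces a stable home in the arena as well.
  llvm::StringRef *Stable = Alloc.Allocate<llvm::StringRef>(Parts.size());
  std::copy(Parts.begin(), Parts.end(), Stable);
  return llvm::ArrayRef<llvm::StringRef>(Stable, Parts.size());
}
// e.g. splitAccessor("a.b.c", Alloc, Saver) yields {"a", "b", "c"}.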
@@ -312,7 +321,7 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx, StringRef PrevTokenBody = PrevToken.TokenBody; StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v"); size_t Indentation = PrevTokenBody.size() - Unindented.size(); - PrevToken.TokenBody = Unindented.str(); + PrevToken.TokenBody = Unindented; CurrentToken.setIndentation(Indentation); } @@ -402,21 +411,20 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, } static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { +processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content << ", Kind: " << tagKindToString(T.TagKind) << "\n"); if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); + Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); return std::nullopt; } StringRef Interpolated = T.Content; - std::string RawBody = T.FullMatch.str(); if (!Interpolated.trim().starts_with("=")) { char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(RawBody, Interpolated.str(), Front); + Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); return std::nullopt; } - Tokens.emplace_back(RawBody, Interpolated.str(), '='); + Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); StringRef DelimSpec = Interpolated.trim(); DelimSpec = DelimSpec.drop_front(1); DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); @@ -432,7 +440,7 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { // The mustache spec allows {{{ }}} to unescape variables, // but we don't support that here. An unescape variable // is represented only by {{& variable}}. -static SmallVector<Token> tokenize(StringRef Template) { +static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); @@ -446,19 +454,17 @@ static SmallVector<Token> tokenize(StringRef Template) { if (T.TagKind == Tag::Kind::None) { // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start).str()); - LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \"" - << Template.substr(Start) << "\"\n"); + Tokens.emplace_back(Template.substr(Start)); break; } // Add the text before the tag. 
if (T.StartPosition > Start) { StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text.str()); + Tokens.emplace_back(Text); } - if (auto NewDelims = processTag(T, Tokens)) { + if (auto NewDelims = processTag(T, Tokens, Ctx)) { std::tie(Open, Close) = *NewDelims; } @@ -614,20 +620,20 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A) { AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent); size_t Start = CurrentPtr; - parseMustache(CurrentNode.get()); + parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; - std::string RawBody; + SmallString<128> RawBody; for (std::size_t I = Start; I < End; I++) RawBody += Tokens[I].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); + Parent->addChild(CurrentNode); } AstPtr Parser::parse() { - Tokens = tokenize(TemplateStr); + Tokens = tokenize(TemplateStr, Ctx); CurrentPtr = 0; AstPtr RootNode = createRootNode(Ctx); - parseMustache(RootNode.get()); + parseMustache(RootNode); return RootNode; } @@ -636,31 +642,29 @@ void Parser::parseMustache(ASTNode *Parent) { while (CurrentPtr < Tokens.size()) { Token CurrentToken = Tokens[CurrentPtr]; CurrentPtr++; - Accessor A = CurrentToken.getAccessor(); + ArrayRef<StringRef> A = CurrentToken.getAccessor(); AstPtr CurrentNode; switch (CurrentToken.getType()) { case Token::Type::Text: { - CurrentNode = - createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createTextNode(Ctx, CurrentToken.TokenBody, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Variable: { - CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::Variable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::UnescapeVariable: { - CurrentNode = - createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::UnescapeVariable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Partial: { - CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent); + CurrentNode = createNode(Ctx, ASTNode::Partial, A, Parent); CurrentNode->setIndentation(CurrentToken.getIndentation()); - Parent->addChild(std::move(CurrentNode)); + Parent->addChild(CurrentNode); break; } case Token::Type::SectionOpen: { @@ -694,8 +698,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { return; } case json::Value::String: { - auto Str = *Data.getAsString(); - OS << Str.str(); + OS << *Data.getAsString(); return; } @@ -727,7 +730,7 @@ void ASTNode::renderPartial(const json::Value &CurrentCtx, << ", Indentation:" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) - renderPartial(CurrentCtx, OS, Partial->getValue().get()); + renderPartial(CurrentCtx, OS, Partial->getValue()); } void ASTNode::renderVariable(const json::Value &CurrentCtx, @@ -858,8 +861,8 @@ const json::Value *ASTNode::findContext() { void ASTNode::renderChild(const json::Value &Contexts, MustacheOutputStream &OS) { - for (AstPtr &Child : Children) - Child->render(Contexts, OS); + for (ASTNode &Child : Children) + Child.render(Contexts, OS); } void ASTNode::renderPartial(const json::Value &Contexts, @@ -869,7 +872,7 @@ void 
ASTNode::renderPartial(const json::Value &Contexts, Partial->render(Contexts, IS); } -void ASTNode::renderLambdas(const json::Value &Contexts, +void ASTNode::renderLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, Lambda &L) { json::Value LambdaResult = L(); std::string LambdaStr; @@ -886,9 +889,9 @@ void ASTNode::renderLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void ASTNode::renderSectionLambdas(const json::Value &Contexts, +void ASTNode::renderSectionLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, SectionLambda &L) { - json::Value Return = L(RawBody); + json::Value Return = L(RawBody.str()); if (isFalsey(Return)) return; std::string LambdaStr; @@ -899,15 +902,16 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void Template::render(const json::Value &Data, llvm::raw_ostream &OS) { +void Template::render(const llvm::json::Value &Data, llvm::raw_ostream &OS) { RawMustacheOutputStream MOS(OS); Tree->render(Data, MOS); } void Template::registerPartial(std::string Name, std::string Partial) { - Parser P(Partial, Ctx); + StringRef SavedPartial = Ctx.Saver.save(Partial); + Parser P(SavedPartial, Ctx); AstPtr PartialTree = P.parse(); - Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree))); + Ctx.Partials.insert(std::make_pair(Name, PartialTree)); } void Template::registerLambda(std::string Name, Lambda L) { @@ -922,7 +926,7 @@ void Template::overrideEscapeCharacters(EscapeMap E) { Ctx.Escapes = std::move(E); } -Template::Template(StringRef TemplateStr) { +Template::Template(StringRef TemplateStr, MustacheContext &Ctx) : Ctx(Ctx) { Parser P(TemplateStr, Ctx); Tree = P.parse(); // The default behavior is to escape html entities. @@ -935,18 +939,12 @@ Template::Template(StringRef TemplateStr) { } Template::Template(Template &&Other) noexcept - : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {} + : Ctx(Other.Ctx), Tree(Other.Tree) { + Other.Tree = nullptr; +} Template::~Template() = default; -Template &Template::operator=(Template &&Other) noexcept { - if (this != &Other) { - Ctx = std::move(Other.Ctx); - Tree = std::move(Other.Tree); - Other.Tree = nullptr; - } - return *this; -} } // namespace llvm::mustache #undef DEBUG_TYPE diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 639ddcb..ecaeff7 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering // Post-legalization combines which are primarily optimizations. 
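Before the AArch64Combine.td change below, a usage note on the Mustache Template constructor rewritten just above: a Template no longer owns its context, so callers create a MustacheContext first and keep it alive at least as long as every Template (and registered partial) built from it. A hedged sketch, assuming MustacheContext is default-constructible and exported from llvm/Support/Mustache.h as the patch suggests:

#include "llvm/Support/JSON.h"
#include "llvm/Support/Mustache.h"
#include "llvm/Support/raw_ostream.h"

void renderGreeting(llvm::raw_ostream &OS) {
  // The context owns the string arena, the partials map and the escape
  // settings; it must outlive every Template parsed against it.
  llvm::mustache::MustacheContext Ctx;
  llvm::mustache::Template T("Hello, {{name}}!", Ctx);
  T.render(llvm::json::Object{{"name", "world"}}, OS); // prints "Hello, world!"
}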
def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", - [copy_prop, cast_of_cast_combines, + [copy_prop, cast_of_cast_combines, constant_fold_fp_ops, buildvector_of_truncate, integer_of_truncate, mutate_anyext_to_zext, combines_for_extload, combine_indexed_load_store, sext_trunc_sextload, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dc8e7c8..7294f3e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1458,6 +1458,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal); setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal); + setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom); setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); if (Subtarget->hasMatMulInt8()) { @@ -16248,7 +16249,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { SDValue Pg = getPredicateForScalableVector(DAG, DL, VT); SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0), @@ -30033,7 +30036,9 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( bool Negated; uint64_t SplatVal; - if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + // NOTE: SRAD cannot be used to represent sdiv-by-one. + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) && + SplatVal > 1) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32); @@ -30605,6 +30610,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_DEINTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_ld3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + // Write out unmodified operands. + SmallVector<SDValue, 3> Chains; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG); + Chains.push_back( + DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo())); + } + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains)); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Read back and deinterleave data. 
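As an aside on the three-operand VECTOR_DEINTERLEAVE lowering in this hunk (contiguous stores above, ld3 read-back just below): ld3 performs a stride-3 structured load, so writing the operands back-to-back and reloading them de-interleaves the concatenated data in one step. A rough scalar model of that round trip, illustrative only:

#include <algorithm>
#include <array>
#include <cstddef>

template <typename T, std::size_t N>
std::array<std::array<T, N>, 3> deinterleave3(const std::array<T, N> &A,
                                              const std::array<T, N> &B,
                                              const std::array<T, N> &C) {
  // Contiguous "st1"-style stores of the unmodified operands.
  std::array<T, 3 * N> Buf;
  std::copy(A.begin(), A.end(), Buf.begin());
  std::copy(B.begin(), B.end(), Buf.begin() + N);
  std::copy(C.begin(), C.end(), Buf.begin() + 2 * N);
  // "ld3"-style structured load: result R, lane J comes from Buf[3*J + R].
  std::array<std::array<T, N>, 3> Out;
  for (std::size_t R = 0; R < 3; ++R)
    for (std::size_t J = 0; J < N; ++J)
      Out[R][J] = Buf[3 * J + R];
  return Out;
}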
+ SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other); + SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops); + + SmallVector<SDValue, 3> Results; + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG)); + Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG)); + return DAG.getMergeValues(Results, DL); + } + // Are multi-register uzp instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { @@ -30646,6 +30688,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, assert(OpVT.isScalableVector() && "Expected scalable vector in LowerVECTOR_INTERLEAVE."); + if (Op->getNumOperands() == 3) { + // aarch64_sve_st3 only supports packed datatypes. + EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount()); + SmallVector<SDValue, 3> InVecs; + for (SDValue V : Op->ops()) + InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG)); + + Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment); + + Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3; + EVT PredVT = PackedVT.changeVectorElementType(MVT::i1); + + SmallVector<SDValue, 7> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64)); + Ops.append(InVecs); + Ops.push_back(DAG.getConstant(1, DL, PredVT)); + Ops.push_back(StackPtr); + + // Interleave operands and store. + SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops); + + // Read back the interleaved data. + SmallVector<SDValue, 3> Results; + for (unsigned I = 0; I < 3; ++I) { + SDValue Ptr = + DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL); + SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo()); + Results.push_back(getSVESafeBitCast(OpVT, L, DAG)); + } + + return DAG.getMergeValues(Results, DL); + } + // Are multi-register zip instructions available? if (Subtarget->hasSME2() && Subtarget->isStreaming() && OpVT.getVectorElementType() != MVT::i1) { @@ -30769,6 +30847,17 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, ResultVT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true); + // We can handle this case natively by accumulating into a wider + // zero-padded vector. 
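On the v2i32/v16i8 PARTIAL_REDUCE_MLA case introduced by the comment above (its code follows below): as far as the partial-reduction semantics go, the grouping of input elements into result lanes is unspecified, only the overall sum has to be preserved, which is what makes it legal to zero-pad the accumulator to v4i32, use the native four-lane form, and fold the lanes back down with ADDP. A rough scalar model, not the actual DAG nodes:

#include <array>
#include <cstdint>

// Widen the 2-lane accumulator with zeros, accumulate four byte-products per
// wide lane (the legal v4i32/v16i8 form), then pairwise-add (ADDP) and keep
// the low two lanes. The total across the result lanes is still
// Acc[0] + Acc[1] + the sum of all 16 products.
static std::array<int32_t, 2>
partialReduceV2I32V16I8(std::array<int32_t, 2> Acc, const int8_t (&L)[16],
                        const int8_t (&R)[16]) {
  std::array<int32_t, 4> Wide = {Acc[0], Acc[1], 0, 0};
  for (int I = 0; I < 16; ++I)
    Wide[I / 4] += int32_t(L[I]) * int32_t(R[I]);
  // ADDP Wide, Wide -> {W0+W1, W2+W3, W0+W1, W2+W3}; extract the low half.
  return {Wide[0] + Wide[1], Wide[2] + Wide[3]};
}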
+ if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) { + SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32); + SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0); + SDValue Wide = + DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS); + SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide); + return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0); + } + if (ConvertToScalable) { ResultVT = getContainerForFixedLengthVector(DAG, ResultVT); OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType()); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 91e64e6..bd0a17d 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -315,6 +315,8 @@ public: } void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE stack sizes to be aligned to 16-bytes"); StackSizeZPR = ZPR; StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; @@ -425,6 +427,8 @@ public: // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) && + "expected SVE callee-save sizes to be aligned to 16-bytes"); ZPRCalleeSavedStackSize = ZPR; PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 1568161..f110558 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::PTRUE_C_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); - case AArch64::SEH_SavePReg: case AArch64::SEH_SaveZReg: return true; } @@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::LDR_PXI: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); + case AArch64::SEH_SavePReg: + return true; } } @@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( HasFP = AFL.hasFP(MF); NeedsWinCFI = AFL.needsWinCFI(MF); + + // Windows unwind can't represent the required stack adjustments if we have + // both SVE callee-saves and dynamic stack allocations, and the frame pointer + // is before the SVE spills. The allocation of the frame pointer must be the + // last instruction in the prologue so the unwinder can restore the stack + // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29, + // -17`.) + // + // Because of this, we do spills in the opposite order on Windows: first SVE, + // then GPRs. The main side-effect of this is that it makes accessing + // parameters passed on the stack more expensive. + // + // We could consider rearranging the spills for simpler cases. 
+ if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) { + if (AFI->hasStackHazardSlotIndex()) + reportFatalUsageError("SME hazard padding is not supported on Windows"); + SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord; + } else if (AFI->hasSplitSVEObjects()) { + SVELayout = SVEStackLayout::Split; + } } MachineBasicBlock::iterator @@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( return true; } +SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + if (SVELayout == SVEStackLayout::Split) + return {{PPRCalleeSavesSize, PPRLocalsSize}, + {ZPRCalleeSavesSize, ZPRLocalsSize}}; + // For simplicity, attribute all locals to ZPRs when split SVE is disabled. + return {{PPRCalleeSavesSize, StackOffset{}}, + {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; +} + +struct SVEPartitions { + struct { + MachineBasicBlock::iterator Begin, End; + } PPR, ZPR; +}; + +static SVEPartitions partitionSVECS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + StackOffset PPRCalleeSavesSize, + StackOffset ZPRCalleeSavesSize, + bool IsEpilogue) { + MachineBasicBlock::iterator PPRsI = MBBI; + MachineBasicBlock::iterator End = + IsEpilogue ? MBB.begin() : MBB.getFirstTerminator(); + auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; }; + // Process the SVE CS to find the starts/ends of the ZPR and PPR areas. + if (PPRCalleeSavesSize) { + PPRsI = AdjustI(PPRsI); + assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction"); + while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI))) + IsEpilogue ? (--PPRsI) : (++PPRsI); + } + MachineBasicBlock::iterator ZPRsI = PPRsI; + if (ZPRCalleeSavesSize) { + ZPRsI = AdjustI(ZPRsI); + assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction"); + while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI))) + IsEpilogue ? (--ZPRsI) : (++ZPRsI); + } + if (IsEpilogue) + return {{PPRsI, MBBI}, {ZPRsI, PPRsI}}; + return {{MBBI, PPRsI}, {PPRsI, ZPRsI}}; +} + AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() { bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - // Windows unwind can't represent the required stack adjustments if we have - // both SVE callee-saves and dynamic stack allocations, and the frame - // pointer is before the SVE spills. The allocation of the frame pointer - // must be the last instruction in the prologue so the unwinder can restore - // the stack pointer correctly. (And there isn't any unwind opcode for - // `addvl sp, x29, -17`.) - // - // Because of this, we do spills in the opposite order on Windows: first SVE, - // then GPRs. The main side-effect of this is that it makes accessing - // parameters passed on the stack more expensive. - // - // We could consider rearranging the spills for simpler cases. 
- bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - - if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex()) - reportFatalUsageError("SME hazard padding is not supported on Windows"); - auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // @@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset PPRCalleeSavesSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset ZPRCalleeSavesSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; - StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; - StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; - - std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin, - ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; - + auto [PPR, ZPR] = getSVEStackFrameSizes(); + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; + MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (!FPAfterSVECalleeSaves) { - // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR - // areas. - PPRCalleeSavesBegin = AfterGPRSavesI; - if (PPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " - << PPRCalleeSavesSize.getScalable() << "\n"); - - assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfPPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; + // Allocate space for the callee saves and PPR locals (if any). 
+ if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { + auto [PPRRange, ZPRRange] = + partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize, + ZPR.CalleeSavesSize, /*IsEpilogue=*/false); + AfterSVESavesI = ZPRRange.End; + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(AfterSVESavesI); + + StackOffset AllocateBeforePPRs = SVECalleeSavesSize; + StackOffset AllocateAfterPPRs = PPR.LocalsSize; + if (SVELayout == SVEStackLayout::Split) { + AllocateBeforePPRs = PPR.CalleeSavesSize; + AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; } - PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI; - if (ZPRCalleeSavesSize) { - LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " - << ZPRCalleeSavesSize.getScalable() << "\n"); - assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && - "Unexpected instruction"); - while (isPartOfZPRCalleeSaves(AfterSVESavesI) && - AfterSVESavesI != MBB.getFirstTerminator()) - ++AfterSVESavesI; - } - ZPRCalleeSavesEnd = AfterSVESavesI; - } - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(AfterSVESavesI); - - if (AFI->hasSplitSVEObjects()) { - assert(!FPAfterSVECalleeSaves && - "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); - assert(!AFL.canUseRedZone(MF) && - "Cannot use redzone with aarch64-split-sve-objects"); - // TODO: Handle HasWinCFI/NeedsWinCFI? - assert(!NeedsWinCFI && - "WinCFI with aarch64-split-sve-objects is not supported"); - - // Split ZPR and PPR allocation. - // Allocate PPR callee saves - allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, + allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || - ZPRLocalsSize || PPRLocalsSize); - CFAOffset += PPRCalleeSavesSize; - - // Allocate PPR locals + ZPR callee saves - assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + MFI.hasVarSizedObjects() || AllocateAfterPPRs || + ZPR.LocalsSize || NonSVELocalsSize); + CFAOffset += AllocateBeforePPRs; + assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding, - PPRLocalsSize + ZPRCalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPRLocalsSize); - CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; - - // Allocate ZPR locals - allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding, - ZPRLocalsSize + StackOffset::getFixed(NumBytes), + allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); + MFI.hasVarSizedObjects() || ZPR.LocalsSize || + NonSVELocalsSize); + CFAOffset += AllocateAfterPPRs; } else { - // Allocate space for the callee saves (if any). - StackOffset LocalsSize = - PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); - if (!FPAfterSVECalleeSaves) - allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); + assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); + // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been + // allocated (and separate PPR locals are not supported, all SVE locals, + // both PPR and ZPR, are within the ZPR locals area). + assert(!PPR.LocalsSize && "Unexpected PPR locals!"); CFAOffset += SVECalleeSavesSize; + } - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. 
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!AFL.canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; - allocateStackSpace(AfterSVESavesI, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects()); - } + // Allocate space for the rest of the frame including ZPR locals. Align the + // stack as necessary. + assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the + // correct value here, as NumBytes also includes padding bytes, which + // shouldn't be counted here. + allocateStackSpace( + AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, + EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. + // stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. // // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is @@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); - if (AFI->hasSplitSVEObjects() && + // The scalable vectors are below (lower address) the scalable predicates + // with split SVE objects, so we must subtract the size of the predicates. + if (SVELayout == SVEStackLayout::Split && MFI.getStackID(FI) == TargetStackID::ScalableVector) Offset -= PPRStackSize; @@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() { return; } - bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); // Assume we can't combine the last pop with the sp restore. 
bool CombineAfterCSRBump = false; - if (FPAfterSVECalleeSaves) { + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { AfterCSRPopSize += FixedObject; } else if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); @@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { + (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - StackOffset ZPRStackSize = AFL.getZPRStackSize(MF); - StackOffset PPRStackSize = AFL.getPPRStackSize(MF); - StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; - // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { - assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE"); // When we are about to restore the CSRs, the CFA register is SP again. if (EmitCFI && HasFP) @@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (!AFI->hasSplitSVEObjects()) { - // Process the SVE callee-saves to determine what space needs to be - // deallocated. - StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, - RestoreEnd = FirstGPRRestoreI; - int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); - int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); - int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; - - if (SVECalleeSavedSize) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); - - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isPartOfSVECalleeSaves(std::prev(RestoreBegin))) - --RestoreBegin; - - assert(isPartOfSVECalleeSaves(RestoreBegin) && - isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && - "Unexpected instruction"); - - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(SVECalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + + StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; + StackOffset SVEStackSize = + SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; + MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; + MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + + // Deallocate the SVE area. + if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // explicitly. 
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, + SVELocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); } - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); - } + // Deallocate callee-save non-SVE registers. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); - } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need - // to compute the base address by subtracting the offest in a - // temporary register first (to avoid briefly deallocating the SVE - // CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + + // Deallocate callee-save SVE registers. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI); + } else if (AFI->hasSVEStackSize()) { + // If we have stack realignment or variable-sized objects we must use the FP + // to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). 
+ Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { + // TODO: Support stack realigment and variable-sized objects. + assert( + SVELayout != SVEStackLayout::Split && + "unexpected stack realignment or variable sized objects with split " + "SVE stack objects"); + + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need to + // compute the base address by subtracting the offest in a temporary + // register first (to avoid briefly deallocating the SVE CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + StackOffset::getFixed( - NumBytes + PrologueSaveSize)); - NumBytes = 0; - } - - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); - - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } + // The code below will deallocate the stack space space by moving the SP + // to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + + if (SVECalleeSavesSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= NonSVELocals; + NumBytes = 0; } - if (EmitCFI) - emitCalleeSavedSVERestores(RestoreEnd); - } - } else if (AFI->hasSplitSVEObjects() && SVEStackSize) { - // TODO: Support stack realigment and variable-sized objects. - assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR - // areas. 
- auto ZPRCalleeSavedSize = - StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); - auto PPRCalleeSavedSize = - StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); - StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; - StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; - - MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI, - PPRRestoreEnd = FirstGPRRestoreI; - if (PPRCalleeSavedSize) { - PPRRestoreBegin = std::prev(PPRRestoreEnd); - while (PPRRestoreBegin != MBB.begin() && - isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) - --PPRRestoreBegin; - } - - MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, - ZPRRestoreEnd = PPRRestoreBegin; - if (ZPRCalleeSavedSize) { - ZPRRestoreBegin = std::prev(ZPRRestoreEnd); - while (ZPRRestoreBegin != MBB.begin() && - isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) - --ZPRRestoreBegin; - } - - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - NumBytes = 0; - CFAOffset -= NonSVELocals; - } + if (ZPR.LocalsSize) { + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= ZPR.LocalsSize; + } - if (ZPRLocalsSize) { - emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, - false, nullptr, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPRLocalsSize; - } + StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; + if (SVELayout == SVEStackLayout::Split && + (PPR.LocalsSize || ZPR.CalleeSavesSize)) { + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + PPR.LocalsSize + ZPR.CalleeSavesSize, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; + SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } - if (PPRLocalsSize || ZPRCalleeSavedSize) { - assert(PPRRestoreBegin == ZPRRestoreEnd && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, - PPRLocalsSize + ZPRCalleeSavedSize, TII, - MachineInstr::FrameDestroy, false, false, nullptr, - EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; - } - if (PPRCalleeSavedSize) { - emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, - PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, - false, false, nullptr, EmitCFI && !HasFP, CFAOffset); + // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: + if (SVECalleeSavesToDealloc) + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVECalleeSavesToDealloc, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !HasFP, CFAOffset); } - // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. 
if (EmitCFI) - emitCalleeSavedSVERestores(ZPRRestoreEnd); + emitCalleeSavedSVERestores( + SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End); } if (!HasFP) { diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index a1c9b34..bccadda 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -27,11 +27,23 @@ class AArch64Subtarget; class AArch64FunctionInfo; class AArch64FrameLowering; +struct SVEFrameSizes { + struct { + StackOffset CalleeSavesSize, LocalsSize; + } PPR, ZPR; +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL); + enum class SVEStackLayout { + Default, + Split, + CalleeSavesAboveFrameRecord, + }; + protected: bool requiresGetVGCall() const; @@ -53,6 +65,8 @@ protected: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + SVEFrameSizes getSVEStackFrameSizes() const; + MachineFunction &MF; MachineBasicBlock &MBB; @@ -68,6 +82,7 @@ protected: bool IsFunclet = false; // Note: Set in derived constructors. bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup. bool HomPrologEpilog = false; // Note: Set in derived constructors. + SVEStackLayout SVELayout = SVEStackLayout::Default; // Note: "HasWinCFI" is mutable as it can change in any "emit" function. mutable bool HasWinCFI = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0f2c335..ce2b4a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,11 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +struct AMDGPUUniformIntrinsicCombinePass + : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 24bef82..8e35ba7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -27,6 +28,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -106,6 +108,7 @@ public: bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; + mutable SmallVector<WeakVH> DeadVals; DenseMap<const PHINode *, bool> BreakPhiNodesCache; @@ -242,6 +245,8 @@ public: Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + bool tryNarrowMathIfNoOverflow(Instruction *I); + public: bool visitFDiv(BinaryOperator &I); @@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - 
BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } + // Need to use make_early_inc_range because integer division expansion is + // handled by Transform/Utils, and it can delete instructions such as the + // terminator of the BB. + for (BasicBlock &BB : reverse(F)) { + for (Instruction &I : make_early_inc_range(reverse(BB))) { + if (!isInstructionTriviallyDead(&I, TLI)) + MadeChange |= visit(I); } } + + while (!DeadVals.empty()) { + if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + return MadeChange; } @@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Value *NewVal = insertValues(Builder, Ty, ResultVals); NewVal->takeName(&I); I.replaceAllUsesWith(NewVal); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { FoldedT, FoldedF); NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); - BO.eraseFromParent(); + DeadVals.push_back(&BO); if (CastOp) - CastOp->eraseFromParent(); - Sel->eraseFromParent(); + DeadVals.push_back(CastOp); + DeadVals.push_back(Sel); return true; } @@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); + DeadVals.push_back(&FDiv); } return true; @@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`. We accept this change since the non-byte load assumes the upper bits within the byte are all 0. 
*/ -static bool tryNarrowMathIfNoOverflow(Instruction *I, - const SITargetLowering *TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { +bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) { unsigned Opc = I->getOpcode(); Type *OldType = I->getType(); @@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, NewType = I->getType()->getWithNewBitWidth(NewBit); // Old cost + const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F); InstructionCost OldCost = TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); // New cost of new op @@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, Value *Zext = Builder.CreateZExt(Arith, OldType); I->replaceAllUsesWith(Zext); - I->eraseFromParent(); + DeadVals.push_back(I); return true; } @@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (UseMul24Intrin && replaceMulWithMul24(I)) return true; - if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL)) + if (tryNarrowMathIfNoOverflow(&I)) return true; bool Changed = false; @@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (NewDiv) { I.replaceAllUsesWith(NewDiv); - I.eraseFromParent(); + DeadVals.push_back(&I); Changed = true; } } @@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); I.replaceAllUsesWith(ValOrig); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } @@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { } I.replaceAllUsesWith(Vec); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { auto *Intrin = B.CreateIntrinsic( I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); I.replaceAllUsesWith(Intrin); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *Fract = applyFractPat(Builder, FractArg); Fract->takeName(&I); I.replaceAllUsesWith(Fract); - - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } -static bool isOneOrNegOne(const Value *Val) { - const APFloat *C; - return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; -} - // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); @@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { if (ReqdAccuracy < 1.0f) return false; - // FIXME: This is an ugly hack for this pass using forward iteration instead - // of reverse. If it worked like a normal combiner, the rsq would form before - // we saw a sqrt call. 
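Stepping back from this hunk to the AMDGPUCodeGenPrepare changes as a whole: the pass now queues replaced instructions as WeakVH handles in DeadVals and deletes them after the block walk, which is what makes the simple reverse make_early_inc_range iteration in run() safe. A minimal structural sketch of that pattern (runDeferredCleanup and the elided visit step are hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

static void runDeferredCleanup(llvm::Function &F,
                               const llvm::TargetLibraryInfo *TLI) {
  llvm::SmallVector<llvm::WeakVH> DeadVals;
  for (llvm::BasicBlock &BB : llvm::reverse(F))
    for (llvm::Instruction &I : llvm::make_early_inc_range(llvm::reverse(BB)))
      if (!llvm::isInstructionTriviallyDead(&I, TLI)) {
        // visit(I) would go here; on success it calls I.replaceAllUsesWith()
        // and then DeadVals.push_back(&I) instead of I.eraseFromParent().
      }
  // Drain after the walk; a WeakVH nulls itself if its instruction was
  // already removed as a side effect of an earlier recursive deletion.
  while (!DeadVals.empty())
    if (auto *I = llvm::dyn_cast_or_null<llvm::Instruction>(
            DeadVals.pop_back_val()))
      llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}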
- auto *FDiv = - dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); - if (FDiv && FDiv->getOpcode() == Instruction::FDiv && - FDiv->getFPAccuracy() >= 1.0f && - canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && - // TODO: We should also handle the arcp case for the fdiv with non-1 value - isOneOrNegOne(FDiv->getOperand(0))) - return false; - Value *SrcVal = Sqrt.getOperand(0); bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); @@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); NewSqrt->takeName(&Sqrt); Sqrt.replaceAllUsesWith(NewSqrt); - Sqrt.eraseFromParent(); + DeadVals.push_back(&Sqrt); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e4d328a..b8b419d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1112,8 +1112,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { {N->getOperand(0), N->getOperand(1), CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); } else { - unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO - : AMDGPU::S_USUBO_PSEUDO; + unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO; CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {N->getOperand(0), N->getOperand(1)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9449e70..a6074ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 73b2660..5407566 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -468,6 +468,38 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { + Register Lo, Hi; + switch (MI.getOpcode()) { + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: { + // For signed operations, use sign extension + auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { + // For unsigned operations, use zero extension + auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg()); + auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo}) + .getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi}) + .getReg(0); + break; + } + default: + llvm_unreachable("Unpack min/max lowering not implemented"); + 
} + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -654,6 +686,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } case UnpackBitShift: return lowerUnpackBitShift(MI); + case UnpackMinMax: + return lowerUnpackMinMax(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 7affe5a..d937815 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -123,6 +123,7 @@ private: void lowerSplitTo32(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); + void lowerUnpackMinMax(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f413bbc..7392f4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -522,6 +522,22 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE}); + addRulesForGOpcs({G_SMIN, G_SMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + addRulesForGOpcs({G_UMIN, G_UMAX}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT // and G_FREEZE here, rest is trivially regbankselected earlier addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index d0c6910..93e0efd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -212,6 +212,7 @@ enum LoweringMethodID { VccExtToSel, UniExtToSel, UnpackBitShift, + UnpackMinMax, S_BFE, V_BFE, VgprToVccCopy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 557d87f..56807a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || + Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? 
getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: @@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { + Register DstReg = MI.getOperand(0).getReg(); + unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + // vdst, srcA, srcB, srcC, idx - OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI) + : getVGPROpMapping(DstReg, MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[4] = + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c7a91f4c..4958a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp new file mode 100644 index 0000000..50c78d8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -0,0 +1,159 @@ +//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass simplifies certain intrinsic calls when the arguments are uniform. +/// It's true that this pass has transforms that can lead to a situation where +/// some instruction whose operand was previously recognized as statically +/// uniform is later on no longer recognized as statically uniform. However, the +/// semantics of how programs execute don't (and must not, for this precise +/// reason) care about static uniformity, they only ever care about dynamic +/// uniformity. And every instruction that's downstream and cares about dynamic +/// uniformity must be convergent (and isel will introduce v_readfirstlane for +/// them if their operands can't be proven statically uniform). +/// +/// This pass is implemented as a ModulePass because intrinsic declarations +/// exist at the module scope, allowing us to skip processing entirely if no +/// declarations are present and to traverse their user lists directly when +/// they are. A FunctionPass would instead require scanning every instruction +/// in every function to find relevant intrinsics, which is far less efficient. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine" + +using namespace llvm; +using namespace llvm::AMDGPU; +using namespace llvm::PatternMatch; + +/// Wrapper for querying uniformity info that first checks locally tracked +/// instructions. +static bool +isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, + const ValueMap<const Value *, bool> &Tracker) { + Value *V = U.get(); + if (auto It = Tracker.find(V); It != Tracker.end()) + return !It->second; // divergent if marked false + return UI.isDivergentUse(U); +} + +/// Optimizes uniform intrinsics calls if their operand can be proven uniform. 
+static bool optimizeUniformIntrinsic(IntrinsicInst &II, + const UniformityInfo &UI, + ValueMap<const Value *, bool> &Tracker) { + llvm::Intrinsic::ID IID = II.getIntrinsicID(); + + switch (IID) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n'); + II.replaceAllUsesWith(Src); + II.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_ballot: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n'); + + bool Changed = false; + for (User *U : make_early_inc_range(II.users())) { + if (auto *ICmp = dyn_cast<ICmpInst>(U)) { + Value *Op0 = ICmp->getOperand(0); + Value *Op1 = ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + Value *OtherOp = Op0 == &II ? Op1 : Op0; + + if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) { + // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1 + Instruction *NotOp = + BinaryOperator::CreateNot(Src, "", ICmp->getIterator()); + Tracker[NotOp] = true; // NOT preserves uniformity + LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); + ICmp->replaceAllUsesWith(NotOp); + ICmp->eraseFromParent(); + Changed = true; + } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { + // Case: (icmp ne %ballot, 0) -> %ballot_arg + LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " + << *Src << '\n'); + ICmp->replaceAllUsesWith(Src); + ICmp->eraseFromParent(); + Changed = true; + } + } + } + // Erase the intrinsic if it has no remaining uses. + if (II.use_empty()) + II.eraseFromParent(); + return Changed; + } + default: + llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + } + return false; +} + +/// Iterates over intrinsic declarations in the module to optimize their uses. 
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { + bool IsChanged = false; + ValueMap<const Value *, bool> Tracker; + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast<IntrinsicInst>(U); + Function *ParentF = II->getFunction(); + const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +PreservedAnalyses +AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { + if (!runUniformIntrinsicCombine(M, AM)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<UniformityInfoAnalysis>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index aae56ee..13f727b68 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp + AMDGPUUniformIntrinsicCombine.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index ef63acc..71494be 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -905,7 +905,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << ":\n"; SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB); - SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); GCNRPTracker::LiveRegSet LiveIn, LiveOut; GCNRegPressure RPAtMBBEnd; @@ -931,7 +931,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { } } else { GCNUpwardRPTracker RPT(LIS); - RPT.reset(MRI, MBBEndSlot); + RPT.reset(MRI, MBBLastSlot); LiveOut = RPT.getLiveRegs(); RPAtMBBEnd = RPT.getPressure(); @@ -966,14 +966,14 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { OS << PFX " Live-out:" << llvm::print(LiveOut, MRI); if (UseDownwardTracker) - ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI)); + ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBLastSlot, LIS, MRI)); GCNRPTracker::LiveRegSet LiveThrough; for (auto [Reg, Mask] : LiveIn) { LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg); if (MaskIntersection.any()) { LaneBitmask LTMask = getRegLiveThroughMask( - MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection); + MRI, LIS, Reg, MBBStartSlot, MBBLastSlot, MaskIntersection); if (LTMask.any()) LiveThrough[Reg] = LTMask; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a9c58bb..898d1ff 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -313,8 +313,8 @@ public: /// reset tracker to the end of the \p MBB. 
void reset(const MachineBasicBlock &MBB) { - reset(MBB.getParent()->getRegInfo(), - LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); + SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB); + reset(MBB.getParent()->getRegInfo(), MBBLastSlot); } /// reset tracker to the point just after \p MI (in program order). diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1a686a9..80e985d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -103,52 +103,52 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); + addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); + addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); + addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); + addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); + addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); + addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); + addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); if 
(Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -6073,9 +6073,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(2); MachineOperand &Src1 = MI.getOperand(3); MachineOperand &Src2 = MI.getOperand(4); - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) - ? AMDGPU::S_ADDC_U32 - : AMDGPU::S_SUBB_U32; if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) @@ -6124,11 +6121,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(0); } - // clang-format off - BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()) - .add(Src0) - .add(Src1); - // clang-format on + unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO + ? AMDGPU::S_ADDC_U32 + : AMDGPU::S_SUBB_U32; + + BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); unsigned SelOpc = ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; @@ -16571,6 +16568,53 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, } } + // Eliminate setcc by using carryout from add/sub instruction + + // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo + // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi + // similarly for subtraction + + // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1 + // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0 + + if (VT == MVT::i64 && ((CC == ISD::SETULT && + sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) || + (CC == ISD::SETUGT && + sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) || + (CC == ISD::SETEQ && CRHS && CRHS->isZero() && + sd_match(LHS, m_Add(m_Value(), m_One()))))) { + bool IsAdd = LHS.getOpcode() == ISD::ADD; + + SDValue Op0 = LHS.getOperand(0); + SDValue Op1 = LHS.getOperand(1); + + SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0); + SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1); + + SDValue Op0Hi = getHiHalf64(Op0, DAG); + SDValue Op1Hi = getHiHalf64(Op1, DAG); + + SDValue NodeLo = + DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL, + DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo}); + + SDValue CarryInHi = NodeLo.getValue(1); + SDValue NodeHi = DAG.getNode(IsAdd ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY, + SL, DAG.getVTList(MVT::i32, MVT::i1), + {Op0Hi, Op1Hi, CarryInHi}); + + SDValue ResultLo = NodeLo.getValue(0); + SDValue ResultHi = NodeHi.getValue(0); + + SDValue JoinedResult = + DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi}); + + SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult); + SDValue Overflow = NodeHi.getValue(1); + DCI.CombineTo(LHS.getNode(), Result); + return Overflow; + } + if (VT != MVT::f32 && VT != MVT::f64 && (!Subtarget->has16BitInsts() || VT != MVT::f16)) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index b7dbb59..2c1a13c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1202,6 +1202,12 @@ public: unsigned getMinNumAGPRs() const { return MinNumAGPRs; } + /// Return true if an MFMA that requires at least \p NumRegs should select to + /// the AGPR form, instead of the VGPR form. + bool selectAGPRFormMFMA(unsigned NumRegs) const { + return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs; + } + // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. bool mayUseAGPRs(const Function &F) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7cfd059..6500fce 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa class CanUseAGPR_MAI<ValueType vt> { code PredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= - }] # !srl(vt.Size, 5) # ");"; + MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( + }] # !srl(vt.Size, 5) # ");"; code GISelPredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( }] # !srl(vt.Size, 5) # ");"; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2a40fb9..83c7def 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -42,7 +42,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 6d0529f..fb0928b8 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true", "Allow GP-relative addressing of global variables">; def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true", "Enable generation of duplex instruction">; -def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true", - "Use unsafe FP math">; def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", "true", "Reserve register R19">; def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", @@ -167,7 +165,6 @@ def UseHVXQFloat : 
Predicate<"HST->useHVXQFloatOps()">, def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">; def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">, AssemblerPredicate<(all_of FeatureMemNoShuf)>; -def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">; def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||" "MF->getFunction().hasOptSize()"> { let RecomputePerFunction = 1; diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 4b23670..85ce944 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>; def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>; def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>; -def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>; -def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>; def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>; def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>; -def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>; def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>; def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>; -def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>; def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>; -def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>; def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>; def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>; @@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>; def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>; def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>; -def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>; -def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>; def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>; def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>; @@ -1611,8 +1590,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt), $Rt, $Rs), $Rs, $Rt)>; -let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in { - def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; +def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{ + return N->getFlags().hasApproximateFuncs(); +}]>; +let Predicates = [HasV67], AddedComplexity = 
50 in { + def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; } let Predicates = [HasV67] in { def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index b111471..7430567 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseNewValueJumps = false; bool UseNewValueStores = false; bool UseSmallData = false; - bool UseUnsafeMath = false; bool UseZRegOps = false; bool UseHVXIEEEFPOps = false; bool UseHVXQFloatOps = false; @@ -234,7 +233,6 @@ public: bool useNewValueJumps() const { return UseNewValueJumps; } bool useNewValueStores() const { return UseNewValueStores; } bool useSmallData() const { return UseSmallData; } - bool useUnsafeMath() const { return UseUnsafeMath; } bool useZRegOps() const { return UseZRegOps; } bool useCabac() const { return UseCabac; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0afa04a..f5d8b69 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; std::string FS = FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; - // Append the preexisting target features last, so that +mattr overrides - // the "unsafe-fp-math" function attribute. - // Creating a separate target feature is not strictly necessary, it only - // exists to make "unsafe-fp-math" force creating a new subtarget. - - if (F.getFnAttribute("unsafe-fp-math").getValueAsBool()) - FS = FS.empty() ? 
"+unsafe-fp" : "+unsafe-fp," + FS; auto &I = SubtargetMap[CPU + FS]; if (!I) { diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index ba70c9e..97379d7 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3677,7 +3677,7 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Out, STI)) return true; - if (IsLikely) { + if (IsLikely && MemOffsetOp.isExpr()) { TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI); TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index eff80e5..21d8ded 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -855,6 +855,16 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def ConstantImmAsmOperandClass : AsmOperandClass { + let Name = "ConstantImm"; + let PredicateMethod = "isConstantImm"; + let RenderMethod = "addImmOperands"; +} + +def ConstantImm64: Operand<i64> { + let ParserMatchClass = ConstantImmAsmOperandClass; +} + def simm19_lsl2 : Operand<i32> { let EncoderMethod = "getSimm19Lsl2Encoding"; let DecoderMethod = "DecodeSimm19Lsl2"; @@ -2947,10 +2957,10 @@ def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs, let hasDelaySlot = 1, isCTI = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "bne\t$rt, $imm64, $offset">; def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), - (ins imm64:$imm64, brtarget:$offset), + (ins ConstantImm64:$imm64, brtarget:$offset), "beq\t$rt, $imm64, $offset">; class CondBranchPseudo<string instr_asm> : @@ -2978,7 +2988,7 @@ def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; let isCTI = 1 in class CondBranchImmPseudo<string instr_asm> : - MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, ConstantImm64:$imm, brtarget:$offset), !strconcat(instr_asm, "\t$rs, $imm, $offset")>; def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bc047a4a..a1fb665 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); - // Only logical ops can be done on v4i8 directly, others must be done + // Only logical ops can be done on v4i8/v2i32 directly, others must be done // elementwise. setOperationAction( {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, @@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, ISD::USUBSAT}, - MVT::v4i8, Expand); + {MVT::v4i8, MVT::v2i32}, Expand); // Operations not directly supported by NVPTX. 
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, @@ -689,7 +689,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index ecfb5fe..eb41588 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -334,7 +334,7 @@ static bool isLegalElementTypeForRVV(Type *EltTy, if (EltTy->isIntegerTy(64)) return Subtarget.hasVInstructionsI64(); if (EltTy->isHalfTy()) - return Subtarget.hasVInstructionsF16(); + return Subtarget.hasVInstructionsF16Minimal(); if (EltTy->isBFloatTy()) return Subtarget.hasVInstructionsBF16Minimal(); if (EltTy->isFloatTy()) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 40c05e8..5ceb477 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1520,6 +1520,8 @@ def HasVendorXqcics : Predicate<"Subtarget->hasVendorXqcics()">, AssemblerPredicate<(all_of FeatureVendorXqcics), "'Xqcics' (Qualcomm uC Conditional Select Extension)">; +def NoVendorXqcics + : Predicate<"!Subtarget->hasVendorXqcics()">; def FeatureVendorXqcicsr : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; @@ -1823,6 +1825,11 @@ def TuneConditionalCompressedMoveFusion def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">; +def TuneHasSingleElementVecFP64 + : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true", + "Certain vector FP64 operations produce a single result " + "element per cycle">; + def TuneMIPSP8700 : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", "MIPS p8700 processor">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 447f05c..5e1d07a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1571,35 +1571,42 @@ def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>; } let Predicates = [HasVendorXqcicli, IsRV32] in { -def : QCILICCPat<SETEQ, QC_LIEQ>; -def : QCILICCPat<SETNE, QC_LINE>; def : QCILICCPat<SETLT, QC_LILT>; def : QCILICCPat<SETGE, QC_LIGE>; def : QCILICCPat<SETULT, QC_LILTU>; def : QCILICCPat<SETUGE, QC_LIGEU>; -def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; -def : QCILICCIPat<SETNE, QC_LINEI, simm5>; def : QCILICCIPat<SETLT, QC_LILTI, simm5>; def : QCILICCIPat<SETGE, QC_LIGEI, simm5>; def : QCILICCIPat<SETULT, QC_LILTUI, uimm5>; def : QCILICCIPat<SETUGE, QC_LIGEUI, uimm5>; -def : QCILICCPatInv<SETNE, QC_LIEQ>; -def : QCILICCPatInv<SETEQ, QC_LINE>; def : QCILICCPatInv<SETGE, QC_LILT>; def : QCILICCPatInv<SETLT, QC_LIGE>; def : QCILICCPatInv<SETUGE, QC_LILTU>; def : QCILICCPatInv<SETULT, QC_LIGEU>; -def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; -def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; def : QCILICCIPatInv<SETGE, QC_LILTI, simm5>; def : QCILICCIPatInv<SETLT, QC_LIGEI, 
simm5>; def : QCILICCIPatInv<SETUGE, QC_LILTUI, uimm5>; def : QCILICCIPatInv<SETULT, QC_LIGEUI, uimm5>; } // Predicates = [HasVendorXqcicli, IsRV32] +// Prioritize Xqcics over these patterns. +let Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] in { +def : QCILICCPat<SETEQ, QC_LIEQ>; +def : QCILICCPat<SETNE, QC_LINE>; + +def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>; +def : QCILICCIPat<SETNE, QC_LINEI, simm5>; + +def : QCILICCPatInv<SETNE, QC_LIEQ>; +def : QCILICCPatInv<SETEQ, QC_LINE>; + +def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>; +def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>; +} // Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] + let Predicates = [HasVendorXqcics, IsRV32] in { // (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`. // These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern. @@ -1636,7 +1643,7 @@ def : QCISELECTCCIPat<SETNE, QC_SELECTNEI>; } let Predicates = [HasVendorXqcilsm, IsRV32] in { -def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), +def : Pat<(qc_setwmi (i32 GPR:$rs3), GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; } // Predicates = [HasVendorXqcilsm, IsRV32] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index a29b7dd..57fbaa0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -634,56 +634,56 @@ def : PatGpr<bswap, REV8_RV64, i64>; let Predicates = [HasStdExtZbkb] in { def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; +def : Pat<(or (shl zexti8:$rs2, (XLenVT 8)), + zexti8:$rs1), + (PACKH zexti8:$rs1, zexti8:$rs2)>; def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), 0xFFFF), + (PACKH zexti8:$rs1, GPR:$rs2)>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbkb] let Predicates = [HasStdExtZbkb, IsRV32] in { -def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (or zexti16:$rs1, (shl GPR:$rs2, (i32 16)))), + (PACK zexti16:$rs1, GPR:$rs2)>; -def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i32 (or (shl GPR:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. 
-def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; +def : Pat<(i32 (or (or (shl GPR:$op1rs2, (XLenVT 24)), + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1)), + (PACK zexti16:$rs1, + (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; } let Predicates = [HasStdExtZbkb, IsRV64] in { -def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i64 (or zexti32:$rs1, (shl GPR:$rs2, (i64 32)))), + (PACK zexti32:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i64 (or (shl zexti8:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, zexti8:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + (shl zexti8:$rs1, (XLenVT 16))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)), - (zexti16 (i64 GPR:$rs1))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1), + (PACKW zexti16:$rs1, GPR:$rs2)>; def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), - (zexti16 (i64 GPR:$rs1)))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1)), + (PACKW zexti16:$rs1, GPR:$rs2)>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value, and bits [63:32] being @@ -691,35 +691,35 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), // also be a packh, it can be matched separately. def : Pat<(binop_allwusers<or> (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // We need to manually reassociate the patterns because of the binop_allwusers. def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), + (or zexti16:$rs1, + (shl zexti8:$op1rs1, (XLenVT 16))), (shl GPR:$op1rs2, (XLenVT 24))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl GPR:$op1rs1, (XLenVT 24))), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (or zexti16:$rs1, + (shl GPR:$op1rs2, (XLenVT 24))), + (shl zexti8:$op1rs1, (XLenVT 16))), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + (sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))), + (PACKW GPR:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // Match a pattern of 2 halfwords being inserted into bits [63:32], with bits // bits [31:0] coming from a zero extended value. 
We can use pack with packw for // bits [63:32]. If bits [63:31] can also be a packw, it can be matched // separately. def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)), - (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))), - (zexti32 (i64 GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti16:$op1rs1, (i64 32))), + zexti32:$rs1), + (PACK zexti32:$rs1, + (XLenVT (PACKW zexti16:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 6d86aff..3658817 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -14,6 +14,10 @@ // otherwise. def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>; +// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64 +// is enabled. +def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>; + // Returns true if this is the sext.w pattern, addiw rd, rs1, 0. def isSEXT_W : TIIPredicate<"isSEXT_W", diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 17a7948..e86431f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390", FeatureStdExtZvl1024b, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone], - SiFiveIntelligenceTuneFeatures>; + !listconcat(SiFiveIntelligenceTuneFeatures, + [TuneHasSingleElementVecFP64])>; defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3e07eff..f863392a 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN, ProcResourceKind VL, ProcResourceKind VS, ProcResourceKind VCQ, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled = false, bit hasFastGather = false> { // Branching @@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN, // 13. 
Vector Floating-Point Instructions foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, isF=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8); - defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; - let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - } - defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4); - let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>; - // min max require merge - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + if !eq(sew, 64) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF", + "WriteVFMulAddV", "WriteVFMulAddF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, 
VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } + let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + // min max require merge + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN, // Widening foreach mx = SchedMxListW in { foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in - defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = SiFive7GetCyclesDefault<mx>.c; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; @@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; } - defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, 
sew, IsWorstCase>; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } defvar Cycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c; @@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN, } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesNarrowing<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance { /// eventually be supplied by different SchedMachineModels. 
multiclass SiFive7SchedResources<int vlen, bit extraVALU, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled, bit hasFastGather> { defm SiFive7 : SiFive7ProcResources<extraVALU>; @@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU, : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB, SiFive7IDiv, SiFive7FDiv, SiFive7VA1, SiFive7VA1OrVA2, SiFive7VL, SiFive7VS, - SiFive7VCQ, fpLatencies, isFP64Throttled, - hasFastGather>; + SiFive7VCQ, fpLatencies, hasFastGather>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel { bit HasExtraVALU = false; SiFive7FPLatencies FPLatencies; - bit IsFP64Throttled = false; bit HasFastGather = false; string Name = !subst("Model", "", !subst("SiFive7", "", NAME)); @@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> { def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { let HasExtraVALU = true; let FPLatencies = SiFive7LowFPLatencies; - let IsFP64Throttled = true; let HasFastGather = true; } @@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU, model.FPLatencies, - model.IsFP64Throttled, model.HasFastGather>; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 01a4308..d11b446 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, IsWorstCase>; } +multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, int sew, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. 
Each multiclass is responsible for defining diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index e8c849e..28a1690 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -46,7 +46,6 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 20f03b0..60d39c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index 278ad7c..e621bcd44 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -14,7 +14,6 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 1811492..5b149f8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index af79070..275165d 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -184,8 +184,8 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI, // Output the TLS marker if present. 
if ((unsigned)OpNum + 1 < MI->getNumOperands()) { const MCOperand &MO = MI->getOperand(OpNum + 1); - const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr()); - switch (refExp.getSpecifier()) { + const MCSymbolRefExpr &RefExp = cast<MCSymbolRefExpr>(*MO.getExpr()); + switch (RefExp.getSpecifier()) { case SystemZ::S_TLSGD: O << ":tls_gdcall:"; break; @@ -195,7 +195,7 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI, default: llvm_unreachable("Unexpected symbol kind"); } - O << refExp.getSymbol().getName(); + O << RefExp.getSymbol().getName(); } } diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index fce6393..8c31579 100644 --- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -13,10 +13,9 @@ using namespace llvm; -SystemZConstantPoolValue:: -SystemZConstantPoolValue(const GlobalValue *gv, - SystemZCP::SystemZCPModifier modifier) - : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {} +SystemZConstantPoolValue::SystemZConstantPoolValue( + const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier) + : MachineConstantPoolValue(GV->getType()), GV(GV), Modifier(Modifier) {} SystemZConstantPoolValue * SystemZConstantPoolValue::Create(const GlobalValue *GV, diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 34d58e0..5313fba 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -352,10 +352,9 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { // Similarly, a group-ending SU may either fit well (last in group), or // end the group prematurely. 
if (SC->EndGroup) { - unsigned resultingGroupSize = - (CurrGroupSize + getNumDecoderSlots(SU)); - if (resultingGroupSize < 3) - return (3 - resultingGroupSize); + unsigned ResultingGroupSize = (CurrGroupSize + getNumDecoderSlots(SU)); + if (ResultingGroupSize < 3) + return (3 - ResultingGroupSize); return -1; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9580ade..1cfcb1f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" -#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 3bc46af..6dd43b2 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -547,7 +547,7 @@ unsigned X86TargetLowering::getAddressSpace() const { static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); + TargetTriple.isAndroid(); } static Constant* SegmentOffset(IRBuilderBase &IRB, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 764ff998..4b3ddbd 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -592,10 +592,10 @@ def : Pat<(X86mcvttp2sis (v2f64 (X86VBroadcastld64 addr:$src)), (VCVTTPD2DQSZ128rmbkz VK2WM:$mask, addr:$src)>; // Patterns VCVTTPD2UDQSZ128 -def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))), - (VCVTTPD2UDQSZ128rmb addr:$src)>; def : Pat<(v4i32 (X86cvttp2uis (v2f64 VR128X:$src))), (VCVTTPD2UDQSZ128rr VR128X:$src)>; +def : Pat<(v4i32 (X86cvttp2uis (loadv2f64 addr:$src))), + (VCVTTPD2UDQSZ128rm addr:$src)>; def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQSZ128rmb addr:$src)>; def : Pat<(X86mcvttp2uis (v2f64 VR128X:$src), (v4i32 VR128X:$src0), diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 805bdb4..bbbac45 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -28,8 +28,12 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -39,6 +43,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "aggressive-instcombine" +namespace llvm { +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} + STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded"); STATISTIC(NumGuardedRotates, "Number of guarded rotates transformed into funnel shifts"); @@ -599,6 +607,14 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) { auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0)); 
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz); + // The true branch of select handles the cttz(0) case, which is rare. + if (!ProfcheckDisableMetadataFixes) { + if (Instruction *SelectI = dyn_cast<Instruction>(Select)) + SelectI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights()); + } + // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target // it should be handled as: `cttz(x) & (typeSize - 1)`. diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index 9b9e2ba..9150b58 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -459,7 +459,7 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) { Value *Op0 = I->getOperand(0); Value *LHS = getReducedOperand(I->getOperand(1), SclTy); Value *RHS = getReducedOperand(I->getOperand(2), SclTy); - Res = Builder.CreateSelect(Op0, LHS, RHS); + Res = Builder.CreateSelect(Op0, LHS, RHS, "", I); break; } case Instruction::PHI: { diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp index 9115946..f166fef 100644 --- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp @@ -24,6 +24,9 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -33,6 +36,11 @@ using namespace llvm; #define DEBUG_TYPE "coro-annotation-elide" +static cl::opt<float> CoroElideBranchRatio( + "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden, + cl::desc("Minimum BranchProbability to consider eliding a coroutine.")); +extern cl::opt<unsigned> MinBlockCounterExecution; + static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) { for (Instruction &I : F->getEntryBlock()) if (!isa<AllocaInst>(&I)) @@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine(); bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe); if (IsCallerPresplitCoroutine && HasAttr) { + BranchProbability MinBranchProbability( + static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution), + MinBlockCounterExecution); + + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller); + + auto Prob = BranchProbability::getBranchProbability( + BFI.getBlockFreq(CB->getParent()).getFrequency(), + BFI.getEntryFreq().getFrequency()); + + if (Prob < MinBranchProbability) { + ORE.emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller) + << "'" << ore::NV("callee", Callee->getName()) + << "' not elided in '" + << ore::NV("caller", Caller->getName()) + << "' because of low probability: " + << ore::NV("probability", Prob) << " (threshold: " + << ore::NV("threshold", MinBranchProbability) << ")"; + }); + continue; + } + auto *CallerN = CG.lookup(*Caller); auto *CallerC = CallerN ?
CG.lookupSCC(*CallerN) : nullptr; // If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller @@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' elided in '" << ore::NV("caller", Caller->getName()) - << "'"; + << "' (probability: " << ore::NV("probability", Prob) << ")"; }); FAM.invalidate(*Caller, PreservedAnalyses::none()); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index cfdfd94..5066a99 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1033,19 +1033,17 @@ private: }; } // namespace -namespace llvm { template <> -struct DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<typename CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<typename CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> -struct DenseMapInfo<IndexCall> +struct llvm::DenseMapInfo<IndexCall> : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; -} // end namespace llvm namespace { diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 2583249..1a00d17 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio( "outline candidate and original function")); // Used to tune the minimum number of execution counts needed in the predecessor // block to the cold edge. ie. confidence interval. -static cl::opt<unsigned> +cl::opt<unsigned> MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid")); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index ac41fdd..2d5cb82 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -372,9 +372,7 @@ struct VTableSlot { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<VTableSlot> { +template <> struct llvm::DenseMapInfo<VTableSlot> { static VTableSlot getEmptyKey() { return {DenseMapInfo<Metadata *>::getEmptyKey(), DenseMapInfo<uint64_t>::getEmptyKey()}; @@ -393,7 +391,7 @@ template <> struct DenseMapInfo<VTableSlot> { } }; -template <> struct DenseMapInfo<VTableSlotSummary> { +template <> struct llvm::DenseMapInfo<VTableSlotSummary> { static VTableSlotSummary getEmptyKey() { return {DenseMapInfo<StringRef>::getEmptyKey(), DenseMapInfo<uint64_t>::getEmptyKey()}; @@ -412,8 +410,6 @@ template <> struct DenseMapInfo<VTableSlotSummary> { } }; -} // end namespace llvm - // Returns true if the function must be unreachable based on ValueInfo. 
// // In particular, identifies a function as unreachable in the following diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 9b272c4..3ddf182 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -28,6 +28,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" +namespace llvm { +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} + /// This is the complement of getICmpCode, which turns an opcode and two /// operands into either a constant true or false, or a brand new ICmp /// instruction. The sign is passed in to determine which kind of predicate to @@ -1272,7 +1276,8 @@ Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) { static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, bool IsLogical, InstCombiner::BuilderTy &Builder, - const SimplifyQuery &Q) { + const SimplifyQuery &Q, + Instruction &I) { // Match an equality compare with a non-poison constant as Cmp0. // Also, give up if the compare can be constant-folded to avoid looping. CmpPredicate Pred0; @@ -1306,9 +1311,12 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; SubstituteCmp = Builder.CreateICmp(Pred1, Y, C); } - if (IsLogical) - return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp) - : Builder.CreateLogicalOr(Cmp0, SubstituteCmp); + if (IsLogical) { + Instruction *MDFrom = + ProfcheckDisableMetadataFixes && isa<SelectInst>(I) ? nullptr : &I; + return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp, "", MDFrom) + : Builder.CreateLogicalOr(Cmp0, SubstituteCmp, "", MDFrom); + } return Builder.CreateBinOp(IsAnd ? Instruction::And : Instruction::Or, Cmp0, SubstituteCmp); } @@ -3396,13 +3404,13 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, /*IsLogical*/ false, Builder)) return V; - if (Value *V = - foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, Builder, Q)) + if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, + Builder, Q, I)) return V; // We can convert this case to bitwise and, because both operands are used // on the LHS, and as such poison from both will propagate. - if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd, - /*IsLogical=*/false, Builder, Q)) { + if (Value *V = foldAndOrOfICmpsWithConstEq( + RHS, LHS, IsAnd, /*IsLogical=*/false, Builder, Q, I)) { // If RHS is still used, we should drop samesign flag. if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) { RHS->setSameSign(false); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 56194fe..4c9b10a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2202,6 +2202,11 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { return commonCastTransforms(CI); } +Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) { + // FIXME: Implement variants of ptrtoint folds. + return commonCastTransforms(CI); +} + /// This input value (which is known to have vector type) is being zero extended /// or truncated to the specified vector type. 
Since the zext/trunc is done /// using an integer type, we have a (bitcast(cast(bitcast))) pattern, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index e01c145..218aaf9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -143,6 +143,7 @@ public: Instruction *visitUIToFP(CastInst &CI); Instruction *visitSIToFP(CastInst &CI); Instruction *visitPtrToInt(PtrToIntInst &CI); + Instruction *visitPtrToAddr(PtrToAddrInst &CI); Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 5c747bb..9815644 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1069,27 +1069,22 @@ struct LoweredPHIRecord { }; } // namespace -namespace llvm { - template<> - struct DenseMapInfo<LoweredPHIRecord> { - static inline LoweredPHIRecord getEmptyKey() { - return LoweredPHIRecord(nullptr, 0); - } - static inline LoweredPHIRecord getTombstoneKey() { - return LoweredPHIRecord(nullptr, 1); - } - static unsigned getHashValue(const LoweredPHIRecord &Val) { - return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^ - (Val.Width>>3); - } - static bool isEqual(const LoweredPHIRecord &LHS, - const LoweredPHIRecord &RHS) { - return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift && - LHS.Width == RHS.Width; - } - }; -} // namespace llvm - +template <> struct llvm::DenseMapInfo<LoweredPHIRecord> { + static inline LoweredPHIRecord getEmptyKey() { + return LoweredPHIRecord(nullptr, 0); + } + static inline LoweredPHIRecord getTombstoneKey() { + return LoweredPHIRecord(nullptr, 1); + } + static unsigned getHashValue(const LoweredPHIRecord &Val) { + return DenseMapInfo<PHINode *>::getHashValue(Val.PN) ^ (Val.Shift >> 3) ^ + (Val.Width >> 3); + } + static bool isEqual(const LoweredPHIRecord &LHS, + const LoweredPHIRecord &RHS) { + return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift && LHS.Width == RHS.Width; + } +}; /// This is an integer PHI and we know that it has an illegal type: see if it is /// only used by trunc or trunc(lshr) operations. 
If so, we split the PHI into diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 3704ad7..860f8f7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -600,9 +600,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, !IsRISCV64 && !IsLoongArch64 && !(Mapping.Offset & (Mapping.Offset - 1)) && Mapping.Offset != kDynamicShadowSentinel; - bool IsAndroidWithIfuncSupport = - IsAndroid && !TargetTriple.isAndroidVersionLT(21); - Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb; + Mapping.InGlobal = ClWithIfunc && IsAndroid && IsArmOrThumb; return Mapping; } diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 5b8ea15..b74a070 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IRB.CreateIsNull(Load), &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); Load->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } @@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, EstimatedStackSize >= Options.StackDepthCallbackMin) { if (InsertBefore) IRB.SetInsertPoint(InsertBefore); - IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge(); + auto Call = IRB.CreateCall(SanCovStackDepthCallback); + if (EntryLoc) + Call->setDebugLoc(EntryLoc); + Call->setCannotMerge(); } } else { // Check stack depth. If it's the deepest so far, record it. 
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto ThenTerm = SplitBlockAndInsertIfThen( IsStackLower, &*IP, false, MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); - IRBuilder<> ThenIRB(ThenTerm); + InstrumentationIRBuilder ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); + if (EntryLoc) + Store->setDebugLoc(EntryLoc); LowestStack->setNoSanitizeMetadata(); Store->setNoSanitizeMetadata(); } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index e448230..e5935f4 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -61,6 +61,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DomTreeUpdater.h" @@ -147,19 +148,16 @@ public: class DFAJumpThreading { public: - DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, + DFAJumpThreading(AssumptionCache *AC, DomTreeUpdater *DTU, LoopInfo *LI, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} + : AC(AC), DTU(DTU), LI(LI), TTI(TTI), ORE(ORE) {} bool run(Function &F); bool LoopInfoBroken; private: void - unfoldSelectInstrs(DominatorTree *DT, - const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { - // TODO: Have everything use a single lazy DTU - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + unfoldSelectInstrs(const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts); while (!Stack.empty()) { @@ -167,7 +165,7 @@ private: std::vector<SelectInstToUnfold> NewSIsToUnfold; std::vector<BasicBlock *> NewBBs; - unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); + unfold(DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); // Put newly discovered select instructions into the work list. llvm::append_range(Stack, NewSIsToUnfold); @@ -180,7 +178,7 @@ private: std::vector<BasicBlock *> *NewBBs); AssumptionCache *AC; - DominatorTree *DT; + DomTreeUpdater *DTU; LoopInfo *LI; TargetTransformInfo *TTI; OptimizationRemarkEmitter *ORE; @@ -382,19 +380,28 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { - OS << "< "; - for (const BasicBlock *BB : Path) { - std::string BBName; - if (BB->hasName()) - raw_string_ostream(BBName) << BB->getName(); - else - raw_string_ostream(BBName) << BB; - OS << BBName << " "; - } - OS << ">"; + auto BBNames = llvm::map_range( + Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); }); + OS << "< " << llvm::join(BBNames, ", ") << " >"; return OS; } +/// Helper to get the successor corresponding to a particular case value for +/// a switch statement. 
+static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, + const APInt &NextState) { + BasicBlock *NextCase = nullptr; + for (auto Case : Switch->cases()) { + if (Case.getCaseValue()->getValue() == NextState) { + NextCase = Case.getCaseSuccessor(); + break; + } + } + if (!NextCase) + NextCase = Switch->getDefaultDest(); + return NextCase; +} + namespace { /// ThreadingPath is a path in the control flow of a loop that can be threaded /// by cloning necessary basic blocks and replacing conditional branches with @@ -407,6 +414,10 @@ struct ThreadingPath { ExitVal = V->getValue(); IsExitValSet = true; } + void setExitValue(const APInt &V) { + ExitVal = V; + IsExitValSet = true; + } bool isExitValueSet() const { return IsExitValSet; } /// Determinator is the basic block that determines the next state of the DFA. @@ -423,7 +434,7 @@ struct ThreadingPath { } void print(raw_ostream &OS) const { - OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]"; + OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]"; } private: @@ -589,44 +600,8 @@ struct AllSwitchPaths { BasicBlock *getSwitchBlock() { return SwitchBlock; } void run() { - StateDefMap StateDef = getStateDefMap(); - if (StateDef.empty()) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", - Switch) - << "Switch instruction is not predictable."; - }); - return; - } - - auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0)); - auto *SwitchPhiDefBB = SwitchPhi->getParent(); - VisitedBlocks VB; - // Get paths from the determinator BBs to SwitchPhiDefBB - std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); - if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { - TPaths = std::move(PathsToPhiDef); - return; - } - - assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); - auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); - // Find and append paths from SwitchPhiDefBB to SwitchBlock. - PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); - if (PathsToSwitchBB.empty()) - return; - - std::vector<ThreadingPath> TempList; - for (const ThreadingPath &Path : PathsToPhiDef) { - for (const PathType &PathToSw : PathsToSwitchBB) { - ThreadingPath PathCopy(Path); - PathCopy.appendExcludingFirst(PathToSw); - TempList.push_back(PathCopy); - } - } - TPaths = std::move(TempList); + findTPaths(); + unifyTPaths(); } private: @@ -818,6 +793,69 @@ private: return Res; } + // Find all threadable paths. + void findTPaths() { + StateDefMap StateDef = getStateDefMap(); + if (StateDef.empty()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", + Switch) + << "Switch instruction is not predictable."; + }); + return; + } + + auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0)); + auto *SwitchPhiDefBB = SwitchPhi->getParent(); + VisitedBlocks VB; + // Get paths from the determinator BBs to SwitchPhiDefBB + std::vector<ThreadingPath> PathsToPhiDef = + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { + TPaths = std::move(PathsToPhiDef); + return; + } + + assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); + // Find and append paths from SwitchPhiDefBB to SwitchBlock. 
+ PathsType PathsToSwitchBB = + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); + if (PathsToSwitchBB.empty()) + return; + + std::vector<ThreadingPath> TempList; + for (const ThreadingPath &Path : PathsToPhiDef) { + for (const PathType &PathToSw : PathsToSwitchBB) { + ThreadingPath PathCopy(Path); + PathCopy.appendExcludingFirst(PathToSw); + TempList.push_back(PathCopy); + } + } + TPaths = std::move(TempList); + } + + // Two states are equivalent if they have the same switch destination. + // Unify the states in different threading paths if the states are equivalent. + void unifyTPaths() { + llvm::SmallDenseMap<BasicBlock *, APInt> DestToState; + for (ThreadingPath &Path : TPaths) { + APInt NextState = Path.getExitValue(); + BasicBlock *Dest = getNextCaseSuccessor(Switch, NextState); + auto StateIt = DestToState.find(Dest); + if (StateIt == DestToState.end()) { + DestToState.insert({Dest, NextState}); + continue; + } + + if (NextState != StateIt->second) { + LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to " + << StateIt->second << "\n"); + Path.setExitValue(StateIt->second); + } + } + } + unsigned NumVisited = 0; SwitchInst *Switch; BasicBlock *SwitchBlock; @@ -828,11 +866,11 @@ private: }; struct TransformDFA { - TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, + TransformDFA(AllSwitchPaths *SwitchPaths, DomTreeUpdater *DTU, AssumptionCache *AC, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, SmallPtrSet<const Value *, 32> EphValues) - : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), + : SwitchPaths(SwitchPaths), DTU(DTU), AC(AC), TTI(TTI), ORE(ORE), EphValues(EphValues) {} bool run() { @@ -1008,19 +1046,16 @@ private: SmallPtrSet<BasicBlock *, 16> BlocksToClean; BlocksToClean.insert_range(successors(SwitchBlock)); - { - DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); - for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { - createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU); - NumPaths++; - } - - // After all paths are cloned, now update the last successor of the cloned - // path so it skips over the switch statement - for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) - updateLastSuccessor(TPath, DuplicateMap, &DTU); + for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { + createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, DTU); + NumPaths++; } + // After all paths are cloned, now update the last successor of the cloned + // path so it skips over the switch statement + for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) + updateLastSuccessor(TPath, DuplicateMap, DTU); + // For each instruction that was cloned and used outside, update its uses updateSSA(NewDefs); @@ -1124,7 +1159,7 @@ private: } // SSAUpdater handles phi placement and renaming uses with the appropriate // value. - SSAUpdate.RewriteAllUses(DT); + SSAUpdate.RewriteAllUses(&DTU->getDomTree()); } /// Clones a basic block, and adds it to the CFG. @@ -1341,28 +1376,13 @@ private: return It != ClonedBBs.end() ? (*It).BB : nullptr; } - /// Helper to get the successor corresponding to a particular case value for - /// a switch statement.
- BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) { - BasicBlock *NextCase = nullptr; - for (auto Case : Switch->cases()) { - if (Case.getCaseValue()->getValue() == NextState) { - NextCase = Case.getCaseSuccessor(); - break; - } - } - if (!NextCase) - NextCase = Switch->getDefaultDest(); - return NextCase; - } - /// Returns true if IncomingBB is a predecessor of BB. bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { return llvm::is_contained(predecessors(BB), IncomingBB); } AllSwitchPaths *SwitchPaths; - DominatorTree *DT; + DomTreeUpdater *DTU; AssumptionCache *AC; TargetTransformInfo *TTI; OptimizationRemarkEmitter *ORE; @@ -1405,7 +1425,7 @@ bool DFAJumpThreading::run(Function &F) { << "candidate for jump threading\n"); LLVM_DEBUG(SI->dump()); - unfoldSelectInstrs(DT, Switch.getSelectInsts()); + unfoldSelectInstrs(Switch.getSelectInsts()); if (!Switch.getSelectInsts().empty()) MadeChanges = true; @@ -1427,7 +1447,7 @@ bool DFAJumpThreading::run(Function &F) { } #ifdef NDEBUG - LI->verify(*DT); + LI->verify(DTU->getDomTree()); #endif SmallPtrSet<const Value *, 32> EphValues; @@ -1435,13 +1455,15 @@ bool DFAJumpThreading::run(Function &F) { CodeMetrics::collectEphemeralValues(&F, AC, EphValues); for (AllSwitchPaths SwitchPaths : ThreadableLoops) { - TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); + TransformDFA Transform(&SwitchPaths, DTU, AC, TTI, ORE, EphValues); if (Transform.run()) MadeChanges = LoopInfoBroken = true; } + DTU->flush(); + #ifdef EXPENSIVE_CHECKS - assert(DT->verify(DominatorTree::VerificationLevel::Full)); + assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full)); verifyFunction(F, &dbgs()); #endif @@ -1456,7 +1478,9 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F, LoopInfo &LI = AM.getResult<LoopAnalysis>(F); TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); OptimizationRemarkEmitter ORE(&F); - DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE); + + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + DFAJumpThreading ThreadImpl(&AC, &DTU, &LI, &TTI, &ORE); if (!ThreadImpl.run(F)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 0f8cc6c..2afa7b7 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -108,7 +108,7 @@ struct SimpleValue { // of instruction handled below (UnaryOperator, etc.). if (CallInst *CI = dyn_cast<CallInst>(Inst)) { if (Function *F = CI->getCalledFunction()) { - switch ((Intrinsic::ID)F->getIntrinsicID()) { + switch (F->getIntrinsicID()) { case Intrinsic::experimental_constrained_fadd: case Intrinsic::experimental_constrained_fsub: case Intrinsic::experimental_constrained_fmul: @@ -154,9 +154,7 @@ struct SimpleValue { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<SimpleValue> { +template <> struct llvm::DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -169,8 +167,6 @@ template <> struct DenseMapInfo<SimpleValue> { static bool isEqual(SimpleValue LHS, SimpleValue RHS); }; -} // end namespace llvm - /// Match a 'select' including an optional 'not's of the condition. 
static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A, Value *&B, @@ -509,9 +505,7 @@ struct CallValue { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo<CallValue> { +template <> struct llvm::DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -524,8 +518,6 @@ template <> struct DenseMapInfo<CallValue> { static bool isEqual(CallValue LHS, CallValue RHS); }; -} // end namespace llvm - unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; @@ -580,9 +572,7 @@ struct GEPValue { } // namespace -namespace llvm { - -template <> struct DenseMapInfo<GEPValue> { +template <> struct llvm::DenseMapInfo<GEPValue> { static inline GEPValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } @@ -595,8 +585,6 @@ template <> struct DenseMapInfo<GEPValue> { static bool isEqual(const GEPValue &LHS, const GEPValue &RHS); }; -} // end namespace llvm - unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) { auto *GEP = cast<GetElementPtrInst>(Val.Inst); if (Val.ConstantOffset.has_value()) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 638952a..3a8ade8 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -170,9 +170,7 @@ struct llvm::GVNPass::Expression { } }; -namespace llvm { - -template <> struct DenseMapInfo<GVNPass::Expression> { +template <> struct llvm::DenseMapInfo<GVNPass::Expression> { static inline GVNPass::Expression getEmptyKey() { return ~0U; } static inline GVNPass::Expression getTombstoneKey() { return ~1U; } @@ -188,8 +186,6 @@ template <> struct DenseMapInfo<GVNPass::Expression> { } }; -} // end namespace llvm - /// Represents a particular available value that we know how to materialize. /// Materialization of an AvailableValue never fails. An AvailableValue is /// implicitly associated with a rematerialization point which is the @@ -2084,13 +2080,6 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) { return Changed; } -static bool hasUsersIn(Value *V, BasicBlock *BB) { - return any_of(V->users(), [BB](User *U) { - auto *I = dyn_cast<Instruction>(U); - return I && I->getParent() == BB; - }); -} - bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { Value *V = IntrinsicI->getArgOperand(0); @@ -2149,85 +2138,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { } Constant *True = ConstantInt::getTrue(V->getContext()); - bool Changed = false; - - for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { - BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); - - // This property is only true in dominated successors, propagateEquality - // will check dominance for us. - Changed |= propagateEquality(V, True, Edge, false); - } - - // We can replace assume value with true, which covers cases like this: - // call void @llvm.assume(i1 %cmp) - // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceOperandsWithMap[V] = True; - - // Similarly, after assume(!NotV) we know that NotV == false. - Value *NotV; - if (match(V, m_Not(m_Value(NotV)))) - ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext()); - - // If we find an equality fact, canonicalize all dominated uses in this block - // to one of the two values. We heuristically choice the "oldest" of the - // two where age is determined by value number. 
(Note that propagateEquality - // above handles the cross block case.) - // - // Key case to cover are: - // 1) - // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen - // call void @llvm.assume(i1 %cmp) - // ret float %0 ; will change it to ret float 3.000000e+00 - // 2) - // %load = load float, float* %addr - // %cmp = fcmp oeq float %load, %0 - // call void @llvm.assume(i1 %cmp) - // ret float %load ; will change it to ret float %0 - if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->isEquivalence()) { - Value *CmpLHS = CmpI->getOperand(0); - Value *CmpRHS = CmpI->getOperand(1); - // Heuristically pick the better replacement -- the choice of heuristic - // isn't terribly important here, but the fact we canonicalize on some - // replacement is for exposing other simplifications. - // TODO: pull this out as a helper function and reuse w/ existing - // (slightly different) logic. - if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS)) - std::swap(CmpLHS, CmpRHS); - if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS)) - std::swap(CmpLHS, CmpRHS); - if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) || - (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) { - // Move the 'oldest' value to the right-hand side, using the value - // number as a proxy for age. - uint32_t LVN = VN.lookupOrAdd(CmpLHS); - uint32_t RVN = VN.lookupOrAdd(CmpRHS); - if (LVN < RVN) - std::swap(CmpLHS, CmpRHS); - } - - // Handle degenerate case where we either haven't pruned a dead path or a - // removed a trivial assume yet. - if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) - return Changed; - - LLVM_DEBUG(dbgs() << "Replacing dominated uses of " - << *CmpLHS << " with " - << *CmpRHS << " in block " - << IntrinsicI->getParent()->getName() << "\n"); - - // Setup the replacement map - this handles uses within the same block. - if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) - ReplaceOperandsWithMap[CmpLHS] = CmpRHS; - - // NOTE: The non-block local cases are handled by the call to - // propagateEquality above; this block is just about handling the block - // local cases. TODO: There's a bunch of logic in propagateEqualiy which - // isn't duplicated for the block local case, can we share it somehow? - } - } - return Changed; + return propagateEquality(V, True, IntrinsicI); } static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { @@ -2526,39 +2437,28 @@ void GVNPass::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const { - bool Changed = false; - for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Use &Operand = Instr->getOperandUse(OpNum); - auto It = ReplaceOperandsWithMap.find(Operand.get()); - if (It != ReplaceOperandsWithMap.end()) { - const DataLayout &DL = Instr->getDataLayout(); - if (!canReplacePointersInUseIfEqual(Operand, It->second, DL)) - continue; - - LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " - << *It->second << " in instruction " << *Instr << '\n'); - Instr->setOperand(OpNum, It->second); - Changed = true; - } - } - return Changed; -} - -/// The given values are known to be equal in every block +/// The given values are known to be equal in every use /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -/// If DominatesByEdge is false, then it means that we will propagate the RHS -/// value starting from the end of Root.Start. 
-bool GVNPass::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root, - bool DominatesByEdge) { +/// The Root may either be a basic block edge (for conditions) or an +/// instruction (for assumes). +bool GVNPass::propagateEquality( + Value *LHS, Value *RHS, + const std::variant<BasicBlockEdge, Instruction *> &Root) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; - // For speed, compute a conservative fast approximation to - // DT->dominates(Root, Root.getEnd()); - const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT); + SmallVector<const BasicBlock *> DominatedBlocks; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) { + // For speed, compute a conservative fast approximation to + // DT->dominates(Root, Root.getEnd()); + if (isOnlyReachableViaThisEdge(*Edge, DT)) + DominatedBlocks.push_back(Edge->getEnd()); + } else { + Instruction *I = std::get<Instruction *>(Root); + for (const auto *Node : DT->getNode(I->getParent())->children()) + DominatedBlocks.push_back(Node->getBlock()); + } while (!Worklist.empty()) { std::pair<Value*, Value*> Item = Worklist.pop_back_val(); @@ -2606,9 +2506,9 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // using the leader table is about compiling faster, not optimizing better). // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. - if (RootDominatesEnd && !isa<Instruction>(RHS) && - canReplacePointersIfEqual(LHS, RHS, DL)) - LeaderTable.insert(LVN, RHS, Root.getEnd()); + if (!isa<Instruction>(RHS) && canReplacePointersIfEqual(LHS, RHS, DL)) + for (const BasicBlock *BB : DominatedBlocks) + LeaderTable.insert(LVN, RHS, BB); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will @@ -2618,12 +2518,14 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, auto CanReplacePointersCallBack = [&DL](const Use &U, const Value *To) { return canReplacePointersInUseIfEqual(U, To, DL); }; - unsigned NumReplacements = - DominatesByEdge - ? replaceDominatedUsesWithIf(LHS, RHS, *DT, Root, - CanReplacePointersCallBack) - : replaceDominatedUsesWithIf(LHS, RHS, *DT, Root.getStart(), - CanReplacePointersCallBack); + unsigned NumReplacements; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) + NumReplacements = replaceDominatedUsesWithIf( + LHS, RHS, *DT, *Edge, CanReplacePointersCallBack); + else + NumReplacements = replaceDominatedUsesWithIf( + LHS, RHS, *DT, std::get<Instruction *>(Root), + CanReplacePointersCallBack); if (NumReplacements > 0) { Changed = true; @@ -2682,26 +2584,45 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, // If the number we were assigned was brand new then there is no point in // looking for an instruction realizing it: there cannot be one! if (Num < NextNum) { - Value *NotCmp = findLeader(Root.getEnd(), Num); - if (NotCmp && isa<Instruction>(NotCmp)) { - unsigned NumReplacements = - DominatesByEdge - ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) - : replaceDominatedUsesWith(NotCmp, NotVal, *DT, - Root.getStart()); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; - // Cached information for anything that uses NotCmp will be invalid. 
- if (MD) - MD->invalidateCachedPointerInfo(NotCmp); + for (const auto &Entry : LeaderTable.getLeaders(Num)) { + // Only look at leaders that either dominate the start of the edge, + // or are dominated by the end. This check is not necessary for + // correctness, it only discards cases for which the following + // use replacement will not work anyway. + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) { + if (!DT->dominates(Entry.BB, Edge->getStart()) && + !DT->dominates(Edge->getEnd(), Entry.BB)) + continue; + } else { + auto *InstBB = std::get<Instruction *>(Root)->getParent(); + if (!DT->dominates(Entry.BB, InstBB) && + !DT->dominates(InstBB, Entry.BB)) + continue; + } + + Value *NotCmp = Entry.Val; + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements; + if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) + NumReplacements = + replaceDominatedUsesWith(NotCmp, NotVal, *DT, *Edge); + else + NumReplacements = replaceDominatedUsesWith( + NotCmp, NotVal, *DT, std::get<Instruction *>(Root)); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + // Cached information for anything that uses NotCmp will be invalid. + if (MD) + MD->invalidateCachedPointerInfo(NotCmp); + } } } // Ensure that any instruction in scope that gets the "A < B" value number // is replaced with false. // The leader table only tracks basic blocks, not edges. Only add to if we // have the simple case where the edge dominates the end. - if (RootDominatesEnd) - LeaderTable.insert(Num, NotVal, Root.getEnd()); + for (const BasicBlock *BB : DominatedBlocks) + LeaderTable.insert(Num, NotVal, BB); continue; } @@ -2789,11 +2710,11 @@ bool GVNPass::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE); return Changed; } @@ -2814,7 +2735,7 @@ bool GVNPass::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E, true); + Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E); } } return Changed; @@ -2942,8 +2863,6 @@ bool GVNPass::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; - // Clearing map before every BB because it can be used only for single BB. 
- ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; // Since we may not have visited the input blocks of the phis, we can't @@ -2955,11 +2874,8 @@ bool GVNPass::processBlock(BasicBlock *BB) { for (PHINode *PN : PHINodesToRemove) { removeInstruction(PN); } - for (Instruction &Inst : make_early_inc_range(*BB)) { - if (!ReplaceOperandsWithMap.empty()) - ChangedFunction |= replaceOperandsForInBlockEquality(&Inst); + for (Instruction &Inst : make_early_inc_range(*BB)) ChangedFunction |= processInstruction(&Inst); - } return ChangedFunction; } diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 3c1a8ba..80aa98d 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -434,10 +434,6 @@ private: int StoreCount = 0; }; -} // end anonymous namespace - -namespace llvm { - struct ExactEqualsExpression { const Expression &E; @@ -449,8 +445,9 @@ struct ExactEqualsExpression { return E.exactlyEquals(Other); } }; +} // end anonymous namespace -template <> struct DenseMapInfo<const Expression *> { +template <> struct llvm::DenseMapInfo<const Expression *> { static const Expression *getEmptyKey() { auto Val = static_cast<uintptr_t>(-1); Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; @@ -493,8 +490,6 @@ template <> struct DenseMapInfo<const Expression *> { } }; -} // end namespace llvm - namespace { class NewGVN { diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 2190dcd..a87822c 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -84,10 +84,6 @@ public: bool run(); }; -} // anonymous namespace - -namespace llvm { - struct FrozenIndPHIInfo { // A freeze instruction that uses an induction phi FreezeInst *FI = nullptr; @@ -103,7 +99,9 @@ struct FrozenIndPHIInfo { bool operator==(const FrozenIndPHIInfo &Other) { return FI == Other.FI; } }; -template <> struct DenseMapInfo<FrozenIndPHIInfo> { +} // namespace + +template <> struct llvm::DenseMapInfo<FrozenIndPHIInfo> { static inline FrozenIndPHIInfo getEmptyKey() { return FrozenIndPHIInfo(DenseMapInfo<PHINode *>::getEmptyKey(), DenseMapInfo<BinaryOperator *>::getEmptyKey()); @@ -124,8 +122,6 @@ template <> struct DenseMapInfo<FrozenIndPHIInfo> { }; }; -} // end namespace llvm - // Given U = (value, user), replace value with freeze(value), and let // SCEV forget user. The inserted freeze is placed in the preheader. 
void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) { diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index b6ca52e..46f2903 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3246,6 +3246,13 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, return ::replaceDominatedUsesWith(From, To, Dominates); } +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const Instruction *I) { + auto Dominates = [&](const Use &U) { return DT.dominates(I, U); }; + return ::replaceDominatedUsesWith(From, To, Dominates); +} + unsigned llvm::replaceDominatedUsesWithIf( Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root, function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { @@ -3264,6 +3271,15 @@ unsigned llvm::replaceDominatedUsesWithIf( return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); } +unsigned llvm::replaceDominatedUsesWithIf( + Value *From, Value *To, DominatorTree &DT, const Instruction *I, + function_ref<bool(const Use &U, const Value *To)> ShouldReplace) { + auto DominatesAndShouldReplace = [&](const Use &U) { + return DT.dominates(I, U) && ShouldReplace(U, To); + }; + return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace); +} + bool llvm::callsGCLeafFunction(const CallBase *Call, const TargetLibraryInfo &TLI) { // Check if the function is specifically marked as a gc leaf function. diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp index ff2ab3c..cecb662 100644 --- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -27,15 +27,15 @@ using namespace llvm; STATISTIC(NumInvokes, "Number of invokes replaced"); namespace { - class LowerInvokeLegacyPass : public FunctionPass { - public: - static char ID; // Pass identification, replacement for typeid - explicit LowerInvokeLegacyPass() : FunctionPass(ID) { - initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - }; -} +class LowerInvokeLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + explicit LowerInvokeLegacyPass() : FunctionPass(ID) { + initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; +}; +} // namespace char LowerInvokeLegacyPass::ID = 0; INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke", @@ -78,11 +78,12 @@ bool LowerInvokeLegacyPass::runOnFunction(Function &F) { return runImpl(F); } -namespace llvm { -char &LowerInvokePassID = LowerInvokeLegacyPass::ID; +char &llvm::LowerInvokePassID = LowerInvokeLegacyPass::ID; // Public Interface To the LowerInvoke pass. 
-FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); } +FunctionPass *llvm::createLowerInvokePass() { + return new LowerInvokeLegacyPass(); +} PreservedAnalyses LowerInvokePass::run(Function &F, FunctionAnalysisManager &AM) { @@ -92,4 +93,3 @@ PreservedAnalyses LowerInvokePass::run(Function &F, return PreservedAnalyses::none(); } -} diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp index ca7e09d..1585e9e 100644 --- a/llvm/lib/Transforms/Utils/MisExpect.cpp +++ b/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -48,8 +48,6 @@ using namespace llvm; using namespace misexpect; -namespace llvm { - // Command line option to enable/disable the warning when profile data suggests // a mismatch with the use of the llvm.expect intrinsic static cl::opt<bool> PGOWarnMisExpect( @@ -63,22 +61,18 @@ static cl::opt<uint32_t> MisExpectTolerance( cl::desc("Prevents emitting diagnostics when profile counts are " "within N% of the threshold..")); -} // namespace llvm - -namespace { - -bool isMisExpectDiagEnabled(LLVMContext &Ctx) { +static bool isMisExpectDiagEnabled(const LLVMContext &Ctx) { return PGOWarnMisExpect || Ctx.getMisExpectWarningRequested(); } -uint32_t getMisExpectTolerance(LLVMContext &Ctx) { +static uint32_t getMisExpectTolerance(const LLVMContext &Ctx) { return std::max(static_cast<uint32_t>(MisExpectTolerance), Ctx.getDiagnosticsMisExpectTolerance()); } -Instruction *getInstCondition(Instruction *I) { +static const Instruction *getInstCondition(const Instruction *I) { assert(I != nullptr && "MisExpect target Instruction cannot be nullptr"); - Instruction *Ret = nullptr; + const Instruction *Ret = nullptr; if (auto *B = dyn_cast<BranchInst>(I)) { Ret = dyn_cast<Instruction>(B->getCondition()); } @@ -97,8 +91,8 @@ Instruction *getInstCondition(Instruction *I) { return Ret ? Ret : I; } -void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, - uint64_t ProfCount, uint64_t TotalCount) { +static void emitMisexpectDiagnostic(const Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { double PercentageCorrect = (double)ProfCount / TotalCount; auto PerString = formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); @@ -106,20 +100,16 @@ void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, "Potential performance regression from use of the llvm.expect intrinsic: " "Annotation was correct on {0} of profiled executions.", PerString); - Instruction *Cond = getInstCondition(I); + const Instruction *Cond = getInstCondition(I); if (isMisExpectDiagEnabled(Ctx)) Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Twine(PerString))); OptimizationRemarkEmitter ORE(I->getParent()->getParent()); ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); } -} // namespace - -namespace llvm { -namespace misexpect { - -void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights, - ArrayRef<uint32_t> ExpectedWeights) { +void misexpect::verifyMisExpect(const Instruction &I, + ArrayRef<uint32_t> RealWeights, + ArrayRef<uint32_t> ExpectedWeights) { // To determine if we emit a diagnostic, we need to compare the branch weights // from the profile to those added by the llvm.expect intrinsic. 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..d831c27 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
       unsigned PreviousEdges = OtherCases->size();
       if (OtherDest == SI->getDefaultDest())
         ++PreviousEdges;
-      for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+      unsigned E = PreviousEdges - 1;
+      // Remove all incoming values from OtherDest if OtherDest is unreachable.
+      if (NewBI->isUnconditional())
+        ++E;
+      for (unsigned I = 0; I != E; ++I)
         cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
     }
 
@@ -7736,8 +7740,7 @@ struct SwitchSuccWrapper {
   DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> *PhiPredIVs;
 };
 
-namespace llvm {
-template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
+template <> struct llvm::DenseMapInfo<const SwitchSuccWrapper *> {
   static const SwitchSuccWrapper *getEmptyKey() {
     return static_cast<SwitchSuccWrapper *>(
         DenseMapInfo<void *>::getEmptyKey());
@@ -7805,7 +7808,6 @@ template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
     return true;
   }
 };
-} // namespace llvm
 
 bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
                                                  DomTreeUpdater *DTU) {
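The turnSwitchRangeIntoICmp hunk above changes how many PHI incoming values are dropped when the switch is folded to a branch. A standalone sketch of that bookkeeping follows; the helper is hypothetical and only mirrors the patched arithmetic rather than calling it.

    // Illustrative only: a switch contributes one PHI edge to OtherDest per
    // case, plus one if OtherDest is also the default. A conditional
    // replacement branch keeps one edge, so PreviousEdges - 1 values are
    // removed; an unconditional branch (OtherDest unreachable) keeps none,
    // so all PreviousEdges values are removed.
    static unsigned numIncomingValuesToRemove(unsigned NumOtherCases,
                                              bool OtherIsDefault,
                                              bool NewBranchIsUnconditional) {
      unsigned PreviousEdges = NumOtherCases + (OtherIsDefault ? 1 : 0);
      unsigned E = PreviousEdges - 1;
      if (NewBranchIsUnconditional)
        ++E;
      return E;
    }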
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f16b03..e62d57e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       Instruction *I = Worklist.pop_back_val();
       for (auto &Op : I->operands())
         if (auto *InstOp = dyn_cast<Instruction>(Op))
-          if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+          if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
              AddrDefs.insert(InstOp).second)
            Worklist.push_back(InstOp);
     }
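The LoopVectorize.cpp hunk widens the operand walk from "defined in the same basic block as its user" to "defined anywhere in the loop". A sketch of the widened predicate in isolation; the helper name is an assumption and TheLoop is presumed available as in the surrounding code.

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Illustrative only: an operand now feeds the address-computation worklist
    // if it is any non-PHI instruction inside the loop, so chains that span
    // several loop blocks are also visited.
    static bool feedsAddressComputation(const Loop *TheLoop,
                                        const Instruction *InstOp) {
      return TheLoop->contains(InstOp) && !isa<PHINode>(InstOp);
    }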