Diffstat (limited to 'llvm/lib')
25 files changed, 472 insertions, 344 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 31546e6..45c889c 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -329,6 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, // Look through ptr->int and ptr->ptr casts. if (CE->getOpcode() == Instruction::PtrToInt || + CE->getOpcode() == Instruction::PtrToAddr || CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL, DSOEquiv); diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 1794a60..85b5372 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const { // Embedder and its subclasses //===----------------------------------------------------------------------===// -Embedder::Embedder(const Function &F, const Vocabulary &Vocab) - : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()), - OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight), - FuncVector(Embedding(Dimension)) {} - std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab) { switch (Mode) { @@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, return nullptr; } -const InstEmbeddingsMap &Embedder::getInstVecMap() const { - if (InstVecMap.empty()) - computeEmbeddings(); - return InstVecMap; -} - -const BBEmbeddingsMap &Embedder::getBBVecMap() const { - if (BBVecMap.empty()) - computeEmbeddings(); - return BBVecMap; -} - -const Embedding &Embedder::getBBVector(const BasicBlock &BB) const { - auto It = BBVecMap.find(&BB); - if (It != BBVecMap.end()) - return It->second; - computeEmbeddings(BB); - return BBVecMap[&BB]; -} +Embedding Embedder::computeEmbeddings() const { + Embedding FuncVector(Dimension, 0.0); -const Embedding &Embedder::getFunctionVector() const { - // Currently, we always (re)compute the embeddings for the function. - // This is cheaper than caching the vector. 
- computeEmbeddings(); - return FuncVector; -} - -void Embedder::computeEmbeddings() const { if (F.isDeclaration()) - return; - - FuncVector = Embedding(Dimension, 0.0); + return FuncVector; // Consider only the basic blocks that are reachable from entry - for (const BasicBlock *BB : depth_first(&F)) { - computeEmbeddings(*BB); - FuncVector += BBVecMap[BB]; - } + for (const BasicBlock *BB : depth_first(&F)) + FuncVector += computeEmbeddings(*BB); + return FuncVector; } -void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { +Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const { Embedding BBVector(Dimension, 0); // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) - ArgEmb += Vocab[*Op]; - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; - } - BBVecMap[&BB] = BBVector; -} - -void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { - Embedding BBVector(Dimension, 0); + for (const auto &I : BB.instructionsWithoutDebug()) + BBVector += computeEmbeddings(I); + return BBVector; +} + +Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const { + // Currently, we always (re)compute the embeddings for symbolic embedder. + // This is cheaper than caching the vectors. + Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) + ArgEmb += Vocab[*Op]; + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + return InstVector; +} + +Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const { + // If we have already computed the embedding for this instruction, return it + auto It = InstVecMap.find(&I); + if (It != InstVecMap.end()) + return It->second; - // We consider only the non-debug and non-pseudo instructions - for (const auto &I : BB.instructionsWithoutDebug()) { - // TODO: Handle call instructions differently. - // For now, we treat them like other instructions - Embedding ArgEmb(Dimension, 0); - for (const auto &Op : I.operands()) { - // If the operand is defined elsewhere, we use its embedding - if (const auto *DefInst = dyn_cast<Instruction>(Op)) { - auto DefIt = InstVecMap.find(DefInst); - // Fixme (#159171): Ideally we should never miss an instruction - // embedding here. - // But when we have cyclic dependencies (e.g., phi - // nodes), we might miss the embedding. In such cases, we fall back to - // using the vocabulary embedding. This can be fixed by iterating to a - // fixed-point, or by using a simple solver for the set of simultaneous - // equations. - // Another case when we might miss an instruction embedding is when - // the operand instruction is in a different basic block that has not - // been processed yet. This can be fixed by processing the basic blocks - // in a topological order. - if (DefIt != InstVecMap.end()) - ArgEmb += DefIt->second; - else - ArgEmb += Vocab[*Op]; - } - // If the operand is not defined by an instruction, we use the vocabulary - else { - LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " - << *Op << "=" << Vocab[*Op][0] << "\n"); + // TODO: Handle call instructions differently. 
+ // For now, we treat them like other instructions + Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) { + // If the operand is defined elsewhere, we use its embedding + if (const auto *DefInst = dyn_cast<Instruction>(Op)) { + auto DefIt = InstVecMap.find(DefInst); + // Fixme (#159171): Ideally we should never miss an instruction + // embedding here. + // But when we have cyclic dependencies (e.g., phi + // nodes), we might miss the embedding. In such cases, we fall back to + // using the vocabulary embedding. This can be fixed by iterating to a + // fixed-point, or by using a simple solver for the set of simultaneous + // equations. + // Another case when we might miss an instruction embedding is when + // the operand instruction is in a different basic block that has not + // been processed yet. This can be fixed by processing the basic blocks + // in a topological order. + if (DefIt != InstVecMap.end()) + ArgEmb += DefIt->second; + else ArgEmb += Vocab[*Op]; - } } - // Create the instruction vector by combining opcode, type, and arguments - // embeddings - auto InstVector = - Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; - // Add compare predicate embedding as an additional operand if applicable - if (const auto *IC = dyn_cast<CmpInst>(&I)) - InstVector += Vocab[IC->getPredicate()]; - InstVecMap[&I] = InstVector; - BBVector += InstVector; + // If the operand is not defined by an instruction, we use the + // vocabulary + else { + LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " + << *Op << "=" << Vocab[*Op][0] << "\n"); + ArgEmb += Vocab[*Op]; + } } - BBVecMap[&BB] = BBVector; + // Create the instruction vector by combining opcode, type, and arguments + // embeddings + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; + InstVecMap[&I] = InstVector; + return InstVector; } // ==----------------------------------------------------------------------===// @@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, Emb->getFunctionVector().print(OS); OS << "Basic block vectors:\n"; - const auto &BBMap = Emb->getBBVecMap(); for (const BasicBlock &BB : F) { - auto It = BBMap.find(&BB); - if (It != BBMap.end()) { - OS << "Basic block: " << BB.getName() << ":\n"; - It->second.print(OS); - } + OS << "Basic block: " << BB.getName() << ":\n"; + Emb->getBBVector(BB).print(OS); } OS << "Instruction vectors:\n"; - const auto &InstMap = Emb->getInstVecMap(); for (const BasicBlock &BB : F) { for (const Instruction &I : BB) { - auto It = InstMap.find(&I); - if (It != InstMap.end()) { - OS << "Instruction: "; - I.print(OS); - It->second.print(OS); - } + OS << "Instruction: "; + I.print(OS); + Emb->getInstVector(I).print(OS); } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c5c3866..5ffdc4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19340,8 +19340,10 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { EVT VT = N->getValueType(0); const SDNodeFlags Flags = N->getFlags(); unsigned Opc = N->getOpcode(); - bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; - bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; + bool PropAllNaNsToQNaNs = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; + bool PropOnlySNaNsToQNaNs = Opc == ISD::FMINNUM || Opc == ISD::FMAXNUM; + 
bool IsMin = + Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM || Opc == ISD::FMINIMUMNUM; SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold. @@ -19356,34 +19358,53 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) { const APFloat &AF = N1CFP->getValueAPF(); - // minnum(X, nan) -> X - // maxnum(X, nan) -> X - // minimum(X, nan) -> nan - // maximum(X, nan) -> nan - if (AF.isNaN()) - return PropagatesNaN ? N->getOperand(1) : N->getOperand(0); + // minnum(X, qnan) -> X + // maxnum(X, qnan) -> X + // minnum(X, snan) -> qnan + // maxnum(X, snan) -> qnan + // minimum(X, nan) -> qnan + // maximum(X, nan) -> qnan + // minimumnum(X, nan) -> X + // maximumnum(X, nan) -> X + if (AF.isNaN()) { + if (PropAllNaNsToQNaNs || (AF.isSignaling() && PropOnlySNaNsToQNaNs)) { + if (AF.isSignaling()) + return DAG.getConstantFP(AF.makeQuiet(), SDLoc(N), VT); + return N->getOperand(1); + } + return N->getOperand(0); + } // In the following folds, inf can be replaced with the largest finite // float, if the ninf flag is set. if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { - // minnum(X, -inf) -> -inf - // maxnum(X, +inf) -> +inf + // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation) + // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation) // minimum(X, -inf) -> -inf if nnan // maximum(X, +inf) -> +inf if nnan - if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) + // minimumnum(X, -inf) -> -inf + // maximumnum(X, +inf) -> +inf + if (IsMin == AF.isNegative() && + (!PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(1); // minnum(X, +inf) -> X if nnan // maxnum(X, -inf) -> X if nnan - // minimum(X, +inf) -> X - // maximum(X, -inf) -> X - if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) + // minimum(X, +inf) -> X (ignoring quieting of sNaNs) + // maximum(X, -inf) -> X (ignoring quieting of sNaNs) + // minimumnum(X, +inf) -> X if nnan + // maximumnum(X, -inf) -> X if nnan + if (IsMin != AF.isNegative() && (PropAllNaNsToQNaNs || Flags.hasNoNaNs())) return N->getOperand(0); } } + // There are no VECREDUCE variants of FMINIMUMNUM or FMAXIMUMNUM + if (Opc == ISD::FMINIMUMNUM || Opc == ISD::FMAXIMUMNUM) + return SDValue(); + if (SDValue SD = reassociateReduction( - PropagatesNaN + PropAllNaNsToQNaNs ? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM) : (IsMin ? 
ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX), Opc, SDLoc(N), VT, N0, N1, Flags)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 175753f..6c11c5b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -234,6 +234,19 @@ static bool dontUseFastISelFor(const Function &Fn) { }); } +static bool maintainPGOProfile(const TargetMachine &TM, + CodeGenOptLevel OptLevel) { + if (OptLevel != CodeGenOptLevel::None) + return true; + if (TM.getPGOOption()) { + const PGOOptions &Options = *TM.getPGOOption(); + return Options.Action == PGOOptions::PGOAction::IRUse || + Options.Action == PGOOptions::PGOAction::SampleUse || + Options.CSAction == PGOOptions::CSPGOAction::CSIRUse; + } + return false; +} + namespace llvm { //===--------------------------------------------------------------------===// @@ -395,6 +408,7 @@ SelectionDAGISel::~SelectionDAGISel() { delete CurDAG; } void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { CodeGenOptLevel OptLevel = Selector->OptLevel; + bool RegisterPGOPasses = maintainPGOProfile(Selector->TM, Selector->OptLevel); if (OptLevel != CodeGenOptLevel::None) AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); @@ -403,15 +417,15 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); - if (UseMBPI && OptLevel != CodeGenOptLevel::None) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); + if (UseMBPI && RegisterPGOPasses) + AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.addRequired<ProfileSummaryInfoWrapperPass>(); // AssignmentTrackingAnalysis only runs if assignment tracking is enabled for // the module. AU.addRequired<AssignmentTrackingAnalysis>(); AU.addPreserved<AssignmentTrackingAnalysis>(); - if (OptLevel != CodeGenOptLevel::None) - LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); + if (RegisterPGOPasses) + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -464,6 +478,7 @@ void SelectionDAGISel::initializeAnalysisResults( (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -474,7 +489,7 @@ void SelectionDAGISel::initializeAnalysisResults( auto *PSI = MAMP.getCachedResult<ProfileSummaryAnalysis>(*Fn.getParent()); BlockFrequencyInfo *BFI = nullptr; FAM.getResult<BlockFrequencyAnalysis>(Fn); - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &FAM.getResult<BlockFrequencyAnalysis>(Fn); FunctionVarLocs const *FnVarLocs = nullptr; @@ -492,7 +507,7 @@ void SelectionDAGISel::initializeAnalysisResults( // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. 
- if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &FAM.getResult<BranchProbabilityAnalysis>(Fn); else FuncInfo->BPI = nullptr; @@ -518,6 +533,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { (void)MatchFilterFuncName; #endif + bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel); TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); @@ -528,7 +544,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { AC = &MFP.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn); auto *PSI = &MFP.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); BlockFrequencyInfo *BFI = nullptr; - if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None) + if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses) BFI = &MFP.getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); FunctionVarLocs const *FnVarLocs = nullptr; @@ -549,7 +565,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) { // into account). That's unfortunate but OK because it just means we won't // ask for passes that have been required anyway. - if (UseMBPI && OptLevel != CodeGenOptLevel::None) + if (UseMBPI && RegisterPGOPasses) FuncInfo->BPI = &MFP.getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5980ee3..286ed03 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3623,7 +3623,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // 1. Build a list of reduction variables. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; auto Size = ReductionInfos.size(); - Type *PtrTy = PointerType::getUnqual(Ctx); + Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS()); + Type *FuncPtrTy = + Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace()); Type *RedArrayTy = ArrayType::get(PtrTy, Size); CodeGenIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); @@ -3667,9 +3669,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt64(MaxDataSize * ReductionInfos.size()); if (!IsTeamsReduction) { Value *SarFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy); Value *WcFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy); Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( @@ -10072,13 +10074,14 @@ void OpenMPIRBuilder::initializeTypes(Module &M) { LLVMContext &Ctx = M.getContext(); StructType *T; unsigned DefaultTargetAS = Config.getDefaultTargetAS(); + unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace(); #define OMP_TYPE(VarName, InitValue) VarName = InitValue; #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \ VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS); #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \ - VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS); + VarName##Ptr = PointerType::get(Ctx, ProgramAS); #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) 
\ T = StructType::getTypeByName(Ctx, StructName); \ if (!T) \ diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 4f37624..8e6d654 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -273,6 +273,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, C.print(OS); } +DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, + BranchProbability P) + : Key(std::string(Key)) { + raw_string_ostream OS(Val); + P.print(OS); +} + DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc) : Key(std::string(Key)), Loc(Loc) { if (Loc) { diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 47860c0..708e79d 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -20,7 +20,7 @@ using namespace llvm::mustache; namespace { -using Accessor = SmallVector<std::string>; +using Accessor = ArrayRef<StringRef>; static bool isFalsey(const json::Value &V) { return V.getAsNull() || (V.getAsBoolean() && !V.getAsBoolean().value()) || @@ -34,23 +34,32 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } -static Accessor splitMustacheString(StringRef Str) { +static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: // "a.b.c" would be split into {"a", "b", "c"} // We make an exception for a single dot which // refers to the current context. - Accessor Tokens; + SmallVector<StringRef> Tokens; if (Str == ".") { - Tokens.emplace_back(Str); - return Tokens; - } - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split("."); - Tokens.emplace_back(Part.trim()); + // "." is a special accessor that refers to the current context. + // It's a literal, so it doesn't need to be saved. + Tokens.push_back("."); + } else { + while (!Str.empty()) { + StringRef Part; + std::tie(Part, Str) = Str.split('.'); + // Each part of the accessor needs to be saved to the arena + // to ensure it has a stable address. + Tokens.push_back(Ctx.Saver.save(Part.trim())); + } } - return Tokens; + // Now, allocate memory for the array of StringRefs in the arena. + StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); + // Copy the StringRefs from the stack vector to the arena. + std::copy(Tokens.begin(), Tokens.end(), ArenaTokens); + // Return an ArrayRef pointing to the stable arena memory. 
+ return ArrayRef<StringRef>(ArenaTokens, Tokens.size()); } } // namespace @@ -97,23 +106,23 @@ public: SetDelimiter, }; - Token(std::string Str) - : TokenType(Type::Text), RawBody(std::move(Str)), TokenBody(RawBody), + Token(StringRef Str) + : TokenType(Type::Text), RawBody(Str), TokenBody(RawBody), AccessorValue({}), Indentation(0) {}; - Token(std::string RawBody, std::string TokenBody, char Identifier) - : RawBody(std::move(RawBody)), TokenBody(std::move(TokenBody)), - Indentation(0) { + Token(StringRef RawBody, StringRef TokenBody, char Identifier, + MustacheContext &Ctx) + : RawBody(RawBody), TokenBody(TokenBody), Indentation(0) { TokenType = getTokenType(Identifier); if (TokenType == Type::Comment) return; StringRef AccessorStr(this->TokenBody); if (TokenType != Type::Variable) AccessorStr = AccessorStr.substr(1); - AccessorValue = splitMustacheString(StringRef(AccessorStr).trim()); + AccessorValue = splitMustacheString(StringRef(AccessorStr).trim(), Ctx); } - Accessor getAccessor() const { return AccessorValue; } + ArrayRef<StringRef> getAccessor() const { return AccessorValue; } Type getType() const { return TokenType; } @@ -144,16 +153,16 @@ public: Type TokenType; // RawBody is the original string that was tokenized. - std::string RawBody; + StringRef RawBody; // TokenBody is the original string with the identifier removed. - std::string TokenBody; - Accessor AccessorValue; + StringRef TokenBody; + ArrayRef<StringRef> AccessorValue; size_t Indentation; }; using EscapeMap = DenseMap<char, std::string>; -class ASTNode { +class ASTNode : public ilist_node<ASTNode> { public: enum Type { Root, @@ -168,18 +177,19 @@ public: ASTNode(MustacheContext &Ctx) : Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {} - ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent) - : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), + ASTNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) + : Ctx(Ctx), Ty(Type::Text), Body(Body), Parent(Parent), ParentContext(nullptr) {} // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes - ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent) - : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)), + ASTNode(MustacheContext &Ctx, Type Ty, ArrayRef<StringRef> Accessor, + ASTNode *Parent) + : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(Accessor), ParentContext(nullptr) {} - void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); }; + void addChild(AstPtr Child) { Children.push_back(Child); }; - void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); }; + void setRawBody(StringRef NewBody) { RawBody = NewBody; }; void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }; @@ -212,28 +222,27 @@ private: MustacheContext &Ctx; Type Ty; size_t Indentation = 0; - std::string RawBody; - std::string Body; + StringRef RawBody; + StringRef Body; ASTNode *Parent; - // TODO: switch implementation to SmallVector<T> - std::vector<AstPtr> Children; - const Accessor AccessorValue; + ASTNodeList Children; + const ArrayRef<StringRef> AccessorValue; const llvm::json::Value *ParentContext; }; // A wrapper for arena allocator for ASTNodes static AstPtr createRootNode(MustacheContext &Ctx) { - return std::make_unique<ASTNode>(Ctx); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx); } -static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A, - ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, T, 
std::move(A), Parent); +static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, + ArrayRef<StringRef> A, ASTNode *Parent) { + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, T, A, Parent); } -static AstPtr createTextNode(MustacheContext &Ctx, std::string Body, +static AstPtr createTextNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent) { - return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent); + return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, Body, Parent); } // Function to check if there is meaningful text behind. @@ -295,9 +304,9 @@ static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) { StringRef NextTokenBody = NextToken.TokenBody; // Cut off the leading newline which could be \n or \r\n. if (NextTokenBody.starts_with("\r\n")) - NextToken.TokenBody = NextTokenBody.substr(2).str(); + NextToken.TokenBody = NextTokenBody.substr(2); else if (NextTokenBody.starts_with("\n")) - NextToken.TokenBody = NextTokenBody.substr(1).str(); + NextToken.TokenBody = NextTokenBody.substr(1); } // Adjust previous token body if there no text behind. @@ -312,7 +321,7 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx, StringRef PrevTokenBody = PrevToken.TokenBody; StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v"); size_t Indentation = PrevTokenBody.size() - Unindented.size(); - PrevToken.TokenBody = Unindented.str(); + PrevToken.TokenBody = Unindented; CurrentToken.setIndentation(Indentation); } @@ -402,21 +411,20 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, } static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { +processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content << ", Kind: " << tagKindToString(T.TagKind) << "\n"); if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); + Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); return std::nullopt; } StringRef Interpolated = T.Content; - std::string RawBody = T.FullMatch.str(); if (!Interpolated.trim().starts_with("=")) { char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(RawBody, Interpolated.str(), Front); + Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); return std::nullopt; } - Tokens.emplace_back(RawBody, Interpolated.str(), '='); + Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); StringRef DelimSpec = Interpolated.trim(); DelimSpec = DelimSpec.drop_front(1); DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); @@ -432,7 +440,7 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { // The mustache spec allows {{{ }}} to unescape variables, // but we don't support that here. An unescape variable // is represented only by {{& variable}}. -static SmallVector<Token> tokenize(StringRef Template) { +static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); @@ -446,19 +454,17 @@ static SmallVector<Token> tokenize(StringRef Template) { if (T.TagKind == Tag::Kind::None) { // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start).str()); - LLVM_DEBUG(dbgs() << " No more tags. 
Created final Text token: \"" - << Template.substr(Start) << "\"\n"); + Tokens.emplace_back(Template.substr(Start)); break; } // Add the text before the tag. if (T.StartPosition > Start) { StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text.str()); + Tokens.emplace_back(Text); } - if (auto NewDelims = processTag(T, Tokens)) { + if (auto NewDelims = processTag(T, Tokens, Ctx)) { std::tie(Open, Close) = *NewDelims; } @@ -614,20 +620,20 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A) { AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent); size_t Start = CurrentPtr; - parseMustache(CurrentNode.get()); + parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; - std::string RawBody; + SmallString<128> RawBody; for (std::size_t I = Start; I < End; I++) RawBody += Tokens[I].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); + Parent->addChild(CurrentNode); } AstPtr Parser::parse() { - Tokens = tokenize(TemplateStr); + Tokens = tokenize(TemplateStr, Ctx); CurrentPtr = 0; AstPtr RootNode = createRootNode(Ctx); - parseMustache(RootNode.get()); + parseMustache(RootNode); return RootNode; } @@ -636,31 +642,29 @@ void Parser::parseMustache(ASTNode *Parent) { while (CurrentPtr < Tokens.size()) { Token CurrentToken = Tokens[CurrentPtr]; CurrentPtr++; - Accessor A = CurrentToken.getAccessor(); + ArrayRef<StringRef> A = CurrentToken.getAccessor(); AstPtr CurrentNode; switch (CurrentToken.getType()) { case Token::Type::Text: { - CurrentNode = - createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createTextNode(Ctx, CurrentToken.TokenBody, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Variable: { - CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::Variable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::UnescapeVariable: { - CurrentNode = - createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent); - Parent->addChild(std::move(CurrentNode)); + CurrentNode = createNode(Ctx, ASTNode::UnescapeVariable, A, Parent); + Parent->addChild(CurrentNode); break; } case Token::Type::Partial: { - CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent); + CurrentNode = createNode(Ctx, ASTNode::Partial, A, Parent); CurrentNode->setIndentation(CurrentToken.getIndentation()); - Parent->addChild(std::move(CurrentNode)); + Parent->addChild(CurrentNode); break; } case Token::Type::SectionOpen: { @@ -694,8 +698,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { return; } case json::Value::String: { - auto Str = *Data.getAsString(); - OS << Str.str(); + OS << *Data.getAsString(); return; } @@ -727,7 +730,7 @@ void ASTNode::renderPartial(const json::Value &CurrentCtx, << ", Indentation:" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) - renderPartial(CurrentCtx, OS, Partial->getValue().get()); + renderPartial(CurrentCtx, OS, Partial->getValue()); } void ASTNode::renderVariable(const json::Value &CurrentCtx, @@ -858,8 +861,8 @@ const json::Value *ASTNode::findContext() { void ASTNode::renderChild(const json::Value &Contexts, MustacheOutputStream &OS) { - for (AstPtr &Child : Children) - Child->render(Contexts, 
OS); + for (ASTNode &Child : Children) + Child.render(Contexts, OS); } void ASTNode::renderPartial(const json::Value &Contexts, @@ -869,7 +872,7 @@ void ASTNode::renderPartial(const json::Value &Contexts, Partial->render(Contexts, IS); } -void ASTNode::renderLambdas(const json::Value &Contexts, +void ASTNode::renderLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, Lambda &L) { json::Value LambdaResult = L(); std::string LambdaStr; @@ -886,9 +889,9 @@ void ASTNode::renderLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void ASTNode::renderSectionLambdas(const json::Value &Contexts, +void ASTNode::renderSectionLambdas(const llvm::json::Value &Contexts, MustacheOutputStream &OS, SectionLambda &L) { - json::Value Return = L(RawBody); + json::Value Return = L(RawBody.str()); if (isFalsey(Return)) return; std::string LambdaStr; @@ -899,15 +902,16 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts, LambdaNode->render(Contexts, OS); } -void Template::render(const json::Value &Data, llvm::raw_ostream &OS) { +void Template::render(const llvm::json::Value &Data, llvm::raw_ostream &OS) { RawMustacheOutputStream MOS(OS); Tree->render(Data, MOS); } void Template::registerPartial(std::string Name, std::string Partial) { - Parser P(Partial, Ctx); + StringRef SavedPartial = Ctx.Saver.save(Partial); + Parser P(SavedPartial, Ctx); AstPtr PartialTree = P.parse(); - Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree))); + Ctx.Partials.insert(std::make_pair(Name, PartialTree)); } void Template::registerLambda(std::string Name, Lambda L) { @@ -922,7 +926,7 @@ void Template::overrideEscapeCharacters(EscapeMap E) { Ctx.Escapes = std::move(E); } -Template::Template(StringRef TemplateStr) { +Template::Template(StringRef TemplateStr, MustacheContext &Ctx) : Ctx(Ctx) { Parser P(TemplateStr, Ctx); Tree = P.parse(); // The default behavior is to escape html entities. @@ -935,18 +939,12 @@ Template::Template(StringRef TemplateStr) { } Template::Template(Template &&Other) noexcept - : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {} + : Ctx(Other.Ctx), Tree(Other.Tree) { + Other.Tree = nullptr; +} Template::~Template() = default; -Template &Template::operator=(Template &&Other) noexcept { - if (this != &Other) { - Ctx = std::move(Other.Ctx); - Tree = std::move(Other.Tree); - Other.Tree = nullptr; - } - return *this; -} } // namespace llvm::mustache #undef DEBUG_TYPE diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dc8e7c8..31b3d18 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1458,6 +1458,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal); setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal); + setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom); setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); if (Subtarget->hasMatMulInt8()) { @@ -30769,6 +30770,17 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, ResultVT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true); + // We can handle this case natively by accumulating into a wider + // zero-padded vector. 
+ if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) { + SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32); + SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0); + SDValue Wide = + DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS); + SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide); + return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0); + } + if (ConvertToScalable) { ResultVT = getContainerForFixedLengthVector(DAG, ResultVT); OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType()); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bc047a4a..a1fb665 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); - // Only logical ops can be done on v4i8 directly, others must be done + // Only logical ops can be done on v4i8/v2i32 directly, others must be done // elementwise. setOperationAction( {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, @@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, ISD::USUBSAT}, - MVT::v4i8, Expand); + {MVT::v4i8, MVT::v2i32}, Expand); // Operations not directly supported by NVPTX. for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, @@ -689,7 +689,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 40c05e8..333b693 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1823,6 +1823,11 @@ def TuneConditionalCompressedMoveFusion def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">; +def TuneHasSingleElementVecFP64 + : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true", + "Certain vector FP64 operations produce a single result " + "element per cycle">; + def TuneMIPSP8700 : SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700", "MIPS p8700 processor">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 447f05c..f2724c41 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1636,7 +1636,7 @@ def : QCISELECTCCIPat<SETNE, QC_SELECTNEI>; } let Predicates = [HasVendorXqcilsm, IsRV32] in { -def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), +def : Pat<(qc_setwmi (i32 GPR:$rs3), GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; } // Predicates = [HasVendorXqcilsm, IsRV32] diff 
--git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index a29b7dd..57fbaa0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -634,56 +634,56 @@ def : PatGpr<bswap, REV8_RV64, i64>; let Predicates = [HasStdExtZbkb] in { def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; +def : Pat<(or (shl zexti8:$rs2, (XLenVT 8)), + zexti8:$rs1), + (PACKH zexti8:$rs1, zexti8:$rs2)>; def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), 0xFFFF), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), 0xFFFF), + (PACKH zexti8:$rs1, GPR:$rs2)>; def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), - (zexti8 (XLenVT GPR:$rs1))), - (PACKH GPR:$rs1, GPR:$rs2)>; + zexti8:$rs1), + (PACKH zexti8:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbkb] let Predicates = [HasStdExtZbkb, IsRV32] in { -def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (or zexti16:$rs1, (shl GPR:$rs2, (i32 16)))), + (PACK zexti16:$rs1, GPR:$rs2)>; -def : Pat<(or (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i32 (or (shl GPR:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value. We can use pack with packh for // bits [31:16]. If bits [15:0] can also be a packh, it can be matched // separately. 
-def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; +def : Pat<(i32 (or (or (shl GPR:$op1rs2, (XLenVT 24)), + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1)), + (PACK zexti16:$rs1, + (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; } let Predicates = [HasStdExtZbkb, IsRV64] in { -def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))), - (PACK GPR:$rs1, GPR:$rs2)>; +def : Pat<(i64 (or zexti32:$rs1, (shl GPR:$rs2, (i64 32)))), + (PACK zexti32:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; +def : Pat<(i64 (or (shl zexti8:$rs2, (XLenVT 24)), + (shl zexti8:$rs1, (XLenVT 16)))), + (SLLI (XLenVT (PACKH zexti8:$rs1, zexti8:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))), - (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>; + (shl zexti8:$rs1, (XLenVT 16))), + (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>; def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)), - (zexti16 (i64 GPR:$rs1))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1), + (PACKW zexti16:$rs1, GPR:$rs2)>; def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), - (zexti16 (i64 GPR:$rs1)))), - (PACKW GPR:$rs1, GPR:$rs2)>; + zexti16:$rs1)), + (PACKW zexti16:$rs1, GPR:$rs2)>; // Match a pattern of 2 bytes being inserted into bits [31:16], with bits // bits [15:0] coming from a zero extended value, and bits [63:32] being @@ -691,35 +691,35 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), // also be a packh, it can be matched separately. def : Pat<(binop_allwusers<or> (or (shl GPR:$op1rs2, (XLenVT 24)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (zexti16 (XLenVT GPR:$rs1))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + zexti16:$rs1), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // We need to manually reassociate the patterns because of the binop_allwusers. def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), + (or zexti16:$rs1, + (shl zexti8:$op1rs1, (XLenVT 16))), (shl GPR:$op1rs2, (XLenVT 24))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(binop_allwusers<or> - (or (zexti16 (XLenVT GPR:$rs1)), - (shl GPR:$op1rs1, (XLenVT 24))), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (or zexti16:$rs1, + (shl GPR:$op1rs2, (XLenVT 24))), + (shl zexti8:$op1rs1, (XLenVT 16))), + (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)), - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))), - (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))), - (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti8:$op1rs1, (XLenVT 16))), + (sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))), + (PACKW GPR:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>; // Match a pattern of 2 halfwords being inserted into bits [63:32], with bits // bits [31:0] coming from a zero extended value. 
We can use pack with packw for // bits [63:32]. If bits [63:31] can also be a packw, it can be matched // separately. def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)), - (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))), - (zexti32 (i64 GPR:$rs1))), - (PACK (XLenVT GPR:$rs1), - (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>; + (shl zexti16:$op1rs1, (i64 32))), + zexti32:$rs1), + (PACK zexti32:$rs1, + (XLenVT (PACKW zexti16:$op1rs1, GPR:$op1rs2)))>; } // Predicates = [HasStdExtZbkb, IsRV64] let Predicates = [HasStdExtZbb, IsRV32] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 6d86aff..3658817 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -14,6 +14,10 @@ // otherwise. def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>; +// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64 +// is enabled. +def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>; + // Returns true if this is the sext.w pattern, addiw rd, rs1, 0. def isSEXT_W : TIIPredicate<"isSEXT_W", diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 17a7948..e86431f 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390", FeatureStdExtZvl1024b, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone], - SiFiveIntelligenceTuneFeatures>; + !listconcat(SiFiveIntelligenceTuneFeatures, + [TuneHasSingleElementVecFP64])>; defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 3e07eff..f863392a 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN, ProcResourceKind VL, ProcResourceKind VS, ProcResourceKind VCQ, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled = false, bit hasFastGather = false> { // Branching @@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN, // 13. 
Vector Floating-Point Instructions foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, isF=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8); - defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; - let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - } - defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4); - let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>; - // min max require merge - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + if !eq(sew, 64) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF", + "WriteVFMulAddV", "WriteVFMulAddF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, 
VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } + let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; + // min max require merge + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN, // Widening foreach mx = SchedMxListW in { foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in - defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = SiFive7GetCyclesDefault<mx>.c; + defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { + let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; @@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN, defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>; } - defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesDefault<mx>.c); - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in - defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, 
sew, IsWorstCase>; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in + defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } defvar Cycles = SiFive7GetCyclesDefault<mx>.c; defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c; @@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN, } foreach mx = SchedMxListFW in { foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { - defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)), - SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, - SiFive7GetCyclesNarrowing<mx>.c); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; - let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c; + if !eq(sew, 32) then { + defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; + foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in + defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred, + // Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)], + // Not Predicated + [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)], + mx, sew, IsWorstCase>; + } else { + let Latency = 8, + AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in { + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>; + } } } } @@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance { /// eventually be supplied by different SchedMachineModels. 
multiclass SiFive7SchedResources<int vlen, bit extraVALU, SiFive7FPLatencies fpLatencies, - bit isFP64Throttled, bit hasFastGather> { defm SiFive7 : SiFive7ProcResources<extraVALU>; @@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU, : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB, SiFive7IDiv, SiFive7FDiv, SiFive7VA1, SiFive7VA1OrVA2, SiFive7VL, SiFive7VS, - SiFive7VCQ, fpLatencies, isFP64Throttled, - hasFastGather>; + SiFive7VCQ, fpLatencies, hasFastGather>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel { bit HasExtraVALU = false; SiFive7FPLatencies FPLatencies; - bit IsFP64Throttled = false; bit HasFastGather = false; string Name = !subst("Model", "", !subst("SiFive7", "", NAME)); @@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> { def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { let HasExtraVALU = true; let FPLatencies = SiFive7LowFPLatencies; - let IsFP64Throttled = true; let HasFastGather = true; } @@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU, model.FPLatencies, - model.IsFP64Throttled, model.HasFastGather>; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 01a4308..d11b446 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred, IsWorstCase>; } +multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred, + list<ProcResourceKind> predResources, + int predLat, list<int> predAcquireCycles, + list<int> predReleaseCycles, + list<ProcResourceKind> noPredResources, + int noPredLat, list<int> noPredAcquireCycles, + list<int> noPredReleaseCycles, + string mx, int sew, bit IsWorstCase> { + defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources, + predLat, predAcquireCycles, + predReleaseCycles, noPredResources, + noPredLat, noPredAcquireCycles, + noPredReleaseCycles, + IsWorstCase>; +} + // Define multiclasses to define SchedWrite, SchedRead, WriteRes, and // ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the // SchedMxList variants above. Each multiclass is responsible for defining diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index af79070..275165d 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -184,8 +184,8 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI, // Output the TLS marker if present. 
   if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
     const MCOperand &MO = MI->getOperand(OpNum + 1);
-    const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
-    switch (refExp.getSpecifier()) {
+    const MCSymbolRefExpr &RefExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+    switch (RefExp.getSpecifier()) {
     case SystemZ::S_TLSGD:
       O << ":tls_gdcall:";
       break;
@@ -195,7 +195,7 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
     default:
       llvm_unreachable("Unexpected symbol kind");
     }
-    O << refExp.getSymbol().getName();
+    O << RefExp.getSymbol().getName();
   }
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index fce6393..8c31579 100644
--- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -13,10 +13,9 @@
 
 using namespace llvm;
 
-SystemZConstantPoolValue::
-SystemZConstantPoolValue(const GlobalValue *gv,
-                         SystemZCP::SystemZCPModifier modifier)
-    : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+SystemZConstantPoolValue::SystemZConstantPoolValue(
+    const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier)
+    : MachineConstantPoolValue(GV->getType()), GV(GV), Modifier(Modifier) {}
 
 SystemZConstantPoolValue *
 SystemZConstantPoolValue::Create(const GlobalValue *GV,
diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 34d58e0..5313fba 100644
--- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -352,10 +352,9 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
   // Similarly, a group-ending SU may either fit well (last in group), or
   // end the group prematurely.
   if (SC->EndGroup) {
-    unsigned resultingGroupSize =
-      (CurrGroupSize + getNumDecoderSlots(SU));
-    if (resultingGroupSize < 3)
-      return (3 - resultingGroupSize);
+    unsigned ResultingGroupSize = (CurrGroupSize + getNumDecoderSlots(SU));
+    if (ResultingGroupSize < 3)
+      return (3 - ResultingGroupSize);
     return -1;
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 764ff998..4b3ddbd 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -592,10 +592,10 @@ def : Pat<(X86mcvttp2sis (v2f64 (X86VBroadcastld64 addr:$src)),
           (VCVTTPD2DQSZ128rmbkz VK2WM:$mask, addr:$src)>;
 
 // Patterns VCVTTPD2UDQSZ128
-def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
-          (VCVTTPD2UDQSZ128rmb addr:$src)>;
 def : Pat<(v4i32 (X86cvttp2uis (v2f64 VR128X:$src))),
           (VCVTTPD2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32 (X86cvttp2uis (loadv2f64 addr:$src))),
+          (VCVTTPD2UDQSZ128rm addr:$src)>;
 def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
           (VCVTTPD2UDQSZ128rmb addr:$src)>;
 def : Pat<(X86mcvttp2uis (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 9b9e2ba..9150b58 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -459,7 +459,7 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
     Value *Op0 = I->getOperand(0);
     Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
     Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
-    Res = Builder.CreateSelect(Op0, LHS, RHS);
+    Res = Builder.CreateSelect(Op0, LHS, RHS, "", I);
     break;
   }
   case Instruction::PHI: {
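The TruncInstCombine change above passes the original instruction `I` as the trailing `MDFrom` argument of `IRBuilder::CreateSelect`, so the reduced select inherits the branch metadata (`!prof`, `!unpredictable`) of the instruction it replaces. A minimal sketch of that call pattern follows; the helper and value names are invented for illustration, only the `CreateSelect` signature is the real IRBuilder API:

// Sketch only: demonstrates the MDFrom parameter of IRBuilder::CreateSelect.
// rebuildSelectPreservingProfile is a hypothetical helper, not LLVM API.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

Value *rebuildSelectPreservingProfile(IRBuilder<> &Builder, SelectInst *Old,
                                      Value *NewTrue, Value *NewFalse) {
  // Passing Old as MDFrom copies its !prof and !unpredictable metadata
  // onto the newly created select instruction.
  return Builder.CreateSelect(Old->getCondition(), NewTrue, NewFalse,
                              Old->getName(), /*MDFrom=*/Old);
}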
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index 9115946..f166fef 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -24,6 +24,9 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
@@ -33,6 +36,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "coro-annotation-elide"
 
+static cl::opt<float> CoroElideBranchRatio(
+    "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden,
+    cl::desc("Minimum BranchProbability to consider eliding a coroutine."));
+extern cl::opt<unsigned> MinBlockCounterExecution;
+
 static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
   for (Instruction &I : F->getEntryBlock())
     if (!isa<AllocaInst>(&I))
@@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
     bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
     bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
     if (IsCallerPresplitCoroutine && HasAttr) {
+      BranchProbability MinBranchProbability(
+          static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
+          MinBlockCounterExecution);
+
+      auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+      auto Prob = BranchProbability::getBranchProbability(
+          BFI.getBlockFreq(CB->getParent()).getFrequency(),
+          BFI.getEntryFreq().getFrequency());
+
+      if (Prob < MinBranchProbability) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(
+                     DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
+                 << "'" << ore::NV("callee", Callee->getName())
+                 << "' not elided in '"
+                 << ore::NV("caller", Caller->getName())
+                 << "' because of low probability: "
+                 << ore::NV("probability", Prob) << " (threshold: "
+                 << ore::NV("threshold", MinBranchProbability) << ")";
+        });
+        continue;
+      }
+
       auto *CallerN = CG.lookup(*Caller);
       auto *CallerC = CallerN ? CG.lookupSCC(*CallerN) : nullptr;
       // If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller
@@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
         return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
                << "'" << ore::NV("callee", Callee->getName())
                << "' elided in '" << ore::NV("caller", Caller->getName())
-               << "'";
+               << "' (probability: " << ore::NV("probability", Prob) << ")";
       });
 
       FAM.invalidate(*Caller, PreservedAnalyses::none());
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2583249..1a00d17 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio(
                        "outline candidate and original function"));
 // Used to tune the minimum number of execution counts needed in the predecessor
 // block to the cold edge. ie. confidence interval.
-static cl::opt<unsigned>
+cl::opt<unsigned>
     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                              cl::desc("Minimum block executions to consider "
                                       "its BranchProbabilityInfo valid"));
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..9ac3be1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
     unsigned PreviousEdges = OtherCases->size();
     if (OtherDest == SI->getDefaultDest())
       ++PreviousEdges;
-    for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+    unsigned E = PreviousEdges - 1;
+    // Remove all incoming values from OtherDest if OtherDest is unreachable.
+    if (NewBI->isUnconditional())
+      ++E;
+    for (unsigned I = 0; I != E; ++I)
       cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f16b03..e62d57e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
     Instruction *I = Worklist.pop_back_val();
     for (auto &Op : I->operands())
       if (auto *InstOp = dyn_cast<Instruction>(Op))
-        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+        if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
             AddrDefs.insert(InstOp).second)
           Worklist.push_back(InstOp);
   }
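The CoroAnnotationElide change above gates elision on how often the call site runs relative to the function entry: it builds a threshold of CoroElideBranchRatio (default 0.55) scaled by MinBlockCounterExecution (default 100) and skips the callee when the call site's block frequency falls below that fraction of the entry frequency. Below is a standalone sketch of the same ratio test using plain integers; worthEliding and the sample frequencies are invented for illustration, while the real pass uses BranchProbability and BlockFrequencyInfo:

#include <cstdint>
#include <iostream>

// Mirrors the threshold check: elide only when
// CallSiteFreq / EntryFreq >= BranchRatio.
static bool worthEliding(uint64_t CallSiteFreq, uint64_t EntryFreq,
                         float BranchRatio = 0.55f,
                         unsigned MinBlockCounterExecution = 100) {
  // Same scaling as the pass: 0.55 * 100 -> a threshold of 55 out of 100.
  uint64_t ThresholdNum =
      static_cast<uint64_t>(BranchRatio * MinBlockCounterExecution);
  // Integer cross-multiplication in place of the BranchProbability comparison.
  return CallSiteFreq * MinBlockCounterExecution >= ThresholdNum * EntryFreq;
}

int main() {
  std::cout << worthEliding(70, 100) << '\n'; // call site hot enough: prints 1
  std::cout << worthEliding(30, 100) << '\n'; // below the 0.55 default: prints 0
}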