Diffstat (limited to 'llvm/lib')
75 files changed, 1032 insertions, 531 deletions
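The bulk of the IR2Vec.cpp changes below replace the flat vocabulary vector with a section-based VocabStorage (Opcodes, CanonicalTypes, Operands, Predicates) and add compare-predicate embeddings. As a minimal, standalone sketch of the resulting index math, using made-up section sizes and toy predicate ranges rather than the real CmpInst values, the following mirrors how a predicate's local index (FCMP block first, then ICMP block) is combined with the per-section base offsets:

// Illustrative sketch only: the constants below are stand-ins, not the real
// LLVM opcode/type/predicate counts.
#include <cassert>
#include <cstdio>

namespace toy {
// Hypothetical section sizes, mirroring the four-section layout
// (Opcodes, CanonicalTypes, Operands, Predicates) used by VocabStorage.
constexpr unsigned MaxOpcodes = 68;
constexpr unsigned MaxCanonicalTypeIDs = 13;
constexpr unsigned MaxOperandKinds = 4;

// Toy ranges standing in for CmpInst::Predicate; FCMP predicates come first.
constexpr unsigned FirstFCmp = 0, LastFCmp = 15;  // 16 FCMP predicates
constexpr unsigned FirstICmp = 32, LastICmp = 41; // 10 ICMP predicates
constexpr unsigned MaxPredicateKinds =
    (LastFCmp - FirstFCmp + 1) + (LastICmp - FirstICmp + 1);

// Flattened base offsets for each section, as used by getStringKey().
constexpr unsigned OperandBaseOffset = MaxOpcodes + MaxCanonicalTypeIDs;
constexpr unsigned PredicateBaseOffset = OperandBaseOffset + MaxOperandKinds;

// FCMP predicates occupy the first local slots, ICMP predicates the rest.
unsigned getPredicateLocalIndex(unsigned P) {
  if (P <= LastFCmp) // FirstFCmp == 0 in this sketch
    return P - FirstFCmp;
  return P - FirstICmp + (LastFCmp - FirstFCmp + 1);
}

// Global (flattened) position of a predicate in the vocabulary.
unsigned getPredicateSlot(unsigned P) {
  return PredicateBaseOffset + getPredicateLocalIndex(P);
}
} // namespace toy

int main() {
  using namespace toy;
  // An ICMP predicate lands after all FCMP predicates within the
  // Predicates section, which itself follows the Operands section.
  unsigned Slot = getPredicateSlot(FirstICmp);
  assert(Slot == PredicateBaseOffset + 16);
  std::printf("ICMP base slot = %u (total entries = %u)\n", Slot,
              PredicateBaseOffset + MaxPredicateKinds);
  return 0;
}

In the actual patch, Vocabulary::getStringKey() performs the inverse mapping, walking a flattened position back to a human-readable key using the same OperandBaseOffset and PredicateBaseOffset boundaries.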
diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index a363bce..c4abec0 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -30,6 +30,9 @@ #define DEBUG_TYPE "ctx_prof" using namespace llvm; + +namespace llvm { + cl::opt<std::string> UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, cl::desc("Use the specified contextual profile file")); @@ -50,7 +53,6 @@ static cl::opt<bool> ForceIsInSpecializedModule( const char *AssignGUIDPass::GUIDMetadataName = "guid"; -namespace llvm { class ProfileAnnotatorImpl final { friend class ProfileAnnotator; class BBInfo; diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 99afc06..af30422 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Module.h" @@ -216,6 +217,8 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { ArgEmb += Vocab[*Op]; auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -250,6 +253,9 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { // embeddings auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + // Add compare predicate embedding as an additional operand if applicable + if (const auto *IC = dyn_cast<CmpInst>(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -257,41 +263,76 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { } // ==----------------------------------------------------------------------===// -// Vocabulary +// VocabStorage //===----------------------------------------------------------------------===// -unsigned Vocabulary::getDimension() const { - assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab[0].size(); -} - -unsigned Vocabulary::getSlotIndex(unsigned Opcode) { - assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); - return Opcode - 1; // Convert to zero-based index -} - -unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) { - assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID"); - return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID)); -} - -unsigned Vocabulary::getSlotIndex(const Value &Op) { - unsigned Index = static_cast<unsigned>(getOperandKind(&Op)); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return MaxOpcodes + MaxCanonicalTypeIDs + Index; +VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData) + : Sections(std::move(SectionData)), TotalSize([&] { + assert(!Sections.empty() && "Vocabulary has no sections"); + // Compute total size across all sections + size_t Size = 0; + for (const auto &Section : Sections) { + assert(!Section.empty() && "Vocabulary section is empty"); + Size += Section.size(); + } + return Size; + }()), + Dimension([&] { + // Get dimension from the first embedding in the first section - all + // embeddings must have the same dimension + assert(!Sections.empty() && "Vocabulary has no sections"); + assert(!Sections[0].empty() && "First section of vocabulary is empty"); + unsigned ExpectedDim = 
static_cast<unsigned>(Sections[0][0].size()); + + // Verify that all embeddings across all sections have the same + // dimension + [[maybe_unused]] auto allSameDim = + [ExpectedDim](const std::vector<Embedding> &Section) { + return std::all_of(Section.begin(), Section.end(), + [ExpectedDim](const Embedding &Emb) { + return Emb.size() == ExpectedDim; + }); + }; + assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) && + "All embeddings must have the same dimension"); + + return ExpectedDim; + }()) {} + +const Embedding &VocabStorage::const_iterator::operator*() const { + assert(SectionId < Storage->Sections.size() && "Invalid section ID"); + assert(LocalIndex < Storage->Sections[SectionId].size() && + "Local index out of range"); + return Storage->Sections[SectionId][LocalIndex]; +} + +VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() { + ++LocalIndex; + // Check if we need to move to the next section + if (SectionId < Storage->getNumSections() && + LocalIndex >= Storage->Sections[SectionId].size()) { + assert(LocalIndex == Storage->Sections[SectionId].size() && + "Local index should be at the end of the current section"); + LocalIndex = 0; + ++SectionId; + } + return *this; } -const Embedding &Vocabulary::operator[](unsigned Opcode) const { - return Vocab[getSlotIndex(Opcode)]; +bool VocabStorage::const_iterator::operator==( + const const_iterator &Other) const { + return Storage == Other.Storage && SectionId == Other.SectionId && + LocalIndex == Other.LocalIndex; } -const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const { - return Vocab[getSlotIndex(TypeID)]; +bool VocabStorage::const_iterator::operator!=( + const const_iterator &Other) const { + return !(*this == Other); } -const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const { - return Vocab[getSlotIndex(Arg)]; -} +// ==----------------------------------------------------------------------===// +// Vocabulary +//===----------------------------------------------------------------------===// StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); @@ -304,29 +345,6 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { return "UnknownOpcode"; } -StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) { - unsigned Index = static_cast<unsigned>(CType); - assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID"); - return CanonicalTypeNames[Index]; -} - -Vocabulary::CanonicalTypeID -Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) { - unsigned Index = static_cast<unsigned>(TypeID); - assert(Index < MaxTypeIDs && "Invalid TypeID"); - return TypeIDMapping[Index]; -} - -StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) { - return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID)); -} - -StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) { - unsigned Index = static_cast<unsigned>(Kind); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return OperandKindNames[Index]; -} - // Helper function to classify an operand into OperandKind Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { if (isa<Function>(Op)) @@ -338,18 +356,50 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +unsigned Vocabulary::getPredicateLocalIndex(CmpInst::Predicate P) { + if (P >= CmpInst::FIRST_FCMP_PREDICATE && P <= CmpInst::LAST_FCMP_PREDICATE) + return P - 
CmpInst::FIRST_FCMP_PREDICATE; + else + return P - CmpInst::FIRST_ICMP_PREDICATE + + (CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1); +} + +CmpInst::Predicate Vocabulary::getPredicateFromLocalIndex(unsigned LocalIndex) { + unsigned fcmpRange = + CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1; + if (LocalIndex < fcmpRange) + return static_cast<CmpInst::Predicate>(CmpInst::FIRST_FCMP_PREDICATE + + LocalIndex); + else + return static_cast<CmpInst::Predicate>(CmpInst::FIRST_ICMP_PREDICATE + + LocalIndex - fcmpRange); +} + +StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) { + static SmallString<16> PredNameBuffer; + if (Pred < CmpInst::FIRST_ICMP_PREDICATE) + PredNameBuffer = "FCMP_"; + else + PredNameBuffer = "ICMP_"; + PredNameBuffer += CmpInst::getPredicateName(Pred); + return PredNameBuffer; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) return getVocabKeyForOpcode(Pos + 1); // Type - if (Pos < MaxOpcodes + MaxCanonicalTypeIDs) + if (Pos < OperandBaseOffset) return getVocabKeyForCanonicalTypeID( static_cast<CanonicalTypeID>(Pos - MaxOpcodes)); // Operand - return getVocabKeyForOperandKind( - static_cast<OperandKind>(Pos - MaxOpcodes - MaxCanonicalTypeIDs)); + if (Pos < PredicateBaseOffset) + return getVocabKeyForOperandKind( + static_cast<OperandKind>(Pos - OperandBaseOffset)); + // Predicates + return getVocabKeyForPredicate(getPredicate(Pos - PredicateBaseOffset)); } // For now, assume vocabulary is stable unless explicitly invalidated. @@ -359,19 +409,51 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA, return !(PAC.preservedWhenStateless()); } -Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { - VocabVector DummyVocab; - DummyVocab.reserve(NumCanonicalEntries); +VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) { float DummyVal = 0.1f; - // Create a dummy vocabulary with entries for all opcodes, types, and - // operands - for ([[maybe_unused]] unsigned _ : - seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs + - Vocabulary::MaxOperandKinds)) { - DummyVocab.push_back(Embedding(Dim, DummyVal)); + + // Create sections for opcodes, types, operands, and predicates + // Order must match Vocabulary::Section enum + std::vector<std::vector<Embedding>> Sections; + Sections.reserve(4); + + // Opcodes section + std::vector<Embedding> OpcodeSec; + OpcodeSec.reserve(MaxOpcodes); + for (unsigned I = 0; I < MaxOpcodes; ++I) { + OpcodeSec.emplace_back(Dim, DummyVal); DummyVal += 0.1f; } - return DummyVocab; + Sections.push_back(std::move(OpcodeSec)); + + // Types section + std::vector<Embedding> TypeSec; + TypeSec.reserve(MaxCanonicalTypeIDs); + for (unsigned I = 0; I < MaxCanonicalTypeIDs; ++I) { + TypeSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(TypeSec)); + + // Operands section + std::vector<Embedding> OperandSec; + OperandSec.reserve(MaxOperandKinds); + for (unsigned I = 0; I < MaxOperandKinds; ++I) { + OperandSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(OperandSec)); + + // Predicates section + std::vector<Embedding> PredicateSec; + PredicateSec.reserve(MaxPredicateKinds); + for (unsigned I = 0; I < MaxPredicateKinds; ++I) { + PredicateSec.emplace_back(Dim, DummyVal); + DummyVal += 0.1f; + } + Sections.push_back(std::move(PredicateSec)); + + return 
VocabStorage(std::move(Sections)); } // ==----------------------------------------------------------------------===// @@ -417,7 +499,9 @@ Error IR2VecVocabAnalysis::parseVocabSection( // FIXME: Make this optional. We can avoid file reads // by auto-generating a default vocabulary during the build time. -Error IR2VecVocabAnalysis::readVocabulary() { +Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab, + VocabMap &TypeVocab, + VocabMap &ArgVocab) { auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true); if (!BufOrError) return createFileError(VocabFile, BufOrError.getError()); @@ -448,7 +532,9 @@ Error IR2VecVocabAnalysis::readVocabulary() { return Error::success(); } -void IR2VecVocabAnalysis::generateNumMappedVocab() { +void IR2VecVocabAnalysis::generateVocabStorage(VocabMap &OpcVocab, + VocabMap &TypeVocab, + VocabMap &ArgVocab) { // Helper for handling missing entities in the vocabulary. // Currently, we use a zero vector. In the future, we will throw an error to @@ -466,7 +552,6 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { // Handle Opcodes std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes, Embedding(Dim)); - NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes); for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) { StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1); auto It = OpcVocab.find(VocabKey.str()); @@ -475,13 +560,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { else handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(), - NumericOpcodeEmbeddings.end()); // Handle Types - only canonical types are present in vocabulary std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs, Embedding(Dim)); - NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs); for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) { StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID( static_cast<Vocabulary::CanonicalTypeID>(CTypeID)); @@ -491,13 +573,10 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { } handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericTypeEmbeddings.begin(), - NumericTypeEmbeddings.end()); // Handle Arguments/Operands std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds, Embedding(Dim)); - NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds); for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) { Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind); StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind); @@ -508,15 +587,37 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { } handleMissingEntity(VocabKey.str()); } - Vocab.insert(Vocab.end(), NumericArgEmbeddings.begin(), - NumericArgEmbeddings.end()); -} -IR2VecVocabAnalysis::IR2VecVocabAnalysis(const VocabVector &Vocab) - : Vocab(Vocab) {} + // Handle Predicates: part of Operands section. We look up predicate keys + // in ArgVocab. 
+ std::vector<Embedding> NumericPredEmbeddings(Vocabulary::MaxPredicateKinds, + Embedding(Dim, 0)); + for (unsigned PK : seq(0u, Vocabulary::MaxPredicateKinds)) { + StringRef VocabKey = + Vocabulary::getVocabKeyForPredicate(Vocabulary::getPredicate(PK)); + auto It = ArgVocab.find(VocabKey.str()); + if (It != ArgVocab.end()) { + NumericPredEmbeddings[PK] = It->second; + continue; + } + handleMissingEntity(VocabKey.str()); + } + + // Create section-based storage instead of flat vocabulary + // Order must match Vocabulary::Section enum + std::vector<std::vector<Embedding>> Sections(4); + Sections[static_cast<unsigned>(Vocabulary::Section::Opcodes)] = + std::move(NumericOpcodeEmbeddings); // Section::Opcodes + Sections[static_cast<unsigned>(Vocabulary::Section::CanonicalTypes)] = + std::move(NumericTypeEmbeddings); // Section::CanonicalTypes + Sections[static_cast<unsigned>(Vocabulary::Section::Operands)] = + std::move(NumericArgEmbeddings); // Section::Operands + Sections[static_cast<unsigned>(Vocabulary::Section::Predicates)] = + std::move(NumericPredEmbeddings); // Section::Predicates -IR2VecVocabAnalysis::IR2VecVocabAnalysis(VocabVector &&Vocab) - : Vocab(std::move(Vocab)) {} + // Create VocabStorage from organized sections + Vocab.emplace(std::move(Sections)); +} void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) { handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { @@ -528,8 +629,8 @@ IR2VecVocabAnalysis::Result IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { auto Ctx = &M.getContext(); // If vocabulary is already populated by the constructor, use it. - if (!Vocab.empty()) - return Vocabulary(std::move(Vocab)); + if (Vocab.has_value()) + return Vocabulary(std::move(Vocab.value())); // Otherwise, try to read from the vocabulary file. 
if (VocabFile.empty()) { @@ -538,7 +639,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { "set it using --ir2vec-vocab-path"); return Vocabulary(); // Return invalid result } - if (auto Err = readVocabulary()) { + + VocabMap OpcVocab, TypeVocab, ArgVocab; + if (auto Err = readVocabulary(OpcVocab, TypeVocab, ArgVocab)) { emitError(std::move(Err), *Ctx); return Vocabulary(); } @@ -553,9 +656,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { scaleVocabSection(ArgVocab, ArgWeight); // Generate the numeric lookup vocabulary - generateNumMappedVocab(); + generateVocabStorage(OpcVocab, TypeVocab, ArgVocab); - return Vocabulary(std::move(Vocab)); + return Vocabulary(std::move(Vocab.value())); } // ==----------------------------------------------------------------------===// @@ -564,7 +667,7 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { PreservedAnalyses IR2VecPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { - auto Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid"); for (Function &F : M) { @@ -606,7 +709,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, PreservedAnalyses IR2VecVocabPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { - auto IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M); assert(IR2VecVocabulary.isValid() && "IR2Vec Vocabulary is invalid"); // Print each entry diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp index 7b93474..25e7a97 100644 --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -22,6 +22,8 @@ using namespace llvm; #define DEBUG_TYPE "pgo-icall-prom-analysis" +namespace llvm { + // The percent threshold for the direct-call target (this call site vs the // remaining call count) for it to be considered as the promotion target. 
static cl::opt<unsigned> ICPRemainingPercentThreshold( @@ -54,6 +56,8 @@ cl::opt<unsigned> MaxNumVTableAnnotations( "icp-max-num-vtables", cl::init(6), cl::Hidden, cl::desc("Max number of vtables annotated for a vtable load instruction.")); +} // end namespace llvm + bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count, uint64_t TotalCount, uint64_t RemainingCount) { diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index 28b14c2..0fa804f 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -217,7 +217,7 @@ AnalysisKey PluginInlineAdvisorAnalysis::Key; bool InlineAdvisorAnalysis::initializeIR2VecVocabIfRequested( Module &M, ModuleAnalysisManager &MAM) { if (!IR2VecVocabFile.empty()) { - auto IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M); + auto &IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M); if (!IR2VecVocabResult.isValid()) { M.getContext().emitError("Failed to load IR2Vec vocabulary"); return false; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index b5ca6b1..11602d2 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -22,6 +22,8 @@ using namespace llvm::memprof; #define DEBUG_TYPE "memory-profile-info" +namespace llvm { + cl::opt<bool> MemProfReportHintedSizes( "memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations")); @@ -52,6 +54,8 @@ cl::opt<unsigned> MinPercentMaxColdSize( "memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden, cl::desc("Min percent of max cold bytes for critical cold context")); +} // end namespace llvm + bool llvm::memprof::metadataIncludesAllContextSizeInfo() { return MemProfReportHintedSizes || MinClonedColdBytePercent < 100; } diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a317ac4..a60a4bb 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -67,7 +67,6 @@ using namespace llvm::memprof; namespace llvm { FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold = FunctionSummary::FSHT_None; -} // namespace llvm static cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC( "force-summary-edges-cold", cl::Hidden, cl::location(ForceSummaryEdgesCold), @@ -91,6 +90,7 @@ LLVM_ABI extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize; extern cl::opt<unsigned> MaxNumVTableAnnotations; extern cl::opt<bool> MemProfReportHintedSizes; +} // namespace llvm // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. 
Invoked either on an diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index f1c3155..44d7a17 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -24,6 +24,8 @@ #include <optional> using namespace llvm; +namespace llvm { + static cl::opt<bool> PartialProfile( "partial-profile", cl::Hidden, cl::init(false), cl::desc("Specify the current profile is used as a partial profile.")); @@ -44,6 +46,8 @@ static cl::opt<double> PartialSampleProfileWorkingSetSizeScaleFactor( "and the factor to scale the working set size to use the same " "shared thresholds as PGO.")); +} // end namespace llvm + // The profile summary metadata may be attached either by the frontend or by // any backend passes (IR level instrumentation, for example). This method // checks if the Summary is null and if so checks if the summary metadata is now diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index b4f08c3..7900dc7 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -31,11 +31,14 @@ static cl::opt<bool> static cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); + +namespace llvm { cl::opt<bool> CodeGenDataThinLTOTwoRounds( "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, cl::desc("Enable two-round ThinLTO code generation. The first round " "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); +} // end namespace llvm static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index 3fd8cfe..b1cd939 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -26,14 +26,14 @@ static cl::opt<bool> IndexedCodeGenDataReadFunctionMapNames( "disabled to save memory and time for final consumption of the " "indexed CodeGenData in production.")); +namespace llvm { + cl::opt<bool> IndexedCodeGenDataLazyLoading( "indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden, cl::desc( "Lazily load indexed CodeGenData. Enable to save memory and time " "for final consumption of the indexed CodeGenData in production.")); -namespace llvm { - static Expected<std::unique_ptr<MemoryBuffer>> setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { auto BufferOrErr = Filename.str() == "-" ? 
MemoryBuffer::getSTDIN() diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 56e13f0..884c3f1 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2362,6 +2362,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MachineInstr::copyFlagsFromInstruction(CI)); return true; } + case Intrinsic::modf: { + ArrayRef<Register> VRegs = getOrCreateVRegs(CI); + MIRBuilder.buildModf(VRegs[0], VRegs[1], + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; + } case Intrinsic::sincos: { ArrayRef<Register> VRegs = getOrCreateVRegs(CI); MIRBuilder.buildFSincos(VRegs[0], VRegs[1], diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 03dfa6f..cffaf7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -471,6 +471,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(TANH_F); case TargetOpcode::G_FSINCOS: RTLIBCASE(SINCOS_F); + case TargetOpcode::G_FMODF: + RTLIBCASE(MODF_F); case TargetOpcode::G_FLOG10: RTLIBCASE(LOG10_F); case TargetOpcode::G_FLOG: @@ -703,6 +705,46 @@ LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall( } LegalizerHelper::LegalizeResult +LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, + unsigned Size, Type *OpType, + LostDebugLocObserver &LocObserver) { + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register DstFrac = MI.getOperand(0).getReg(); + Register DstInt = MI.getOperand(1).getReg(); + Register Src = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstFrac); + + int MemSize = DstTy.getSizeInBytes(); + Align Alignment = getStackTemporaryAlignment(DstTy); + const DataLayout &DL = MIRBuilder.getDataLayout(); + unsigned AddrSpace = DL.getAllocaAddrSpace(); + MachinePointerInfo PtrInfo; + + Register StackPtrInt = + createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo) + .getReg(0); + + auto &Ctx = MF.getFunction().getContext(); + auto LibcallResult = createLibcall( + MIRBuilder, getRTLibDesc(MI.getOpcode(), Size), {DstFrac, OpType, 0}, + {{Src, OpType, 0}, {StackPtrInt, PointerType::get(Ctx, AddrSpace), 1}}, + LocObserver, &MI); + + if (LibcallResult != LegalizeResult::Legalized) + return LegalizerHelper::UnableToLegalize; + + MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment); + + MIRBuilder.buildLoad(DstInt, StackPtrInt, *LoadMMOInt); + MI.eraseFromParent(); + + return LegalizerHelper::Legalized; +} + +LegalizerHelper::LegalizeResult llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstr &MI, LostDebugLocObserver &LocObserver) { auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); @@ -1341,6 +1383,16 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { } return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); } + case TargetOpcode::G_FMODF: { + LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = LLTy.getSizeInBits(); + Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); + return UnableToLegalize; + } + return 
emitModfLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); + } case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: case TargetOpcode::G_INTRINSIC_LRINT: @@ -3333,6 +3385,16 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FMODF: { + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); + + widenScalarDst(MI, WideTy, 1, TargetOpcode::G_FPTRUNC); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), --MIRBuilder.getInsertPt()); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_FPOWI: case TargetOpcode::G_FLDEXP: case TargetOpcode::G_STRICT_FLDEXP: { @@ -5472,6 +5534,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_LROUND: case G_LLROUND: case G_INTRINSIC_TRUNC: + case G_FMODF: case G_FCOS: case G_FSIN: case G_FTAN: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 477e5c1..c2d474f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -34,7 +34,7 @@ cl::opt<bool> llvm::DisableGISelLegalityCheck( cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"), cl::Hidden); -cl::opt<bool> VerboseVerifyLegalizerInfo( +static cl::opt<bool> VerboseVerifyLegalizerInfo( "verbose-gisel-verify-legalizer-info", cl::desc("Print more information to dbgs about GlobalISel legalizer rules " "being verified"), diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp index f8268b8..366755a 100644 --- a/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -127,7 +128,7 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; -char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; +char &llvm::MachineRegionInfoPassID = MachineRegionInfoPass::ID; INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, "Detect single entry single exit regions", true, true) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 729a57e..e1d39d6 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1929,7 +1929,27 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() { const MachineOperand &Src = Def->getOperand(1); if (Src.isUndef()) return ValueTrackerResult(); - return ValueTrackerResult(Src.getReg(), Src.getSubReg()); + + Register SrcReg = Src.getReg(); + unsigned SubReg = Src.getSubReg(); + if (DefSubReg) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + SubReg = TRI->composeSubRegIndices(SubReg, DefSubReg); + + if (SrcReg.isVirtual()) { + // TODO: Try constraining on rewrite if we can + const TargetRegisterClass *RegRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcWithSubRC = + TRI->getSubClassWithSubReg(RegRC, SubReg); + if (RegRC != SrcWithSubRC) + return ValueTrackerResult(); + } else { + if (!TRI->getSubReg(SrcReg, SubReg)) + return ValueTrackerResult(); + } + } + + return 
ValueTrackerResult(SrcReg, SubReg); } ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 8e6cf3e..7fe13a3 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1406,8 +1406,24 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint, continue; // Check if VirtReg interferes with OtherReg after this COPY instruction. - if (!IsDef && VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot())) - continue; + if (Opnd.readsReg()) { + SlotIndex Index = LIS->getInstructionIndex(Instr).getRegSlot(); + + if (SubReg) { + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); + if (IsDef) + Mask = ~Mask; + + if (any_of(VirtReg.subranges(), [=](const LiveInterval::SubRange &S) { + return (S.LaneMask & Mask).any() && S.liveAt(Index); + })) { + continue; + } + } else { + if (VirtReg.liveAt(Index)) + continue; + } + } MCRegister OtherPhysReg = OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg); @@ -2419,25 +2435,28 @@ void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) { unsigned SubReg = Opnd.getSubReg(); // Get the current assignment. - MCRegister OtherPhysReg = - OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg); - if (OtherSubReg) { - if (OtherReg.isPhysical()) { - MCRegister Tuple = - TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC); - if (!Tuple) - continue; - OtherPhysReg = Tuple; - } else { - // TODO: There should be a hinting mechanism for subregisters - if (SubReg != OtherSubReg) - continue; - } + MCRegister OtherPhysReg; + if (OtherReg.isPhysical()) { + if (OtherSubReg) + OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, OtherSubReg, RC); + else if (SubReg) + OtherPhysReg = TRI->getMatchingSuperReg(OtherReg, SubReg, RC); + else + OtherPhysReg = OtherReg; + } else { + OtherPhysReg = VRM->getPhys(OtherReg); + // TODO: Should find matching superregister, but applying this in the + // non-hint case currently causes regressions + + if (SubReg && OtherSubReg && SubReg != OtherSubReg) + continue; } // Push the collected information. - Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg, - OtherPhysReg)); + if (OtherPhysReg) { + Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg, + OtherPhysReg)); + } } } @@ -2466,15 +2485,13 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // We have a broken hint, check if it is possible to fix it by // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted // some register and PhysReg may be available for the other live-ranges. - SmallSet<Register, 4> Visited; - SmallVector<Register, 2> RecoloringCandidates; HintsInfo Info; Register Reg = VirtReg.reg(); MCRegister PhysReg = VRM->getPhys(Reg); // Start the recoloring algorithm from the input live-interval, then // it will propagate to the ones that are copy-related with it. - Visited.insert(Reg); - RecoloringCandidates.push_back(Reg); + SmallSet<Register, 4> Visited = {Reg}; + SmallVector<Register, 2> RecoloringCandidates = {Reg}; LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI) << '(' << printReg(PhysReg, TRI) << ")\n"); @@ -2482,12 +2499,10 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { do { Reg = RecoloringCandidates.pop_back_val(); - // We cannot recolor physical register. - if (Reg.isPhysical()) - continue; + MCRegister CurrPhys = VRM->getPhys(Reg); // This may be a skipped register. 
- if (!VRM->hasPhys(Reg)) { + if (!CurrPhys) { assert(!shouldAllocateRegister(Reg) && "We have an unallocated variable which should have been handled"); continue; @@ -2496,7 +2511,6 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // Get the live interval mapped with this virtual register to be able // to check for the interference with the new color. LiveInterval &LI = LIS->getInterval(Reg); - MCRegister CurrPhys = VRM->getPhys(Reg); // Check that the new color matches the register class constraints and // that it is free for this live range. if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) || @@ -2533,7 +2547,8 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) { // Push all copy-related live-ranges to keep reconciling the broken // hints. for (const HintInfo &HI : Info) { - if (Visited.insert(HI.Reg).second) + // We cannot recolor physical register. + if (HI.Reg.isVirtual() && Visited.insert(HI.Reg).second) RecoloringCandidates.push_back(HI.Reg); } } while (!RecoloringCandidates.empty()); diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index 9c9cc1f..280946b 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -23,6 +23,8 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; + +namespace llvm { LLVM_ABI cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), @@ -33,6 +35,8 @@ LLVM_ABI cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight", cl::init(1.0), cl::Hidden); +} // end namespace llvm + #define DEBUG_TYPE "regalloc-score" RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) { diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7ac1aef..ebfea8e 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -584,14 +584,14 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { return DstReg == Dst; // This is a partial register copy. Check that the parts match. return Register(TRI.getSubReg(DstReg, SrcSub)) == Dst; - } else { - // DstReg is virtual. - if (DstReg != Dst) - return false; - // Registers match, do the subregisters line up? - return TRI.composeSubRegIndices(SrcIdx, SrcSub) == - TRI.composeSubRegIndices(DstIdx, DstSub); } + + // DstReg is virtual. + if (DstReg != Dst) + return false; + // Registers match, do the subregisters line up? + return TRI.composeSubRegIndices(SrcIdx, SrcSub) == + TRI.composeSubRegIndices(DstIdx, DstSub); } void RegisterCoalescerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { @@ -2914,8 +2914,7 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo, if ((V.ValidLanes & OtherV.ValidLanes).any()) // Overlapping lanes can't be resolved. return CR_Impossible; - else - return CR_Merge; + return CR_Merge; } // No simultaneous def. Is Other live at the def? diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ff7cd66..87d5453 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -6256,17 +6256,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { // FIXME: Not all targets may support EVL in VP_LOAD. 
These will have been // removed from the IR by the ExpandVectorPredication pass but we're // reintroducing them here. - EVT LdVT = LD->getMemoryVT(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT); - EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WideVT.getVectorElementCount()); + EVT VT = LD->getValueType(0); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + EVT WideMaskVT = getSetCCResultType(WideVT); + if (ExtType == ISD::NON_EXTLOAD && TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) && TLI.isTypeLegal(WideMaskVT)) { SDLoc DL(N); SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT); SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(), - LdVT.getVectorElementCount()); + VT.getVectorElementCount()); SDValue NewLoad = DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask, @@ -6303,6 +6303,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { return Result; } + if (VT.isVector()) { + // If all else fails replace the load with a wide masked load. + SDLoc DL(N); + EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + + SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount()); + SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT, + DAG.getConstant(0, DL, IdxVT), Len); + + SDValue NewLoad = DAG.getMaskedLoad( + WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask, + DAG.getPOISON(WideVT), LD->getMemoryVT(), LD->getMemOperand(), + LD->getAddressingMode(), LD->getExtensionType()); + + ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1)); + return NewLoad; + } + report_fatal_error("Unable to widen vector load"); } @@ -7516,8 +7534,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { SDValue StVal = ST->getValue(); EVT StVT = StVal.getValueType(); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT); - EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WideVT.getVectorElementCount()); + EVT WideMaskVT = getSetCCResultType(WideVT); if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) && TLI.isTypeLegal(WideMaskVT)) { @@ -7540,6 +7557,22 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); } + if (StVT.isVector()) { + // If all else fails replace the store with a wide masked store. 
+ SDLoc DL(N); + EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + + SDValue WideStVal = GetWidenedVector(StVal); + SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount()); + SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT, + DAG.getConstant(0, DL, IdxVT), Len); + + return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(), + ST->getOffset(), Mask, ST->getMemoryVT(), + ST->getMemOperand(), ST->getAddressingMode(), + ST->isTruncatingStore()); + } + report_fatal_error("Unable to widen vector store"); } @@ -8298,8 +8331,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, AAMDNodes AAInfo = LD->getAAInfo(); if (LdVT.isScalableVector()) - report_fatal_error("Generating widen scalable extending vector loads is " - "not yet supported"); + return SDValue(); EVT EltVT = WidenVT.getVectorElementType(); EVT LdEltVT = LdVT.getVectorElementType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8fc7eab..95f53fe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4762,6 +4762,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::AssertZext: Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; + case ISD::FREEZE: + if (isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts, + /*PoisonOnly=*/false)) + return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + break; case ISD::MERGE_VALUES: return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp index 096a33c..ec75dc3 100644 --- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp +++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp @@ -72,7 +72,7 @@ struct StackFrameLayoutAnalysis { : Slot(Idx), Size(MFI.getObjectSize(Idx)), Align(MFI.getObjectAlign(Idx).value()), Offset(Offset), SlotTy(Invalid), Scalable(false) { - Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector; + Scalable = MFI.isScalableStackID(Idx); if (MFI.isSpillSlotObjectIndex(Idx)) SlotTy = SlotType::Spill; else if (MFI.isFixedObjectIndex(Idx)) diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp index c1017d8..d973a47 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp @@ -148,7 +148,7 @@ std::error_code LVSplitContext::open(std::string ContextName, return std::error_code(); } -LVReader *CurrentReader = nullptr; +static LVReader *CurrentReader = nullptr; LVReader &LVReader::getInstance() { if (CurrentReader) return *CurrentReader; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index a8bb34f..33ca46c 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -30,6 +30,8 @@ #include "llvm/Support/Compiler.h" using namespace llvm; +namespace llvm { + // FIXME: Flag used for an ablation performance test, Issue #147390. Placing it // here because referencing IR should be feasible from anywhere. Will be // removed after the ablation test. 
@@ -38,6 +40,8 @@ cl::opt<bool> ProfcheckDisableMetadataFixes( cl::desc( "Disable metadata propagation fixes discovered through Issue #147390")); +} // end namespace llvm + InsertPosition::InsertPosition(Instruction *InsertBefore) : InsertAt(InsertBefore ? InsertBefore->getIterator() : InstListType::iterator()) {} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index e5e062d..a347609 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -cl::opt<bool> UseDerefAtPointSemantics( +static cl::opt<bool> UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 7b25262..e6544f3 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -75,9 +75,10 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +namespace llvm { extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; - extern cl::opt<bool> ForceImportAll; +} // end namespace llvm namespace llvm { /// Enable global value internalization in LTO. diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index 0dd378e..329dcbf 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -120,14 +120,15 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) return errorCodeToError(object_error::parse_failed); - OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName); + std::unique_ptr<OffloadBundleFatBin> TheBundle( + new OffloadBundleFatBin(Buf, FileName)); // Read the Bundle Entries Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); if (Err) return Err; - return std::unique_ptr<OffloadBundleFatBin>(TheBundle); + return std::move(TheBundle); } Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 256cf9d..7069e8d 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -150,6 +150,8 @@ using namespace llvm; +namespace llvm { + static cl::opt<InliningAdvisorMode> UseInlineAdvisor( "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), @@ -305,7 +307,6 @@ static cl::opt<std::string> InstrumentColdFuncOnlyPath( extern cl::opt<std::string> UseCtxProfile; extern cl::opt<bool> PGOInstrumentColdFunctionOnly; -namespace llvm { extern cl::opt<bool> EnableMemProfContextDisambiguation; } // namespace llvm @@ -610,7 +611,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Jump table to switch conversion. 
if (EnableJumpTableToSwitch) - FPM.addPass(JumpTableToSwitchPass()); + FPM.addPass(JumpTableToSwitchPass( + /*InLTO=*/Phase == ThinOrFullLTOPhase::ThinLTOPostLink || + Phase == ThinOrFullLTOPhase::FullLTOPostLink)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); diff --git a/llvm/lib/ProfileData/MemProfCommon.cpp b/llvm/lib/ProfileData/MemProfCommon.cpp index a13a291..cfd2efd 100644 --- a/llvm/lib/ProfileData/MemProfCommon.cpp +++ b/llvm/lib/ProfileData/MemProfCommon.cpp @@ -20,6 +20,8 @@ using namespace llvm; using namespace llvm::memprof; +namespace llvm { + // Upper bound on lifetime access density (accesses per byte per lifetime sec) // for marking an allocation cold. LLVM_ABI cl::opt<float> MemProfLifetimeAccessDensityColdThreshold( @@ -48,6 +50,8 @@ LLVM_ABI cl::opt<bool> cl::desc("Enable use of hot hints (only supported for " "unambigously hot allocations)")); +} // end namespace llvm + AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, uint64_t AllocCount, uint64_t TotalLifetime) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ab5c6f3..f5f7b65 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -324,7 +324,37 @@ AArch64FrameLowering::getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(const AArch64FrameLowering &, MachineFunction &MF); -// Conservatively, returns true if the function is likely to have an SVE vectors +enum class AssignObjectOffsets { No, Yes }; +/// Process all the SVE stack objects and the SVE stack size and offsets for +/// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE +/// stack sizes set). Returns the size of the SVE stack. +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets, + bool SplitSVEObjects = false); + +static unsigned getStackHazardSize(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); +} + +/// Returns true if PPRs are spilled as ZPRs. +static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { + return MF.getSubtarget().getRegisterInfo()->getSpillSize( + AArch64::PPRRegClass) == 16; +} + +StackOffset +AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizeZPR()); +} + +StackOffset +AArch64FrameLowering::getPPRStackSize(const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return StackOffset::getScalable(AFI->getStackSizePPR()); +} + +// Conservatively, returns true if the function is likely to have SVE vectors // on the stack. This function is safe to be called before callee-saves or // object offsets have been determined. static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, @@ -338,7 +368,7 @@ static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, const MachineFrameInfo &MFI = MF.getFrameInfo(); for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FI)) return true; } @@ -482,13 +512,6 @@ AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, } } -/// Returns the size of the entire SVE stackframe (calleesaves + spills). 
-StackOffset -AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const { - const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); -} - bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -514,7 +537,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { !Subtarget.hasSVE(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF) || LowerQRegCopyThroughMem); + AFI->hasSVEStackSize() || LowerQRegCopyThroughMem); } /// hasFPImpl - Return true if the specified function should have a dedicated @@ -557,7 +580,7 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { // CFA in either of these cases. if (AFI.needsDwarfUnwindInfo(MF) && ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) && - (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0))) + (!AFI.hasCalculatedStackSizeSVE() || AFI.hasSVEStackSize()))) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. @@ -1126,10 +1149,6 @@ static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } -static unsigned getStackHazardSize(const MachineFunction &MF) { - return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); -} - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -1212,7 +1231,9 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); - StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // For VLA-area objects, just emit an offset at the end of the stack frame. // Whilst not quite correct, these objects do live at the end of the frame and @@ -1228,7 +1249,7 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); bool FPAfterSVECalleeSaves = isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FI)) { if (FPAfterSVECalleeSaves && -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) return StackOffset::getScalable(ObjectOffset); @@ -1294,7 +1315,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; + bool isSVE = MFI.isScalableStackID(FI); return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } @@ -1313,7 +1334,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); - const StackOffset &SVEStackSize = getSVEStackSize(MF); + const StackOffset SVEStackSize = getSVEStackSize(MF); // Use frame pointer to reference fixed objects. 
Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -1615,10 +1636,13 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, FirstReg = Count - 1; } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); - int ScalableByteOffset = - FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize(); + int ScalableByteOffset = FPAfterSVECalleeSaves + ? 0 + : AFI->getZPRCalleeSavedStackSize() + + AFI->getPPRCalleeSavedStackSize(); bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); Register LastReg = 0; + bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex(); // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -1648,7 +1672,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } // Add the stack hazard size as we transition from GPR->FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (HasCSHazardPadding && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(RPI.Reg1)) ByteOffset += StackFillDir * StackHazardSize; @@ -1656,7 +1680,7 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { + if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { @@ -2021,10 +2045,14 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } // Update the StackIDs of the SVE stack slots. MachineFrameInfo &MFI = MF.getFrameInfo(); - if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) { + if (RPI.Type == RegPairInfo::ZPR) { MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector); if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); + } else if (RPI.Type == RegPairInfo::PPR) { + MFI.setStackID(FrameIdxReg1, TargetStackID::ScalablePredicateVector); + if (RPI.isPaired()) + MFI.setStackID(FrameIdxReg2, TargetStackID::ScalablePredicateVector); } } return true; @@ -2232,8 +2260,7 @@ void AArch64FrameLowering::determineStackHazardSlot( for (auto &MI : MBB) { std::optional<int> FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || - AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI] |= 2; else FrameObjects[*FI] |= 1; @@ -2260,10 +2287,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2384,15 +2412,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Calculates the callee saved stack size. 
unsigned CSStackSize = 0; - unsigned SVECSStackSize = 0; + unsigned ZPRCSStackSize = 0; + unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { auto *RC = TRI->getMinimalPhysRegClass(Reg); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); - if (AArch64::PPRRegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += SpillSize; + bool IsZPR = AArch64::ZPRRegClass.contains(Reg); + bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); + if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF))) + ZPRCSStackSize += SpillSize; + else if (IsPPR) + PPRCSStackSize += SpillSize; else CSStackSize += SpillSize; } @@ -2402,17 +2434,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // only 64-bit GPRs can be added to SavedRegs. unsigned NumSavedRegs = SavedRegs.count(); - // Increase the callee-saved stack size if the function has streaming mode - // changes, as we will need to spill the value of the VG register. - if (requiresSaveVG(MF)) - CSStackSize += 8; - // Determine if a Hazard slot should be used, and increase the CSStackSize by // StackHazardSize if so. determineStackHazardSlot(MF, SavedRegs); if (AFI->hasStackHazardSlotIndex()) CSStackSize += getStackHazardSize(MF); + // Increase the callee-saved stack size if the function has streaming mode + // changes, as we will need to spill the value of the VG register. + if (requiresSaveVG(MF)) + CSStackSize += 8; + // If we must call __arm_get_current_vg in the prologue preserve the LR. if (requiresSaveVG(MF) && !Subtarget.hasSVE()) SavedRegs.set(AArch64::LR); @@ -2433,8 +2465,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, }); // If any callee-saved registers are used, the frame cannot be eliminated. - int64_t SVEStackSize = - alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); + auto [ZPRLocalStackSize, PPRLocalStackSize] = + determineSVEStackSizes(MF, AssignObjectOffsets::No); + uint64_t SVELocals = ZPRLocalStackSize + PPRLocalStackSize; + uint64_t SVEStackSize = + alignTo(ZPRCSStackSize + PPRCSStackSize + SVELocals, 16); bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize @@ -2519,7 +2554,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); - AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); + AFI->setSVECalleeSavedStackSize(ZPRCSStackSize, alignTo(PPRCSStackSize, 16)); } bool AArch64FrameLowering::assignCalleeSavedSpillSlots( @@ -2658,7 +2693,6 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, assert((Max == std::numeric_limits<int>::min() || Max + 1 == CS.getFrameIdx()) && "SVE CalleeSaves are not consecutive"); - Min = std::min(Min, CS.getFrameIdx()); Max = std::max(Max, CS.getFrameIdx()); } @@ -2666,43 +2700,65 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, return Min != std::numeric_limits<int>::max(); } -// Process all the SVE stack objects and determine offsets for each -// object. If AssignOffsets is true, the offsets get assigned. -// Fills in the first and last callee-saved frame indices into -// Min/MaxCSFrameIndex, respectively. -// Returns the size of the stack. 
-static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, - int &MinCSFrameIndex, - int &MaxCSFrameIndex, - bool AssignOffsets) { +static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, + AssignObjectOffsets AssignOffsets, + bool SplitSVEObjects) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + + SVEStackSizes SVEStack{}; + + // With SplitSVEObjects we maintain separate stack offsets for predicates + // (PPRs) and SVE vectors (ZPRs). When SplitSVEObjects is disabled predicates + // are included in the SVE vector area. + uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; + uint64_t &PPRStackTop = + SplitSVEObjects ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; + #ifndef NDEBUG // First process all fixed stack objects. for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - assert(MFI.getStackID(I) != TargetStackID::ScalableVector && + assert(!MFI.isScalableStackID(I) && "SVE vectors should never be passed on the stack by value, only by " "reference."); #endif - auto Assign = [&MFI](int FI, int64_t Offset) { + auto AllocateObject = [&](int FI) { + uint64_t &StackTop = MFI.getStackID(FI) == TargetStackID::ScalableVector + ? ZPRStackTop + : PPRStackTop; + + // FIXME: Given that the length of SVE vectors is not necessarily a power of + // two, we'd need to align every object dynamically at runtime if the + // alignment is larger than 16. This is not yet supported. + Align Alignment = MFI.getObjectAlign(FI); + if (Alignment > Align(16)) + report_fatal_error( + "Alignment of scalable vectors > 16 bytes is not yet supported"); + + StackTop += MFI.getObjectSize(FI); + StackTop = alignTo(StackTop, Alignment); + + assert(StackTop < std::numeric_limits<int64_t>::max() && + "SVE StackTop far too large?!"); + + int64_t Offset = -int64_t(StackTop); + if (AssignOffsets == AssignObjectOffsets::Yes) + MFI.setObjectOffset(FI, Offset); + LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); - MFI.setObjectOffset(FI, Offset); }; - int64_t Offset = 0; - // Then process all callee saved slots. + int MinCSFrameIndex, MaxCSFrameIndex; if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Assign offsets to the callee save slots. - for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { - Offset += MFI.getObjectSize(I); - Offset = alignTo(Offset, MFI.getObjectAlign(I)); - if (AssignOffsets) - Assign(I, -Offset); - } + for (int FI = MinCSFrameIndex; FI <= MaxCSFrameIndex; ++FI) + AllocateObject(FI); } - // Ensure that the Callee-save area is aligned to 16bytes. - Offset = alignTo(Offset, Align(16U)); + // Ensure the CS area is 16-byte aligned. + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); // Create a buffer of SVE objects to allocate and sort it. 
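The AllocateObject lambda above follows a simple bump-and-align pattern: grow the relevant area, realign its running total, and assign the object a negative offset from the top of that area. A minimal standalone sketch of that pattern (plain integers standing in for scalable bytes, not the patch itself):

#include <cassert>
#include <cstdint>

// Round Value up to a power-of-two alignment.
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return (Value + Align - 1) & ~(Align - 1);
}

// Grow the area, realign its running total, and hand back the object's offset
// as a negative displacement from the top of that area.
int64_t allocateScalableObject(uint64_t &StackTop, uint64_t Size,
                               uint64_t Align) {
  StackTop += Size;
  StackTop = alignToPow2(StackTop, Align);
  return -static_cast<int64_t>(StackTop);
}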
SmallVector<int, 8> ObjectsToAllocate; @@ -2715,48 +2771,31 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) ObjectsToAllocate.push_back(StackProtectorFI); } - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { - unsigned StackID = MFI.getStackID(I); - if (StackID != TargetStackID::ScalableVector) - continue; - if (I == StackProtectorFI) + + for (int FI = 0, E = MFI.getObjectIndexEnd(); FI != E; ++FI) { + if (FI == StackProtectorFI || MFI.isDeadObjectIndex(FI)) continue; - if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) + if (MaxCSFrameIndex >= FI && FI >= MinCSFrameIndex) continue; - if (MFI.isDeadObjectIndex(I)) + + if (MFI.getStackID(FI) != TargetStackID::ScalableVector && + MFI.getStackID(FI) != TargetStackID::ScalablePredicateVector) continue; - ObjectsToAllocate.push_back(I); + ObjectsToAllocate.push_back(FI); } // Allocate all SVE locals and spills - for (unsigned FI : ObjectsToAllocate) { - Align Alignment = MFI.getObjectAlign(FI); - // FIXME: Given that the length of SVE vectors is not necessarily a power of - // two, we'd need to align every object dynamically at runtime if the - // alignment is larger than 16. This is not yet supported. - if (Alignment > Align(16)) - report_fatal_error( - "Alignment of scalable vectors > 16 bytes is not yet supported"); + for (unsigned FI : ObjectsToAllocate) + AllocateObject(FI); - Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment); - if (AssignOffsets) - Assign(FI, -Offset); - } + PPRStackTop = alignTo(PPRStackTop, Align(16U)); + ZPRStackTop = alignTo(ZPRStackTop, Align(16U)); - return Offset; -} + if (AssignOffsets == AssignObjectOffsets::Yes) + AFI->setStackSizeSVE(SVEStack.ZPRStackSize, SVEStack.PPRStackSize); -int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets( - MachineFrameInfo &MFI) const { - int MinCSFrameIndex, MaxCSFrameIndex; - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false); -} - -int64_t AArch64FrameLowering::assignSVEStackObjectOffsets( - MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const { - return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, - true); + return SVEStack; } /// Attempts to scavenge a register from \p ScavengeableRegs given the used @@ -3070,12 +3109,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); - int MinCSFrameIndex, MaxCSFrameIndex; - int64_t SVEStackSize = - assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex); - - AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U)); - AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex); + (void)determineSVEStackSizes(MF, AssignObjectOffsets::Yes); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. @@ -3597,7 +3631,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( // Go to common code if we cannot provide sp + offset. 
if (MFI.hasVarSizedObjects() || - MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() || + MF.getInfo<AArch64FunctionInfo>()->hasSVEStackSize() || MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF)) return getFrameIndexReference(MF, FI, FrameReg); @@ -3721,8 +3755,7 @@ void AArch64FrameLowering::orderFrameObjects( if (AFI.hasStackHazardSlotIndex()) { std::optional<int> FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || - AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; else FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; @@ -4080,7 +4113,7 @@ void AArch64FrameLowering::emitRemarks( } unsigned RegTy = StackAccess::AccessType::GPR; - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FrameIdx)) { // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO // spill/fill the predicate as a data vector (so are an FPR access). if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 7bba053..38aa28b1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -24,6 +24,11 @@ class AArch64FunctionInfo; class AArch64PrologueEmitter; class AArch64EpilogueEmitter; +struct SVEStackSizes { + uint64_t ZPRStackSize{0}; + uint64_t PPRStackSize{0}; +}; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -124,6 +129,7 @@ public: return false; case TargetStackID::Default: case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::NoAlloc: return true; } @@ -132,7 +138,8 @@ public: bool isStackIdSafeForLocalArea(unsigned StackId) const override { // We don't support putting SVE objects into the pre-allocated local // frame block at the moment. - return StackId != TargetStackID::ScalableVector; + return (StackId != TargetStackID::ScalableVector && + StackId != TargetStackID::ScalablePredicateVector); } void @@ -145,7 +152,16 @@ public: bool requiresSaveVG(const MachineFunction &MF) const; - StackOffset getSVEStackSize(const MachineFunction &MF) const; + /// Returns the size of the entire ZPR stackframe (calleesaves + spills). + StackOffset getZPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire PPR stackframe (calleesaves + spills). + StackOffset getPPRStackSize(const MachineFunction &MF) const; + + /// Returns the size of the entire SVE stackframe (PPRs + ZPRs). + StackOffset getSVEStackSize(const MachineFunction &MF) const { + return getZPRStackSize(MF) + getPPRStackSize(MF); + } friend class AArch64PrologueEpilogueCommon; friend class AArch64PrologueEmitter; @@ -165,10 +181,6 @@ private: /// Returns true if CSRs should be paired. bool producePairRegisters(MachineFunction &MF) const; - int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const; - int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, - int &MinCSFrameIndex, - int &MaxCSFrameIndex) const; /// Make a determination whether a Hazard slot is used and create it if /// needed. 
void determineStackHazardSlot(MachineFunction &MF, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6a1b06e..35bbb0c0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain}; SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -2110,14 +2111,15 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc) { - SDValue ZtValue; - SmallVector<SDValue, 4> Ops; if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - Ops.push_back(ZtValue); - Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {ZtValue, + createZMulTuple({Node->getOperand(3), Node->getOperand(4)}), + Chain}; + SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -7495,7 +7497,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. - if (MFI.getStackID(FI) == TargetStackID::ScalableVector) { + if (MFI.isScalableStackID(FI)) { Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); return true; @@ -7541,7 +7543,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, int FI = cast<FrameIndexSDNode>(Base)->getIndex(); // We can only encode VL scaled offsets, so only fold in frame indexes // referencing SVE objects. 
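The repeated isScalableStackID() checks in this patch replace direct comparisons against TargetStackID::ScalableVector. A minimal sketch of the intended predicate, with stand-in enumerators rather than the real TargetStackID values:

enum class StackID { Default, ScalableVector, ScalablePredicateVector };

// A scalable slot is now either an SVE data-vector slot or the new
// predicate-vector slot; both are addressed with VL-scaled offsets.
bool isScalableStackID(StackID ID) {
  return ID == StackID::ScalableVector ||
         ID == StackID::ScalablePredicateVector;
}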
- if (MFI.getStackID(FI) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FI)) Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 45f5235..c2a482a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); @@ -6617,7 +6618,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } - case Intrinsic::aarch64_neon_vsri: case Intrinsic::aarch64_neon_vsli: case Intrinsic::aarch64_sve_sri: @@ -9256,8 +9256,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, (MI.getOpcode() == AArch64::ADDXri || MI.getOpcode() == AArch64::SUBXri)) { const MachineOperand &MO = MI.getOperand(1); - if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) == - TargetStackID::ScalableVector) + if (MO.isFI() && MF.getFrameInfo().isScalableStackID(MO.getIndex())) MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, /*IsImplicit=*/true)); } @@ -9704,8 +9703,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); - if (isScalable) - MFI.setStackID(FI, TargetStackID::ScalableVector); + if (isScalable) { + bool IsPred = VA.getValVT() == MVT::aarch64svcount || + VA.getValVT().getVectorElementType() == MVT::i1; + MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector + : TargetStackID::ScalableVector); + } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); SDValue Ptr = DAG.getFrameIndex( @@ -15154,9 +15157,7 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { : Shift.getOperand(1); unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; - SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm); - - return ResultSLI; + return DAG.getNode(Inst, DL, VT, X, Y, Imm); } static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) { @@ -27234,6 +27235,21 @@ static bool isLanes1toNKnownZero(SDValue Op) { } } +// Return true if the vector operation can guarantee that the first lane of its +// result is active. 
+static bool isLane0KnownActive(SDValue Op) { + switch (Op.getOpcode()) { + default: + return false; + case AArch64ISD::REINTERPRET_CAST: + return isLane0KnownActive(Op->getOperand(0)); + case ISD::SPLAT_VECTOR: + return isOneConstant(Op.getOperand(0)); + case AArch64ISD::PTRUE: + return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all; + }; +} + static SDValue removeRedundantInsertVectorElt(SDNode *N) { assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); SDValue InsertVec = N->getOperand(0); @@ -27519,6 +27535,32 @@ static SDValue performMULLCombine(SDNode *N, return SDValue(); } +static SDValue performPTestFirstCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SDLoc DL(N); + auto Mask = N->getOperand(0); + auto Pred = N->getOperand(1); + + if (!isLane0KnownActive(Mask)) + return SDValue(); + + if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST) + Pred = Pred->getOperand(0); + + if (Pred->getOpcode() == ISD::CONCAT_VECTORS) { + Pred = Pred->getOperand(0); + Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred); + return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask, + Pred); + } + + return SDValue(); +} + static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -27875,6 +27917,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::UMULL: case AArch64ISD::PMULL: return performMULLCombine(N, DCI, DAG); + case AArch64ISD::PTEST_FIRST: + return performPTestFirstCombine(N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (N->getConstantOperandVal(1)) { @@ -29564,7 +29608,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { // than doing it here in finalizeLowering. 
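As a rough standalone model of the isLane0KnownActive / PTEST_FIRST reasoning added above (the node kinds are stand-ins for the SelectionDAG opcodes, not real DAG types):

enum class Kind { ReinterpretCast, SplatVector, PTrueAll, PTruePartial, Other };

struct Node {
  Kind K;
  const Node *Operand = nullptr; // only meaningful for ReinterpretCast
  bool SplatIsOne = false;       // only meaningful for SplatVector
};

bool isLane0KnownActive(const Node &N) {
  switch (N.K) {
  case Kind::ReinterpretCast:
    // Reinterpreting a predicate does not change which lanes are set.
    return N.Operand && isLane0KnownActive(*N.Operand);
  case Kind::SplatVector:
    // splat(1) makes every lane, including lane 0, active.
    return N.SplatIsOne;
  case Kind::PTrueAll:
    // ptrue with the "all" pattern activates every lane.
    return true;
  default:
    return false;
  }
}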
if (MFI.hasStackProtectorIndex()) { for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { - if (MFI.getStackID(i) == TargetStackID::ScalableVector && + if (MFI.isScalableStackID(i) && MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { MFI.setStackID(MFI.getStackProtectorIndex(), TargetStackID::ScalableVector); diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f07d351..6ef0a95 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10176,28 +10176,6 @@ multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm, (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } -multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> { - def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, - FPR8, FPR8, vecshiftR8, asm, []> { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, - FPR16, FPR16, vecshiftR16, asm, []> { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, - FPR32, FPR32, vecshiftR32, asm, []> { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, - FPR64, FPR64, vecshiftR64, asm, []> { - let Inst{21-16} = imm{5-0}; - } -} - //---------------------------------------------------------------------------- // AdvSIMD vector x indexed element //---------------------------------------------------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5a51c81..5a90da1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1503,6 +1503,13 @@ AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, getElementSizeForOpcode(PredOpcode)) return PredOpcode; + // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since + // WHILEcc performs an implicit PTEST with an all active mask, setting + // the N flag as the PTEST_FIRST would. 
+ if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST && + isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31) + return PredOpcode; + return {}; } @@ -5592,7 +5599,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5607,7 +5614,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::STRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::STR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: @@ -5777,7 +5784,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( if (IsPNR) PNRReg = DestReg; Opc = AArch64::LDR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; } @@ -5792,7 +5799,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( Opc = AArch64::LDRSui; else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { Opc = AArch64::LDR_PPXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalablePredicateVector; } break; case 8: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index a81f5b3..b3c9656 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -23,12 +23,21 @@ using namespace llvm; +static std::optional<uint64_t> +getSVEStackSize(const AArch64FunctionInfo &MFI, + uint64_t (AArch64FunctionInfo::*GetStackSize)() const) { + if (!MFI.hasCalculatedStackSizeSVE()) + return std::nullopt; + return (MFI.*GetStackSize)(); +} + yaml::AArch64FunctionInfo::AArch64FunctionInfo( const llvm::AArch64FunctionInfo &MFI) : HasRedZone(MFI.hasRedZone()), - StackSizeSVE(MFI.hasCalculatedStackSizeSVE() - ? std::optional<uint64_t>(MFI.getStackSizeSVE()) - : std::nullopt), + StackSizeZPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizeZPR)), + StackSizePPR( + getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)), HasStackFrame(MFI.hasStackFrame() ? std::optional<bool>(MFI.hasStackFrame()) : std::nullopt) {} @@ -41,8 +50,9 @@ void AArch64FunctionInfo::initializeBaseYamlFields( const yaml::AArch64FunctionInfo &YamlMFI) { if (YamlMFI.HasRedZone) HasRedZone = YamlMFI.HasRedZone; - if (YamlMFI.StackSizeSVE) - setStackSizeSVE(*YamlMFI.StackSizeSVE); + if (YamlMFI.StackSizeZPR || YamlMFI.StackSizePPR) + setStackSizeSVE(YamlMFI.StackSizeZPR.value_or(0), + YamlMFI.StackSizePPR.value_or(0)); if (YamlMFI.HasStackFrame) setHasStackFrame(*YamlMFI.HasStackFrame); } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 897c7e8..4a79d9c 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -74,13 +74,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// Amount of stack frame size, not including callee-saved registers. uint64_t LocalStackSize = 0; - /// The start and end frame indices for the SVE callee saves. - int MinSVECSFrameIndex = 0; - int MaxSVECSFrameIndex = 0; - /// Amount of stack frame size used for saving callee-saved registers. 
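The AArch64MachineFunctionInfo changes below cache separate ZPR and PPR stack sizes. A simplified model of that state (not the real class), preserving the invariant that the getters are only valid once the sizes have been calculated:

#include <cassert>
#include <cstdint>

class SVEFrameSizes {
  uint64_t StackSizeZPR = 0;
  uint64_t StackSizePPR = 0;
  bool Calculated = false;

public:
  void set(uint64_t ZPR, uint64_t PPR) {
    StackSizeZPR = ZPR;
    StackSizePPR = PPR;
    Calculated = true;
  }
  uint64_t zpr() const { assert(Calculated); return StackSizeZPR; }
  uint64_t ppr() const { assert(Calculated); return StackSizePPR; }
  bool hasSVEStackSize() const { return zpr() > 0 || ppr() > 0; }
};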
unsigned CalleeSavedStackSize = 0; - unsigned SVECalleeSavedStackSize = 0; + unsigned ZPRCalleeSavedStackSize = 0; + unsigned PPRCalleeSavedStackSize = 0; bool HasCalleeSavedStackSize = false; bool HasSVECalleeSavedStackSize = false; @@ -137,9 +134,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SVE stack size (for predicates and data vectors) are maintained here /// rather than in FrameInfo, as the placement and Stack IDs are target /// specific. - uint64_t StackSizeSVE = 0; + uint64_t StackSizeZPR = 0; + uint64_t StackSizePPR = 0; - /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid. bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a @@ -312,16 +310,25 @@ public: TailCallReservedStack = bytes; } - bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } - - void setStackSizeSVE(uint64_t S) { + void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) { + StackSizeZPR = ZPR; + StackSizePPR = PPR; HasCalculatedStackSizeSVE = true; - StackSizeSVE = S; } - uint64_t getStackSizeSVE() const { + uint64_t getStackSizeZPR() const { assert(hasCalculatedStackSizeSVE()); - return StackSizeSVE; + return StackSizeZPR; + } + uint64_t getStackSizePPR() const { + assert(hasCalculatedStackSizeSVE()); + return StackSizePPR; + } + + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + bool hasSVEStackSize() const { + return getStackSizeZPR() > 0 || getStackSizePPR() > 0; } bool hasStackFrame() const { return HasStackFrame; } @@ -414,23 +421,25 @@ public: } // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' - void setSVECalleeSavedStackSize(unsigned Size) { - SVECalleeSavedStackSize = Size; + void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) { + ZPRCalleeSavedStackSize = ZPR; + PPRCalleeSavedStackSize = PPR; HasSVECalleeSavedStackSize = true; } - unsigned getSVECalleeSavedStackSize() const { + unsigned getZPRCalleeSavedStackSize() const { assert(HasSVECalleeSavedStackSize && - "SVECalleeSavedStackSize has not been calculated"); - return SVECalleeSavedStackSize; + "ZPRCalleeSavedStackSize has not been calculated"); + return ZPRCalleeSavedStackSize; } - - void setMinMaxSVECSFrameIndex(int Min, int Max) { - MinSVECSFrameIndex = Min; - MaxSVECSFrameIndex = Max; + unsigned getPPRCalleeSavedStackSize() const { + assert(HasSVECalleeSavedStackSize && + "PPRCalleeSavedStackSize has not been calculated"); + return PPRCalleeSavedStackSize; } - int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; } - int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; } + unsigned getSVECalleeSavedStackSize() const { + return getZPRCalleeSavedStackSize() + getPPRCalleeSavedStackSize(); + } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { @@ -611,7 +620,8 @@ private: namespace yaml { struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { std::optional<bool> HasRedZone; - std::optional<uint64_t> StackSizeSVE; + std::optional<uint64_t> StackSizeZPR; + std::optional<uint64_t> StackSizePPR; std::optional<bool> HasStackFrame; AArch64FunctionInfo() = default; @@ -624,7 +634,8 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo { template <> struct MappingTraits<AArch64FunctionInfo> { static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) { YamlIO.mapOptional("hasRedZone", 
MFI.HasRedZone); - YamlIO.mapOptional("stackSizeSVE", MFI.StackSizeSVE); + YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR); + YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR); YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame); } }; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 09b3643..aad6579 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -48,21 +48,19 @@ bool AArch64PrologueEpilogueCommon::isVGInstruction( return Opc == TargetOpcode::COPY; } -// Convenience function to determine whether I is an SVE callee save. -static bool isSVECalleeSave(MachineBasicBlock::iterator I) { +// Convenience function to determine whether I is part of the ZPR callee saves. +static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { switch (I->getOpcode()) { default: return false; - case AArch64::PTRUE_C_B: case AArch64::LD1B_2Z_IMM: case AArch64::ST1B_2Z_IMM: case AArch64::STR_ZXI: - case AArch64::STR_PXI: case AArch64::LDR_ZXI: - case AArch64::LDR_PXI: - case AArch64::PTRUE_B: case AArch64::CPY_ZPzI_B: case AArch64::CMPNE_PPzZI_B: + case AArch64::PTRUE_C_B: + case AArch64::PTRUE_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); case AArch64::SEH_SavePReg: @@ -71,6 +69,23 @@ static bool isSVECalleeSave(MachineBasicBlock::iterator I) { } } +// Convenience function to determine whether I is part of the PPR callee saves. +static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) { + switch (I->getOpcode()) { + default: + return false; + case AArch64::STR_PXI: + case AArch64::LDR_PXI: + return I->getFlag(MachineInstr::FrameSetup) || + I->getFlag(MachineInstr::FrameDestroy); + } +} + +// Convenience function to determine whether I is part of the SVE callee saves. +static bool isPartOfSVECalleeSaves(MachineBasicBlock::iterator I) { + return isPartOfZPRCalleeSaves(I) || isPartOfPPRCalleeSaves(I); +} + AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon( MachineFunction &MF, MachineBasicBlock &MBB, const AArch64FrameLowering &AFL) @@ -316,7 +331,7 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump( // When there is an SVE area on the stack, always allocate the // callee-saves and spills/locals separately. - if (AFL.getSVEStackSize(MF)) + if (AFI->hasSVEStackSize()) return false; return true; @@ -639,7 +654,7 @@ void AArch64PrologueEmitter::emitPrologue() { // Now allocate space for the GPR callee saves. MachineBasicBlock::iterator MBBI = PrologueBeginI; - while (MBBI != EndI && isSVECalleeSave(MBBI)) + while (MBBI != EndI && isPartOfSVECalleeSaves(MBBI)) ++MBBI; FirstGPRSaveI = convertCalleeSaveRestoreToSPPrePostIncDec( MBBI, DL, -AFI->getCalleeSavedStackSize(), EmitAsyncCFI); @@ -669,7 +684,7 @@ void AArch64PrologueEmitter::emitPrologue() { MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI; while (AfterGPRSavesI != EndI && AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) && - !isSVECalleeSave(AfterGPRSavesI)) { + !isPartOfSVECalleeSaves(AfterGPRSavesI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. 
(!AFL.requiresSaveVG(MF) || !isVGInstruction(AfterGPRSavesI, TLI))) @@ -700,38 +715,66 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - StackOffset SVEStackSize = AFL.getSVEStackSize(MF); - StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI; + StackOffset PPRCalleeSavesSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset ZPRCalleeSavesSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; + StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize; + StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize; + StackOffset CFAOffset = StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); - - // Process the SVE callee-saves to determine what space needs to be - // allocated. MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize - << "\n"); - SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); - SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - // Find callee save instructions in frame. - // Note: With FPAfterSVECalleeSaves the callee saves have already been + + if (!FPAfterSVECalleeSaves) { + MachineBasicBlock::iterator ZPRCalleeSavesBegin = AfterGPRSavesI, + ZPRCalleeSavesEnd = AfterGPRSavesI; + MachineBasicBlock::iterator PPRCalleeSavesBegin = AfterGPRSavesI, + PPRCalleeSavesEnd = AfterGPRSavesI; + + // Process the SVE callee-saves to determine what space needs to be // allocated. - if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI; - assert(isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); - while (isSVECalleeSave(AfterSVESavesI) && + + if (PPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " + << PPRCalleeSavesSize.getScalable() << "\n"); + + PPRCalleeSavesBegin = AfterSVESavesI; + assert(isPartOfPPRCalleeSaves(PPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfPPRCalleeSaves(AfterSVESavesI) && AfterSVESavesI != MBB.getFirstTerminator()) ++AfterSVESavesI; - CalleeSavesEnd = AfterSVESavesI; + PPRCalleeSavesEnd = AfterSVESavesI; + } - StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); - // Allocate space for the callee saves (if any). - allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, - EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); + if (ZPRCalleeSavesSize) { + LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " + << ZPRCalleeSavesSize.getScalable() << "\n"); + ZPRCalleeSavesBegin = AfterSVESavesI; + assert(isPartOfZPRCalleeSaves(ZPRCalleeSavesBegin) && + "Unexpected instruction"); + while (isPartOfZPRCalleeSaves(AfterSVESavesI) && + AfterSVESavesI != MBB.getFirstTerminator()) + ++AfterSVESavesI; + ZPRCalleeSavesEnd = AfterSVESavesI; } + + // Allocate space for the callee saves (if any). + StackOffset LocalsSize = + PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); + MachineBasicBlock::iterator CalleeSavesBegin = + AFI->getPPRCalleeSavedStackSize() ? 
PPRCalleeSavesBegin + : ZPRCalleeSavesBegin; + allocateStackSpace(CalleeSavesBegin, 0, SVECalleeSavesSize, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + + CalleeSavesEnd = AFI->getZPRCalleeSavedStackSize() ? ZPRCalleeSavesEnd + : PPRCalleeSavesEnd; } CFAOffset += SVECalleeSavesSize; @@ -746,6 +789,7 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. + StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; allocateStackSpace(CalleeSavesEnd, RealignmentPadding, SVELocalsSize + StackOffset::getFixed(NumBytes), EmitAsyncCFI && !HasFP, CFAOffset, @@ -796,7 +840,8 @@ void AArch64PrologueEmitter::emitPrologue() { emitDefineCFAWithFP(AfterSVESavesI, FixedObject); } else { StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + AFL.getSVEStackSize(MF) + + StackOffset::getFixed((int64_t)MFI.getStackSize()); CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup); CFIBuilder.insertCFIInst( createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, @@ -1165,7 +1210,7 @@ void AArch64PrologueEmitter::emitCalleeSavedGPRLocations( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); for (const auto &Info : CSI) { unsigned FrameIdx = Info.getFrameIdx(); - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) + if (MFI.isScalableStackID(FrameIdx)) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); @@ -1192,7 +1237,7 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations( } for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector) + if (!MFI.isScalableStackID(Info.getFrameIdx())) continue; // Not all unwinders may know about SVE registers, so assume the lowest @@ -1322,7 +1367,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { while (FirstGPRRestoreI != Begin) { --FirstGPRRestoreI; if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { + (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) { ++FirstGPRRestoreI; break; } else if (CombineSPBump) @@ -1346,7 +1391,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); - const StackOffset &SVEStackSize = AFL.getSVEStackSize(MF); + StackOffset SVEStackSize = AFL.getSVEStackSize(MF); // If there is a single SP update, insert it before the ret and we're done. 
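A rough sketch (plain integers standing in for scalable offsets, not the patch itself) of how the prologue sizes above decompose the SVE area into a callee-save chunk and a locals chunk:

#include <cstdint>

struct PrologueSVESizes {
  uint64_t PPRCalleeSaves, ZPRCalleeSaves; // callee-save portions
  uint64_t PPRTotal, ZPRTotal;             // whole PPR/ZPR stack areas
};

struct SVEAllocation {
  uint64_t CalleeSaves; // allocated next to the GPR callee saves
  uint64_t Locals;      // allocated afterwards, together with NumBytes
};

SVEAllocation splitSVEArea(const PrologueSVESizes &S) {
  SVEAllocation A;
  A.CalleeSaves = S.PPRCalleeSaves + S.ZPRCalleeSaves;
  A.Locals = (S.PPRTotal - S.PPRCalleeSaves) + (S.ZPRTotal - S.ZPRCalleeSaves);
  return A;
}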
if (CombineSPBump) { @@ -1372,20 +1417,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, RestoreEnd = FirstGPRRestoreI; - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); + int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); + int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + + if (SVECalleeSavedSize) { if (FPAfterSVECalleeSaves) RestoreEnd = MBB.getFirstTerminator(); RestoreBegin = std::prev(RestoreEnd); while (RestoreBegin != MBB.begin() && - isSVECalleeSave(std::prev(RestoreBegin))) + isPartOfSVECalleeSaves(std::prev(RestoreBegin))) --RestoreBegin; - assert(isSVECalleeSave(RestoreBegin) && - isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); + assert(isPartOfSVECalleeSaves(RestoreBegin) && + isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && + "Unexpected instruction"); StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); + StackOffset::getScalable(SVECalleeSavedSize); DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; DeallocateAfter = CalleeSavedSizeAsOffset; } @@ -1624,8 +1674,7 @@ void AArch64EpilogueEmitter::emitCalleeSavedRestores( CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy); for (const auto &Info : CSI) { - if (SVE != - (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + if (SVE != MFI.isScalableStackID(Info.getFrameIdx())) continue; MCRegister Reg = Info.getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 2b0c8ad..3f43b70 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -643,7 +643,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (ST.hasSVE() || ST.isStreaming()) { // Frames that have variable sized objects and scalable SVE objects, // should always use a basepointer. - if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + if (!AFI->hasCalculatedStackSizeSVE() || AFI->hasSVEStackSize()) return true; } @@ -783,7 +783,7 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || AFI->hasCalculatedStackSizeSVE()) && "Expected SVE area to be calculated by this point"); - return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->getStackSizeSVE() && + return TFI.hasFP(MF) && !hasStackRealignment(MF) && !AFI->hasSVEStackSize() && !AFI->hasStackHazardSlotIndex(); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7ee54c5..c197550e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -438,7 +438,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, - G_FSINH, G_FTANH}) + G_FSINH, G_FTANH, G_FMODF}) // We need a call for these, so we always need to scalarize. .scalarize(0) // Regardless of FP16 support, widen 16-bit elements to 32-bits. 
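The AArch64 legalizer change above adds G_FMODF to the set of ops that are scalarized and lowered like the other libm-style calls. For reference, a small standalone example of the modf semantics it ultimately maps to (standard C++ only, nothing AArch64-specific):

#include <cmath>
#include <cstdio>

int main() {
  double Integral = 0.0;
  double Fractional = std::modf(3.75, &Integral);
  // Prints: integral = 3, fractional = 0.75
  std::printf("integral = %g, fractional = %g\n", Integral, Fractional);
  return 0;
}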
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b..280fbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). 
addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 7c5d4fc..e4b3528 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::SGPRSpill: return true; case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::WasmLocal: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1653008..f7265c5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. -static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 45d194e..939841a 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2804,6 +2804,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vL32b_nt_cur_npred_ai: case Hexagon::V6_vL32b_nt_tmp_pred_ai: case Hexagon::V6_vL32b_nt_tmp_npred_ai: + case Hexagon::V6_vS32Ub_npred_ai: case Hexagon::V6_vgathermh_pseudo: case Hexagon::V6_vgathermw_pseudo: case Hexagon::V6_vgathermhw_pseudo: diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 06ce917..7d4535a 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -2395,6 +2395,7 @@ bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: case TargetStackID::WasmLocal: + case TargetStackID::ScalablePredicateVector: return false; } llvm_unreachable("Invalid TargetStackID::Value"); diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 2e5f30f..19d5aff 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -109,8 +109,9 @@ def : LdPat<extloadi8, 
LBU, i16>; // Prefer unsigned due to no c.lb in Zcb. def : StPat<truncstorei8, SB, GPR, i16>; let Predicates = [HasAtomicLdSt] in { - def : LdPat<atomic_load_aext_8, LB, i16>; - def : LdPat<atomic_load_nonext_16, LH, i16>; + // Prefer unsigned due to no c.lb in Zcb. + def : LdPat<atomic_load_aext_8, LBU, i16>; + def : LdPat<atomic_load_nonext_16, LH, i16>; def : StPat<atomic_store_8, SB, GPR, i16>; def : StPat<atomic_store_16, SH, GPR, i16>; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 90e1c47a..6a6ead2 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -70,6 +70,10 @@ static unsigned getSEWOpNum(const MachineInstr &MI) { return RISCVII::getSEWOpNum(MI.getDesc()); } +static unsigned getVecPolicyOpNum(const MachineInstr &MI) { + return RISCVII::getVecPolicyOpNum(MI.getDesc()); +} + /// Get the EEW for a load or store instruction. Return std::nullopt if MI is /// not a load or store which ignores SEW. static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) { @@ -986,7 +990,7 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { // If there is a policy operand, use it. if (RISCVII::hasVecPolicyOp(TSFlags)) { - const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); + const MachineOperand &Op = MI.getOperand(getVecPolicyOpNum(MI)); uint64_t Policy = Op.getImm(); assert(Policy <= (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC) && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 59f5aeb..99992d1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -165,10 +165,11 @@ class seq_cst_store<PatFrag base> // any ordering. This is necessary because AtomicExpandPass has added fences to // atomic load/stores and changed them to unordered ones. let Predicates = [HasAtomicLdSt] in { - def : LdPat<relaxed_load<atomic_load_asext_8>, LB>; + // Use unsigned for aext due to no c.lb in Zcb. 
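The RISC-V hunks above prefer LBU for anyextending 8-bit atomic loads because Zcb has a compressed c.lbu but no c.lb. A small standalone illustration of why either extension is legal for an anyext load (only the low 8 bits are guaranteed):

#include <cassert>
#include <cstdint>

int64_t signExtend8(uint8_t Byte) { return static_cast<int8_t>(Byte); } // LB
int64_t zeroExtend8(uint8_t Byte) { return Byte; }                      // LBU

int main() {
  uint8_t Byte = 0xF0;
  // The two differ in the upper bits...
  assert(signExtend8(Byte) == -16 && zeroExtend8(Byte) == 240);
  // ...but agree on the low byte, which is all an anyext load needs.
  assert((signExtend8(Byte) & 0xFF) == (zeroExtend8(Byte) & 0xFF));
  return 0;
}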
+ def : LdPat<relaxed_load<atomic_load_sext_8>, LB>; + def : LdPat<relaxed_load<atomic_load_azext_8>, LBU>; def : LdPat<relaxed_load<atomic_load_asext_16>, LH>; - def : LdPat<relaxed_load<atomic_load_zext_8>, LBU>; - def : LdPat<relaxed_load<atomic_load_zext_16>, LHU>; + def : LdPat<relaxed_load<atomic_load_zext_16>, LHU>; def : StPat<relaxed_store<atomic_store_8>, SB, GPR, XLenVT>; def : StPat<relaxed_store<atomic_store_16>, SH, GPR, XLenVT>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 273edf3..0afec42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -752,6 +752,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::exp, GL::Exp); case TargetOpcode::G_FEXP2: return selectExtInst(ResVReg, ResType, I, CL::exp2, GL::Exp2); + case TargetOpcode::G_FMODF: + return selectModf(ResVReg, ResType, I); case TargetOpcode::G_FLOG: return selectExtInst(ResVReg, ResType, I, CL::log, GL::Log); @@ -3453,9 +3455,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); } - case Intrinsic::modf: { - return selectModf(ResVReg, ResType, I); - } default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); @@ -4268,6 +4267,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, PtrTyReg, LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function), GR.getPointerSize())); + // Assign SPIR-V type of the pointer type of the alloca variable to the // new register. GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF()); @@ -4280,10 +4280,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, .addUse(GR.getSPIRVTypeID(PtrType)) .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function)); Register Variable = AllocaMIB->getOperand(0).getReg(); - // Modf must have 4 operands, the first two are the 2 parts of the result, - // the third is the operand, and the last one is the floating point value. - assert(I.getNumOperands() == 4 && - "Expected 4 operands for modf instruction"); + MachineBasicBlock &BB = *I.getParent(); // Create the OpenCLLIB::modf instruction. auto MIB = @@ -4293,8 +4290,8 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std)) .addImm(CL::modf) .setMIFlags(I.getFlags()) - .add(I.getOperand(3)) // Floating point value. - .addUse(Variable); // Pointer to integral part. + .add(I.getOperand(I.getNumExplicitDefs())) // Floating point value. + .addUse(Variable); // Pointer to integral part. // Assign the integral part stored in the ptr to the second element of the // result. Register IntegralPartReg = I.getOperand(1).getReg(); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index aea3397..205895e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -39,6 +39,7 @@ private: void collectBindingInfo(Module &M); uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); void replaceImplicitBindingCalls(Module &M); + void verifyUniqueOrderIdPerResource(SmallVectorImpl<CallInst *> &Calls); // A map from descriptor set to a bit vector of used binding numbers. 
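The SPIRVLegalizeImplicitBinding change (shown just below) verifies that implicit-binding calls with the same order ID agree on their descriptor set. A simplified standalone model of that check, assuming calls are already sorted by order ID so duplicates are adjacent:

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

struct BindingCall {
  uint32_t OrderId; // arg 0 of the implicit-binding intrinsic
  uint32_t DescSet; // arg 1 of the implicit-binding intrinsic
};

void verifyUniqueOrderIdPerResource(const std::vector<BindingCall> &Calls) {
  for (std::size_t I = 1; I < Calls.size(); ++I)
    if (Calls[I - 1].OrderId == Calls[I].OrderId &&
        Calls[I - 1].DescSet != Calls[I].DescSet)
      throw std::runtime_error("implicit binding calls with the same order ID "
                               "must have the same descriptor set");
}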
std::vector<BitVector> UsedBindings; @@ -94,6 +95,33 @@ void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { }); } +void SPIRVLegalizeImplicitBinding::verifyUniqueOrderIdPerResource( + SmallVectorImpl<CallInst *> &Calls) { + // Check that the order Id is unique per resource. + for (uint32_t i = 1; i < Calls.size(); ++i) { + const uint32_t OrderIdArgIdx = 0; + const uint32_t DescSetArgIdx = 1; + const uint32_t OrderA = + cast<ConstantInt>(Calls[i - 1]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + const uint32_t OrderB = + cast<ConstantInt>(Calls[i]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + if (OrderA == OrderB) { + const uint32_t DescSetA = + cast<ConstantInt>(Calls[i - 1]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + const uint32_t DescSetB = + cast<ConstantInt>(Calls[i]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + if (DescSetA != DescSetB) { + report_fatal_error("Implicit binding calls with the same order ID must " + "have the same descriptor set"); + } + } + } +} + uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( uint32_t DescSet) { if (UsedBindings.size() <= DescSet) { @@ -112,11 +140,23 @@ uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( } void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { + uint32_t lastOrderId = -1; + uint32_t lastBindingNumber = -1; + for (CallInst *OldCI : ImplicitBindingCalls) { IRBuilder<> Builder(OldCI); + const uint32_t OrderId = + cast<ConstantInt>(OldCI->getArgOperand(0))->getZExtValue(); const uint32_t DescSet = cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); - const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet); + + // Reuse an existing binding for this order ID, if one was already assigned. + // Otherwise, assign a new binding. + const uint32_t NewBinding = (lastOrderId == OrderId) + ? 
lastBindingNumber + : getAndReserveFirstUnusedBinding(DescSet); + lastOrderId = OrderId; + lastBindingNumber = NewBinding; SmallVector<Value *, 8> Args; Args.push_back(Builder.getInt32(DescSet)); @@ -142,6 +182,7 @@ bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) { if (ImplicitBindingCalls.empty()) { return false; } + verifyUniqueOrderIdPerResource(ImplicitBindingCalls); replaceImplicitBindingCalls(M); return true; diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index db85e33..53074ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -300,6 +300,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_STRICT_FSQRT, G_FPOW, G_FEXP, + G_FMODF, G_FEXP2, G_FLOG, G_FLOG2, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index ad7e503..cf85691 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -cl::opt<bool> NoKernelInfoEndLTO( +cl::opt<bool> llvm::NoKernelInfoEndLTO( "no-kernel-info-end-lto", cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), cl::init(false), cl::Hidden); diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 2cfdc75..a068138 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -957,6 +957,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { + if (VT.isVector()) + return VT.changeVectorElementType(MVT::i1); return MVT::i32; } diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp index 2de89947..ea93a57 100644 --- a/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -136,6 +136,12 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { .addReg(ZeroReg) .addReg(Reg0) .addImm(X86::sub_8bit); + + // Redirect the debug-instr-number to the setcc. + if (unsigned InstrNum = ZExt->peekDebugInstrNum()) + MF.makeDebugValueSubstitution({InstrNum, 0}, + {MI.getDebugInstrNum(), 0}); + ToErase.push_back(ZExt); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34854e4..cda5568 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52388,16 +52388,41 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = - DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + // If EFLAGS is from a CMP that compares the same operands as the earlier + // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with + // SBB/ADC without creating a flipped SUB. + if (EFLAGS.getOpcode() == X86ISD::CMP && + EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) { return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), NewEFLAGS); + DAG.getConstant(0, DL, VT), EFLAGS); + } + + if (EFLAGS.getOpcode() == X86ISD::SUB && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + // Only create NewSub if we know one of the folds will succeed to avoid + // introducing a temporary node that may persist and affect one-use checks + // below. + if (EFLAGS.getNode()->hasOneUse()) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), NewEFLAGS); + } + + if (IsSub && X == EFLAGS.getValue(0)) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32), + EFLAGS.getOperand(0), EFLAGS.getOperand(1), + NewEFLAGS); + } } } diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 83aa7de..28ee444 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -72,6 +72,7 @@ STATISTIC(NumImportedModules, "Number of modules imported from"); STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); STATISTIC(NumLiveSymbols, "Number of live symbols in index"); +namespace llvm { cl::opt<bool> ForceImportAll("force-import-all", cl::init(false), cl::Hidden, cl::desc("Import functions with noinline attribute")); @@ -185,9 +186,8 @@ static cl::opt<bool> CtxprofMoveRootsToOwnModule( extern cl::list<GlobalValue::GUID> MoveSymbolGUID; -namespace llvm { extern cl::opt<bool> EnableMemProfContextDisambiguation; -} +} // end namespace llvm // Load lazily a module from \p FileName in \p Context. 
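For the X86ISelLowering change earlier in this hunk: on x86, CMP X, Y sets the carry flag to the unsigned borrow (X u< Y), and SBB X, 0 subtracts that borrow from X, so the CMP's flags can feed the SBB/ADC directly instead of rebuilding a flipped SUB. A tiny standalone check of that identity (plain C++, no DAG types):

#include <cassert>
#include <cstdint>

uint64_t subWithBorrowFromCmp(uint64_t X, uint64_t Y) {
  unsigned Borrow = X < Y ? 1 : 0; // carry flag after CMP X, Y
  return X - Borrow;               // SBB X, 0
}

int main() {
  assert(subWithBorrowFromCmp(5, 7) == 4); // borrow taken: 5 - 1
  assert(subWithBorrowFromCmp(7, 5) == 7); // no borrow:    7 - 0
  return 0;
}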
static std::unique_ptr<Module> loadFile(const std::string &FileName, diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 4f53738..150a2dc 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -28,10 +28,13 @@ using namespace llvm; STATISTIC(NumSpecsCreated, "Number of specializations created"); +namespace llvm { + static cl::opt<bool> ForceSpecialization( - "force-specialization", cl::init(false), cl::Hidden, cl::desc( - "Force function specialization for every call site with a constant " - "argument")); + "force-specialization", cl::init(false), cl::Hidden, + cl::desc( + "Force function specialization for every call site with a constant " + "argument")); static cl::opt<unsigned> MaxClones( "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc( @@ -91,6 +94,8 @@ static cl::opt<bool> SpecializeLiteralConstant( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const { unsigned I = 0; diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 15f4d76..c4f1b68 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -214,11 +214,12 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion( "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc( "Require target function definition when promoting indirect calls")); -} // namespace llvm extern cl::opt<bool> MemProfReportHintedSizes; extern cl::opt<unsigned> MinClonedColdBytePercent; +} // namespace llvm + namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. /// diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 99b8b88..e39e311 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -116,6 +116,8 @@ STATISTIC( NumCSInlinedHitGrowthLimit, "Number of functions with FDO inline stopped due to growth size limit"); +namespace llvm { + // Command line option to specify the file to read samples from. This is // mainly used for debugging. 
static cl::opt<std::string> SampleProfileFile( @@ -198,7 +200,6 @@ static cl::opt<bool> DisableSampleLoaderInlining( "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee).")); -namespace llvm { cl::opt<bool> SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights.")); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 093a39e..70b8614 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -23,6 +23,8 @@ using namespace sampleprof; #define DEBUG_TYPE "sample-profile-matcher" +namespace llvm { + static cl::opt<unsigned> FuncProfileSimilarityThreshold( "func-profile-similarity-threshold", cl::Hidden, cl::init(80), cl::desc("Consider a profile matches a function if the similarity of their " @@ -55,6 +57,8 @@ static cl::opt<unsigned> SalvageStaleProfileMaxCallsites( cl::desc("The maximum number of callsites in a function, above which stale " "profile matching will be skipped.")); +} // end namespace llvm + void SampleProfileMatcher::findIRAnchors(const Function &F, AnchorMap &IRAnchors) const { // For inlined code, recover the original callsite and callee by finding the diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 09bffa7..ac41fdd 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -120,6 +120,8 @@ STATISTIC(NumVirtConstProp1Bit, "Number of 1 bit virtual constant propagations"); STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); +namespace llvm { + static cl::opt<PassSummaryAction> ClSummaryAction( "wholeprogramdevirt-summary-action", cl::desc("What to do with the summary when running this pass"), @@ -175,6 +177,8 @@ static cl::list<std::string> extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + /// With Clang, a pure virtual class's deleting destructor is emitted as a /// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the /// context of whole program devirtualization, the deleting destructor of a pure diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index b6b3a95..87000a1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2934,32 +2934,6 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, return nullptr; } -static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy &Builder) { - FreezeInst *FI = dyn_cast<FreezeInst>(Sel.getCondition()); - if (!FI) - return nullptr; - - Value *Cond = FI->getOperand(0); - Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); - - // select (freeze(x == y)), x, y --> y - // select (freeze(x != y)), x, y --> x - // The freeze should be only used by this select. Otherwise, remaining uses of - // the freeze can observe a contradictory value. - // c = freeze(x == y) ; Let's assume that y = poison & x = 42; c is 0 or 1 - // a = select c, x, y ; - // f(a, c) ; f(poison, 1) cannot happen, but if a is folded - // ; to y, this can happen. 
- CmpPredicate Pred; - if (FI->hasOneUse() && - match(Cond, m_c_ICmp(Pred, m_Specific(TrueVal), m_Specific(FalseVal))) && - (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) { - return Pred == ICmpInst::ICMP_EQ ? FalseVal : TrueVal; - } - - return nullptr; -} - /// Given that \p CondVal is known to be \p CondIsTrue, try to simplify \p SI. static Value *simplifyNestedSelectsUsingImpliedCond(SelectInst &SI, Value *CondVal, @@ -4446,9 +4420,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Instruction *PN = foldSelectToPhi(SI, DT, Builder)) return replaceInstUsesWith(SI, PN); - if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder)) - return replaceInstUsesWith(SI, Fr); - if (Value *V = foldRoundUpIntegerWithPow2Alignment(SI, Builder)) return replaceInstUsesWith(SI, V); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 5d2d79e..917004c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -132,9 +132,11 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); -static cl::opt<bool> -EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), - cl::init(true)); +namespace llvm { + +static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", + cl::desc("Enable code sinking"), + cl::init(true)); static cl::opt<unsigned> MaxSinkNumUsers( "instcombine-max-sink-users", cl::init(32), @@ -156,6 +158,8 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes; static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); +} // end namespace llvm + std::optional<Instruction *> InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { // Handle target specific intrinsics diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0249f21..cf87e35 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -55,11 +55,11 @@ using namespace llvm; STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions."); STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites."); +namespace llvm { extern cl::opt<unsigned> MaxNumVTableAnnotations; -namespace llvm { extern cl::opt<bool> EnableVTableProfileUse; -} +} // namespace llvm // Command line option to disable indirect-call promotion with the default as // false. This is for debug purpose. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index cf076b9a..eff6f0c 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1923,20 +1923,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Shadow = ParamTLS+ArgOffset. 
Value *getShadowPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg"); + return IRB.CreatePtrAdd(MS.ParamTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg"); } /// Compute the origin address for a given function argument. Value *getOriginPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { if (!MS.TrackOrigins) return nullptr; - Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg_o"); + return IRB.CreatePtrAdd(MS.ParamOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_o"); } /// Compute the shadow address for a retval. @@ -7219,9 +7216,8 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the shadow address for a given va_arg. Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_s"); + return IRB.CreatePtrAdd( + MS.VAArgTLS, ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg_va_s"); } /// Compute the shadow address for a given va_arg. @@ -7235,12 +7231,12 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the origin address for a given va_arg. Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy); // getOriginPtrForVAArgument() is always called after // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never // overflow. - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_o"); + return IRB.CreatePtrAdd(MS.VAArgOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_va_o"); } void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase, @@ -7467,10 +7463,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 16)), - MS.PtrTy); + Value *RegSaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 16)); Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(16); @@ -7482,10 +7476,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { if (MS.TrackOrigins) IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy, Alignment, AMD64FpEndOffset); - Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 8)), - MS.PtrTy); + Value *OverflowArgAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 8)); Value *OverflowArgAreaPtr = IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; @@ -7615,19 +7607,15 @@ struct VarArgAArch64Helper : public VarArgHelperBase { // Retrieve a va_list field of 'void*' size. 
Value *getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr); } // Retrieve a va_list field of 'int' size. Value *getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr); return IRB.CreateSExt(SaveArea32, MS.IntptrTy); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index d9e850e..120c4f6 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -222,7 +222,6 @@ cl::opt<bool> NoPGOWarnMismatchComdatWeak( cl::desc("The option is used to turn on/off " "warnings about hash mismatch for comdat " "or weak functions.")); -} // namespace llvm // Command line option to enable/disable select instruction instrumentation. static cl::opt<bool> @@ -347,7 +346,6 @@ cl::list<std::string> CtxPGOSkipCallsiteInstrument( extern cl::opt<unsigned> MaxNumVTableAnnotations; -namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts extern cl::opt<PGOViewCountsType> PGOViewCounts; diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 343bec3..a5f417a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -54,6 +54,8 @@ using namespace llvm; STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); +namespace llvm { + // The minimum call count to optimize memory intrinsic calls. 
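The MemorySanitizer hunks above all replace the same three-instruction addressing idiom with IRBuilder::CreatePtrAdd. A minimal standalone sketch of the two forms follows; the helper names and the injected TLSBase parameter are hypothetical, standing in for the pass's real TLS globals.

#include "llvm/IR/IRBuilder.h"

// Both helpers compute a pointer ArgOffset bytes past TLSBase. The first emits
//   %0 = ptrtoint ptr %TLSBase to i64
//   %1 = add i64 %0, ArgOffset
//   %2 = inttoptr i64 %1 to ptr
// while the second is a single  getelementptr i8, ptr %TLSBase, i64 ArgOffset.
llvm::Value *viaIntegers(llvm::IRBuilder<> &IRB, llvm::Value *TLSBase,
                         llvm::Type *IntptrTy, int ArgOffset) {
  llvm::Value *Base = IRB.CreatePtrToInt(TLSBase, IntptrTy);
  Base = IRB.CreateAdd(Base, llvm::ConstantInt::get(IntptrTy, ArgOffset));
  return IRB.CreateIntToPtr(Base, IRB.getPtrTy());
}

llvm::Value *viaPtrAdd(llvm::IRBuilder<> &IRB, llvm::Value *TLSBase,
                       llvm::Type *IntptrTy, int ArgOffset) {
  return IRB.CreatePtrAdd(TLSBase, llvm::ConstantInt::get(IntptrTy, ArgOffset));
}

With opaque pointers the two forms address the same byte; the ptradd form simply stays in pointer arithmetic, which is presumably the motivation here: fewer instructions and no round-trip through integers for later pointer-based analyses.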
static cl::opt<unsigned> MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), @@ -93,6 +95,8 @@ static cl::opt<unsigned> MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value")); +} // end namespace llvm + namespace { static const char *getMIName(const MemIntrinsic *MI) { diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc index a3d4e53..0534fdd 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -21,7 +21,9 @@ using namespace llvm; using CandidateInfo = ValueProfileCollector::CandidateInfo; +namespace llvm { extern cl::opt<bool> MemOPOptMemcmpBcmp; +} // end namespace llvm ///--------------------------- MemIntrinsicPlugin ------------------------------ class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> { diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 2025fbb..3c14036e 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -26,6 +26,8 @@ using namespace llvm; +namespace llvm { + static cl::opt<unsigned> JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden, cl::desc("Only split jump tables with size less or " @@ -43,6 +45,8 @@ static cl::opt<unsigned> FunctionSizeThreshold( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + #define DEBUG_TYPE "jump-table-to-switch" namespace { @@ -201,14 +205,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F); DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); bool Changed = false; - InstrProfSymtab Symtab; - if (auto E = Symtab.create(*F.getParent())) - F.getContext().emitError( - "Could not create indirect call table, likely corrupted IR" + - toString(std::move(E))); - DenseMap<const Function *, GlobalValue::GUID> FToGuid; - for (const auto &[G, FPtr] : Symtab.getIDToNameMap()) - FToGuid.insert({FPtr, G}); + auto FuncToGuid = [&](const Function &Fct) { + if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) + return AssignGUIDPass::getGUID(Fct); + + return Function::getGUIDAssumingExternalLinkage(getIRPGOFuncName(F, InLTO)); + }; for (BasicBlock &BB : make_early_inc_range(F)) { BasicBlock *CurrentBB = &BB; @@ -230,12 +232,8 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F, std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy); if (!JumpTable) continue; - SplittedOutTail = expandToSwitch( - Call, *JumpTable, DTU, ORE, [&](const Function &Fct) { - if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName)) - return AssignGUIDPass::getGUID(Fct); - return FToGuid.lookup_or(&Fct, 0U); - }); + SplittedOutTail = + expandToSwitch(Call, *JumpTable, DTU, ORE, FuncToGuid); Changed = true; break; } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bab1f2a..9655173 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -116,6 +116,8 @@ STATISTIC(NumIntAssociationsHoisted, STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions " "reassociated and hoisted out of the loop"); +namespace llvm { + /// Memory promotion is enabled by default. 
static cl::opt<bool> DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false), @@ -154,7 +156,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit( // which may not be precise, since optimizeUses is capped. The result is // correct, but we may not get as "far up" as possible to get which access is // clobbering the one queried. -cl::opt<unsigned> llvm::SetLicmMssaOptCap( +cl::opt<unsigned> SetLicmMssaOptCap( "licm-mssa-optimization-cap", cl::init(100), cl::Hidden, cl::desc("Enable imprecision in LICM in pathological cases, in exchange " "for faster compile. Caps the MemorySSA clobbering calls.")); @@ -162,7 +164,7 @@ cl::opt<unsigned> llvm::SetLicmMssaOptCap( // Experimentally, memory promotion carries less importance than sinking and // hoisting. Limit when we do promotion when using MemorySSA, in order to save // compile time. -cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( +cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap( "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden, cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no " "effect. When MSSA in LICM is enabled, then this is the maximum " @@ -171,6 +173,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 1a9e16b..d31154f 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -17,6 +17,8 @@ using namespace llvm; +namespace llvm { + /// Uses the "source_filename" instead of a Module hash ID for the suffix of /// promoted locals during LTO. NOTE: This requires that the source filename /// has a unique name / path to avoid name collisions. 
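Most of the mechanical hunks in this patch, including the FunctionImportUtils.cpp one here, wrap cl::opt definitions and extern declarations in namespace llvm. A minimal sketch of the linkage rule behind that pattern, using a hypothetical option name; the static options moved alongside them appear to be wrapped mainly for consistency.

#include "llvm/Support/CommandLine.h"

// Defining TU: the definition must live in namespace llvm (or be spelled
// llvm::ExampleSharedFlag) for the extern declaration below to bind to it.
namespace llvm {
cl::opt<bool> ExampleSharedFlag("example-shared-flag", cl::init(false),
                                cl::Hidden, cl::desc("Illustrative only"));
} // end namespace llvm

// Consuming TU: redeclares the same entity inside namespace llvm. Had the
// definition above been written at global scope, this declaration would name
// a different variable (llvm::ExampleSharedFlag vs. ::ExampleSharedFlag) and
// the consumer would fail to link.
namespace llvm {
extern cl::opt<bool> ExampleSharedFlag;
} // end namespace llvm

bool exampleUse() { return llvm::ExampleSharedFlag; }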
@@ -35,6 +37,8 @@ cl::list<GlobalValue::GUID> MoveSymbolGUID( "used with the name of contextual profiling roots."), cl::Hidden); +} // end namespace llvm + FunctionImportGlobalProcessing::FunctionImportGlobalProcessing( Module &M, const ModuleSummaryIndex &Index, SetVector<GlobalValue *> *GlobalsToImport, bool ClearDSOLocalOnDeclarations) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 4d1f768..8bba634 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -95,7 +95,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" -cl::opt<bool> llvm::RequireAndPreserveDomTree( +namespace llvm { + +cl::opt<bool> RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::desc( @@ -205,6 +207,8 @@ static cl::opt<unsigned> MaxJumpThreadingLiveBlocks( extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5d6c81..7fa787b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7954,6 +7954,13 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { + auto *CI = dyn_cast<ConstantInt>(OpI); + if (I > 0 && CI && + canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) { + ExtOpTypes[I] = ExtOpTypes[0]; + ExtKinds[I] = ExtKinds[0]; + continue; + } Value *ExtOp; if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 02eb637..07b191a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1753,6 +1753,16 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif +bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind) { + APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits()); + unsigned WideSize = CI->getType()->getScalarSizeInBits(); + APInt ExtendedVal = ExtKind == TTI::PR_SignExtend + ? TruncatedVal.sext(WideSize) + : TruncatedVal.zext(WideSize); + return ExtendedVal == CI->getValue(); +} + TargetTransformInfo::OperandValueInfo VPCostContext::getOperandInfo(VPValue *V) const { if (!V->isLiveIn()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774..fc1a09e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -468,6 +468,10 @@ public: }; #endif +/// Check if a constant \p CI can be safely treated as having been extended +/// from a narrower type with the given extension kind. 
+bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind); } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 46909a5..67b9244 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -340,6 +340,14 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, : Widen->getOperand(1)); ExtAType = GetExtendKind(ExtAR); ExtBType = GetExtendKind(ExtBR); + + if (!ExtBR && Widen->getOperand(1)->isLiveIn()) { + auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue()); + if (canConstantBeExtended(CI, InputTypeA, ExtAType)) { + InputTypeB = InputTypeA; + ExtBType = ExtAType; + } + } }; if (isa<VPWidenCastRecipe>(OpR)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a73b083..acdb379 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,7 +40,7 @@ using namespace llvm; using namespace VPlanPatternMatch; -cl::opt<bool> EnableWideActiveLaneMask( +static cl::opt<bool> EnableWideActiveLaneMask( "enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions")); |
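The new canConstantBeExtended helper above (and its callers in getScaledReductions and VPPartialReductionRecipe::computeCost) is a round-trip check: truncate the constant to the narrow type, re-extend it with the requested kind, and require the original value back. A standalone sketch with illustrative values, not a test from the patch:

#include "llvm/ADT/APInt.h"
#include <cassert>

// Mirrors the helper's logic for a 32-bit constant and an 8-bit narrow type.
static bool fitsAsExtended(const llvm::APInt &V, unsigned NarrowBits,
                           bool SignExtend) {
  llvm::APInt Trunc = V.trunc(NarrowBits);
  llvm::APInt Ext =
      SignExtend ? Trunc.sext(V.getBitWidth()) : Trunc.zext(V.getBitWidth());
  return Ext == V;
}

int main() {
  llvm::APInt C200(32, 200);
  llvm::APInt CNeg56(32, -56, /*isSigned=*/true);
  assert(fitsAsExtended(C200, 8, /*SignExtend=*/false));  // 200 zext-able from i8
  assert(!fitsAsExtended(C200, 8, /*SignExtend=*/true));  // 0xC8 sexts to -56, not 200
  assert(fitsAsExtended(CNeg56, 8, /*SignExtend=*/true)); // -56 sext-able from i8
  return 0;
}

So a constant like 127 qualifies for either extension kind, while 200 only qualifies for zero-extend; that is what lets the scaled-reduction path keep a constant second operand instead of bailing out when there is no explicit ext instruction to match.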