//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds need to be tuned here because the sizes here are
// based on machine code, not LLVM IR.
namespace llvm {
extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;
extern cl::opt<bool> SortProfiledSCC;

cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set the default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}

std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  std::vector<StringRef> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order, which in turn helps the stability of the generated
  // profile from run to run.
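  // Note: scc_iterator enumerates SCCs in reverse topological order of the
  // SCC DAG (callees before callers), so reversing the collected names below
  // yields a top-down order. For example, for a call chain main -> foo -> bar,
  // the final order is main, foo, bar, i.e. callers are processed before
  // their callees.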
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct the top-down order
  // by building up SCCs and reversing the SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as
  // the context profile is concerned, only those frames that are children of
  // the current one in the trie are relevant. So we walk the trie instead of
  // the call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in the caller's context profile to retrieve the call
    // site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getName());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }

    // TODO: call site and callee entry count should be mostly consistent, add
    // a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust threshold based on normalized hotness, i.e., a value in
    // [0,1]. Use the 10% cutoff instead of the max count as the normalization
    // upper bound for stability.
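    // Illustrative example (assumed values): if the cold-count threshold is C
    // and the count at the 10% cutoff is U, a callsite count halfway between
    // C and U yields a NormalizedHotness of roughly 0.5; counts at or above U
    // are capped at 1.0 below.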
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise happen when SampleColdCallSiteThreshold is 0, i.e., when we do
    // not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
    // Bump up the threshold to favor previous compiler inline decisions. The
    // compiler has more insight and knowledge about functions based on their
    // IR and attributes and should be able to make a more reasonable inline
    // decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}

void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined since the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << "  Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << "    "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate global top-down inline
  // decisions and merge profiles accordingly. This helps with profile merging
  // for ThinLTO; otherwise we won't be able to merge profiles back into base
  // profiles across module/thin-backend boundaries. It also helps compress
  // context profiles to control profile size, as we now only need context
  // profiles for functions that are going to be inlined.
  for (StringRef FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined are merged into their base
  // profiles, so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}