author    | Jacob Hegna <jacobhegna@google.com> | 2021-06-10 02:16:04 +0000
committer | Jacob Hegna <jacobhegna@gmail.com>  | 2021-07-02 16:57:16 +0000
commit    | 99f00635d7acf1cbcdba35e7621f3a211aa3f237 (patch)
tree      | a0620b339f83281b9ac6b30d1bbd11b78723223c /llvm/lib/Analysis/InlineCost.cpp
parent    | dba74c68178bfaa54e6270d4790b78ef5b6e37c2 (diff)
Unpack the CostEstimate feature in ML inlining models.
This change yields an additional 2% size reduction on an internal search
binary and an additional 0.5% size reduction on Fuchsia.
Differential Revision: https://reviews.llvm.org/D104751
Diffstat (limited to 'llvm/lib/Analysis/InlineCost.cpp')
-rw-r--r-- | llvm/lib/Analysis/InlineCost.cpp | 254
1 file changed, 238 insertions, 16 deletions
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 92b0fbd..4e03629 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -439,6 +439,25 @@ public:
   void dump();
 };
 
+// Considering forming a binary search, we should find the number of nodes
+// which is same as the number of comparisons when lowered. For a given
+// number of clusters, n, we can define a recursive function, f(n), to find
+// the number of nodes in the tree. The recursion is :
+// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
+// and f(n) = n, when n <= 3.
+// This will lead a binary tree where the leaf should be either f(2) or f(3)
+// when n > 3. So, the number of comparisons from leaves should be n, while
+// the number of non-leaf should be :
+// 2^(log2(n) - 1) - 1
+// = 2^log2(n) * 2^-1 - 1
+// = n / 2 - 1.
+// Considering comparisons from leaf and non-leaf nodes, we can estimate the
+// number of comparisons in a simple closed form :
+// n + n / 2 - 1 = n * 3 / 2 - 1
+int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
+  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
+}
+
 /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
 /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
 class InlineCostCallAnalyzer final : public CallAnalyzer {
@@ -582,28 +601,15 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       addCost(JTCost, (int64_t)CostUpperBound);
       return;
     }
-    // Considering forming a binary search, we should find the number of nodes
-    // which is same as the number of comparisons when lowered. For a given
-    // number of clusters, n, we can define a recursive function, f(n), to find
-    // the number of nodes in the tree. The recursion is :
-    // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
-    // and f(n) = n, when n <= 3.
-    // This will lead a binary tree where the leaf should be either f(2) or f(3)
-    // when n > 3. So, the number of comparisons from leaves should be n, while
-    // the number of non-leaf should be :
-    // 2^(log2(n) - 1) - 1
-    // = 2^log2(n) * 2^-1 - 1
-    // = n / 2 - 1.
-    // Considering comparisons from leaf and non-leaf nodes, we can estimate the
-    // number of comparisons in a simple closed form :
-    // n + n / 2 - 1 = n * 3 / 2 - 1
+
     if (NumCaseCluster <= 3) {
       // Suppose a comparison includes one compare and one conditional branch.
       addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
       return;
     }
 
-    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+    int64_t ExpectedNumberOfCompare =
+        getExpectedNumberOfCompare(NumCaseCluster);
     int64_t SwitchCost =
         ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
 
@@ -936,6 +942,209 @@ public:
   int getCost() { return Cost; }
   bool wasDecidedByCostBenefit() { return DecidedByCostBenefit; }
 };
+
+class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
+private:
+  InlineCostFeatures Cost = {};
+
+  // FIXME: These constants are taken from the heuristic-based cost visitor.
+  // These should be removed entirely in a later revision to avoid reliance on
+  // heuristics in the ML inliner.
+  static constexpr int JTCostMultiplier = 4;
+  static constexpr int CaseClusterCostMultiplier = 2;
+  static constexpr int SwitchCostMultiplier = 2;
+
+  // FIXME: These are taken from the heuristic-based cost visitor: we should
+  // eventually abstract these to the CallAnalyzer to avoid duplication.
+  unsigned SROACostSavingOpportunities = 0;
+  int VectorBonus = 0;
+  int SingleBBBonus = 0;
+  int Threshold = 5;
+
+  DenseMap<AllocaInst *, unsigned> SROACosts;
+
+  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
+    Cost[static_cast<size_t>(Feature)] += Delta;
+  }
+
+  void set(InlineCostFeatureIndex Feature, int64_t Value) {
+    Cost[static_cast<size_t>(Feature)] = Value;
+  }
+
+  void onDisableSROA(AllocaInst *Arg) override {
+    auto CostIt = SROACosts.find(Arg);
+    if (CostIt == SROACosts.end())
+      return;
+
+    increment(InlineCostFeatureIndex::SROALosses, CostIt->second);
+    SROACostSavingOpportunities -= CostIt->second;
+    SROACosts.erase(CostIt);
+  }
+
+  void onDisableLoadElimination() override {
+    set(InlineCostFeatureIndex::LoadElimination, 1);
+  }
+
+  void onCallPenalty() override {
+    increment(InlineCostFeatureIndex::CallPenalty,
+              InlineConstants::CallPenalty);
+  }
+
+  void onCallArgumentSetup(const CallBase &Call) override {
+    increment(InlineCostFeatureIndex::CallArgumentSetup,
+              Call.arg_size() * InlineConstants::InstrCost);
+  }
+
+  void onLoadRelativeIntrinsic() override {
+    increment(InlineCostFeatureIndex::LoadRelativeIntrinsic,
+              3 * InlineConstants::InstrCost);
+  }
+
+  void onLoweredCall(Function *F, CallBase &Call,
+                     bool IsIndirectCall) override {
+    increment(InlineCostFeatureIndex::LoweredCallArgSetup,
+              Call.arg_size() * InlineConstants::InstrCost);
+
+    if (IsIndirectCall) {
+      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
+                                         /*HintThreshold*/ {},
+                                         /*ColdThreshold*/ {},
+                                         /*OptSizeThreshold*/ {},
+                                         /*OptMinSizeThreshold*/ {},
+                                         /*HotCallSiteThreshold*/ {},
+                                         /*LocallyHotCallSiteThreshold*/ {},
+                                         /*ColdCallSiteThreshold*/ {},
+                                         /*ComputeFullInlineCost*/ true,
+                                         /*EnableDeferral*/ true};
+      IndirectCallParams.DefaultThreshold =
+          InlineConstants::IndirectCallThreshold;
+
+      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
+                                GetAssumptionCache, GetBFI, PSI, ORE, false,
+                                true);
+      if (CA.analyze().isSuccess()) {
+        increment(InlineCostFeatureIndex::NestedInlineCostEstimate,
+                  CA.getCost());
+        increment(InlineCostFeatureIndex::NestedInlines, 1);
+      }
+    } else {
+      onCallPenalty();
+    }
+  }
+
+  void onFinalizeSwitch(unsigned JumpTableSize,
+                        unsigned NumCaseCluster) override {
+
+    if (JumpTableSize) {
+      int64_t JTCost =
+          static_cast<int64_t>(JumpTableSize) * InlineConstants::InstrCost +
+          JTCostMultiplier * InlineConstants::InstrCost;
+      increment(InlineCostFeatureIndex::JumpTablePenalty, JTCost);
+      return;
+    }
+
+    if (NumCaseCluster <= 3) {
+      increment(InlineCostFeatureIndex::CaseClusterPenalty,
+                NumCaseCluster * CaseClusterCostMultiplier *
+                    InlineConstants::InstrCost);
+      return;
+    }
+
+    int64_t ExpectedNumberOfCompare =
+        getExpectedNumberOfCompare(NumCaseCluster);
+
+    int64_t SwitchCost = ExpectedNumberOfCompare * SwitchCostMultiplier *
+                         InlineConstants::InstrCost;
+    increment(InlineCostFeatureIndex::SwitchPenalty, SwitchCost);
+  }
+
+  void onMissedSimplification() override {
+    increment(InlineCostFeatureIndex::UnsimplifiedCommonInstructions,
+              InlineConstants::InstrCost);
+  }
+
+  void onInitializeSROAArg(AllocaInst *Arg) override { SROACosts[Arg] = 0; }
+  void onAggregateSROAUse(AllocaInst *Arg) override {
+    SROACosts.find(Arg)->second += InlineConstants::InstrCost;
+    SROACostSavingOpportunities += InlineConstants::InstrCost;
+  }
+
+  void onBlockAnalyzed(const BasicBlock *BB) override {
+    if (BB->getTerminator()->getNumSuccessors() > 1)
+      set(InlineCostFeatureIndex::IsMultipleBlocks, 1);
+    Threshold -= SingleBBBonus;
+  }
+
+  InlineResult finalizeAnalysis() override {
+    auto *Caller = CandidateCall.getFunction();
+    if (Caller->hasMinSize()) {
+      DominatorTree DT(F);
+      LoopInfo LI(DT);
+      for (Loop *L : LI) {
+        // Ignore loops that will not be executed
+        if (DeadBlocks.count(L->getHeader()))
+          continue;
+        increment(InlineCostFeatureIndex::NumLoops,
+                  InlineConstants::CallPenalty);
+      }
+    }
+    set(InlineCostFeatureIndex::DeadBlocks, DeadBlocks.size());
+    set(InlineCostFeatureIndex::SimplifiedInstructions,
+        NumInstructionsSimplified);
+    set(InlineCostFeatureIndex::ConstantArgs, NumConstantArgs);
+    set(InlineCostFeatureIndex::ConstantOffsetPtrArgs,
+        NumConstantOffsetPtrArgs);
+    set(InlineCostFeatureIndex::SROASavings, SROACostSavingOpportunities);
+
+    if (NumVectorInstructions <= NumInstructions / 10)
+      increment(InlineCostFeatureIndex::Threshold, -1 * VectorBonus);
+    else if (NumVectorInstructions <= NumInstructions / 2)
+      increment(InlineCostFeatureIndex::Threshold, -1 * (VectorBonus / 2));
+
+    set(InlineCostFeatureIndex::Threshold, Threshold);
+
+    return InlineResult::success();
+  }
+
+  bool shouldStop() override { return false; }
+
+  void onLoadEliminationOpportunity() override {
+    increment(InlineCostFeatureIndex::LoadElimination, 1);
+  }
+
+  InlineResult onAnalysisStart() override {
+    increment(InlineCostFeatureIndex::CallSiteCost,
+              -1 * getCallsiteCost(this->CandidateCall, DL));
+
+    set(InlineCostFeatureIndex::ColdCcPenalty,
+        (F.getCallingConv() == CallingConv::Cold));
+
+    // FIXME: we shouldn't repeat this logic in both the Features and Cost
+    // analyzer - instead, we should abstract it to a common method in the
+    // CallAnalyzer
+    int SingleBBBonusPercent = 50;
+    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
+    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
+    Threshold *= TTI.getInliningThresholdMultiplier();
+    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
+    VectorBonus = Threshold * VectorBonusPercent / 100;
+    Threshold += (SingleBBBonus + VectorBonus);
+
+    return InlineResult::success();
+  }
+
+public:
+  InlineCostFeaturesAnalyzer(
+      const TargetTransformInfo &TTI,
+      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
+      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
+      CallBase &Call)
+      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
+
+  const InlineCostFeatures &features() const { return Cost; }
+};
+
 } // namespace
 
 /// Test whether the given value is an Alloca-derived function argument.
@@ -2502,6 +2711,19 @@ Optional<int> llvm::getInliningCostEstimate(
   return CA.getCost();
 }
 
+Optional<InlineCostFeatures> llvm::getInliningCostFeatures(
+    CallBase &Call, TargetTransformInfo &CalleeTTI,
+    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
+  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
+                                 ORE, *Call.getCalledFunction(), Call);
+  auto R = CFA.analyze();
+  if (!R.isSuccess())
+    return None;
+  return CFA.features();
+}
+
 Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
     CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
     function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
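The comment hoisted to namespace scope in the first hunk derives the closed form 3 * n / 2 - 1 from the recursion f(n) = 1 + f(n/2) + f(n - n/2), with f(n) = n for n <= 3. The standalone sketch below is not part of the patch and its helper names are made up for illustration; it evaluates the recursion exactly and prints it next to the closed-form value, which makes it easy to see that the closed form is an estimate (exact for powers of two, off by a little for sizes such as n = 6), consistent with the "ExpectedNumberOfCompare" naming in the code above.

// Standalone check (illustrative only, not part of this commit): compare the
// exact recursion from the comment with the closed-form estimate used by
// getExpectedNumberOfCompare().
#include <cstdint>
#include <cstdio>

// f(n) = n for n <= 3; otherwise one root compare plus the two halves.
static int64_t recursiveCompareCount(int64_t N) {
  if (N <= 3)
    return N;
  int64_t Lo = N / 2;
  return 1 + recursiveCompareCount(Lo) + recursiveCompareCount(N - Lo);
}

// The closed form from the comment: n + n / 2 - 1 == 3 * n / 2 - 1.
static int64_t closedFormEstimate(int64_t N) { return 3 * N / 2 - 1; }

int main() {
  std::printf("%4s %10s %10s\n", "n", "recursion", "estimate");
  for (int64_t N = 4; N <= 12; ++N)
    std::printf("%4lld %10lld %10lld\n", (long long)N,
                (long long)recursiveCompareCount(N),
                (long long)closedFormEstimate(N));
  return 0;
}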
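The last hunk adds the public entry point llvm::getInliningCostFeatures, which runs InlineCostFeaturesAnalyzer over a call site and returns the raw feature vector instead of a single cost. A possible caller might look like the sketch below; the signature of getInliningCostFeatures is taken from the hunk above, while the include paths, the FunctionAnalysisManager wiring, and the extractFeatures helper are assumptions made for illustration only.

// Illustrative caller (not part of this commit). Assumes the declarations
// added by the accompanying header change are reachable via
// llvm/Analysis/InlineCost.h.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Hypothetical helper: returns true and fills Out when the feature analysis
// succeeds for a direct call site.
static bool extractFeatures(CallBase &Call, FunctionAnalysisManager &FAM,
                            ProfileSummaryInfo *PSI, InlineCostFeatures &Out) {
  Function *Callee = Call.getCalledFunction();
  if (!Callee)
    return false; // No single callee to analyze for an indirect call.

  auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };
  TargetTransformInfo &CalleeTTI = FAM.getResult<TargetIRAnalysis>(*Callee);

  if (auto Features = getInliningCostFeatures(Call, CalleeTTI,
                                              GetAssumptionCache, GetBFI, PSI,
                                              /*ORE=*/nullptr)) {
    Out = *Features; // One slot per InlineCostFeatureIndex entry.
    return true;
  }
  return false;
}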