[LoopInterchange] New cost model for loop interchange

This is another attempt to land this patch. The patch proposed to use a new cost model for loop interchange, which is obtained from loop cache analysis. Given a loopnest, what loop cache analysis returns is a vector of loops [loop0, loop1, loop2, ...] where loop0 should be replaced as the outermost loop, loop1 should be placed one more level inside, and loop2 one more level inside, etc. What loop cache analysis does is not only more comprehensive than the current cost model, it is also a "one-shot" query which means that we only need to query it once during the entire loop interchange pass, which is better than the current cost model where we query it every time we check whether it is profitable to interchange two loops. Thus complexity is reduced, especially after D120386 where we do more interchanges to get the globally optimal loop access pattern. Updates made to test cases are mostly minor changes and some corrections. One change that applies to all tests is that we added an option `-cache-line-size=64` to the RUN lines. This is ensure that loop cache analysis receives a valid number of cache line size for correct analysis. Test coverage for loop interchange is not reduced. Currently we did not completely remove the legacy cost model, but keep it as fall-back in case the new cost model did not run successfully. This is because currently we have some limitations in delinearization, which sometimes makes loop cache analysis bail out. The longer term goal is to enhance delinearization and eventually remove the legacy cost model compeletely. Reviewed By: bmahjour, #loopoptwg Differential Revision: https://reviews.llvm.org/D124926
author: Congzhe Cao <congzhe.cao@huawei.com> 2022-06-28 00:06:16 -0400
committer: CongzheUalberta <congzhecao@gmail.com> 2022-06-28 00:08:37 -0400
commit: b941857b40edd7f3f3a9ec2ec85a26db24739774 (patch)
tree: dda1d8671100817003112aba9ba444545c8b62c6 /llvm/lib
parent: f1dcc6af30d98cef6d0aa9579148fa223dbb5d7c (diff)
download: llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.zip
llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.tar.gz
llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.tar.bz2
1 files changed, 63 insertions, 29 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index ced4396..1d3023d 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopNestAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -358,8 +359,10 @@ public:
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
 
   /// Check if the loop interchange is profitable.
-  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix);
+  bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
+                    unsigned InnerLoopId, unsigned OuterLoopId,
+                    CharMatrix &DepMatrix,
+                    const DenseMap<const Loop *, unsigned> &CostMap);
 
 private:
   int getInstrOrderCost();
@@ -410,13 +413,15 @@ struct LoopInterchange {
   LoopInfo *LI = nullptr;
   DependenceInfo *DI = nullptr;
   DominatorTree *DT = nullptr;
+  std::unique_ptr<CacheCost> CC = nullptr;
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
   LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
-                  DominatorTree *DT, OptimizationRemarkEmitter *ORE)
-      : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
+                  DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
+                  OptimizationRemarkEmitter *ORE)
+      : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
 
   bool run(Loop *L) {
     if (L->getParentLoop())
@@ -499,6 +504,21 @@ struct LoopInterchange {
     }
 
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Obtain the loop vector returned from loop cache analysis beforehand,
+    // and put each <Loop, index> pair into a map for constant time query
+    // later. Indices in loop vector reprsent the optimal order of the
+    // corresponding loop, e.g., given a loopnest with depth N, index 0
+    // indicates the loop should be placed as the outermost loop and index N
+    // indicates the loop should be placed as the innermost loop.
+    //
+    // For the old pass manager CacheCost would be null.
+    DenseMap<const Loop *, unsigned> CostMap;
+    if (CC != nullptr) {
+      const auto &LoopCosts = CC->getLoopCosts();
+      for (unsigned i = 0; i < LoopCosts.size(); i++) {
+        CostMap[LoopCosts[i].first] = i;
+      }
+    }
     // We try to achieve the globally optimal memory access for the loopnest,
     // and do interchange based on a bubble-sort fasion. We start from
     // the innermost loop, move it outwards to the best possible position
@@ -507,7 +527,7 @@ struct LoopInterchange {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
         bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
-                                        DependencyMatrix);
+                                        DependencyMatrix, CostMap);
         if (!Interchanged)
           continue;
         // Loops interchanged, update LoopList accordingly.
@@ -531,7 +551,8 @@ struct LoopInterchange {
 
   bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
                    unsigned OuterLoopId,
-                   std::vector<std::vector<char>> &DependencyMatrix) {
+                   std::vector<std::vector<char>> &DependencyMatrix,
+                   const DenseMap<const Loop *, unsigned> &CostMap) {
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@@ -541,7 +562,8 @@ struct LoopInterchange {
     }
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
-    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+    if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
+                          DependencyMatrix, CostMap)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1135,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
   return !DepMatrix.empty();
 }
 
-bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
-                                                unsigned OuterLoopId,
-                                                CharMatrix &DepMatrix) {
-  // TODO: Add better profitability checks.
-  // e.g
-  // 1) Construct dependency matrix and move the one with no loop carried dep
-  //    inside to enable vectorization.
-
-  // This is rough cost estimation algorithm. It counts the good and bad order
-  // of induction variables in the instruction and allows reordering if number
-  // of bad orders is more than good.
-  int Cost = getInstrOrderCost();
-  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-  if (Cost < -LoopInterchangeCostThreshold)
-    return true;
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap) {
+  // TODO: Remove the legacy cost model.
+
+  // This is the new cost model returned from loop cache analysis.
+  // A smaller index means the loop should be placed an outer loop, and vice
+  // versa.
+  if (CostMap.find(InnerLoop) != CostMap.end() &&
+      CostMap.find(OuterLoop) != CostMap.end()) {
+    unsigned InnerIndex = 0, OuterIndex = 0;
+    InnerIndex = CostMap.find(InnerLoop)->second;
+    OuterIndex = CostMap.find(OuterLoop)->second;
+    LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
+                      << ", OuterIndex = " << OuterIndex << "\n");
+    if (InnerIndex < OuterIndex)
+      return true;
+  } else {
+    // Legacy cost model: this is rough cost estimation algorithm. It counts the
+    // good and bad order of induction variables in the instruction and allows
+    // reordering if number of bad orders is more than good.
+    int Cost = getInstrOrderCost();
+    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+    if (Cost < -LoopInterchangeCostThreshold)
+      return true;
+  }
 
   // It is not profitable as per current cache profitability model. But check if
   // we can move this loop outside to improve parallelism.
@@ -1160,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
     return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                     InnerLoop->getStartLoc(),
                                     InnerLoop->getHeader())
-           << "Interchanging loops is too costly (cost="
-           << ore::NV("Cost", Cost) << ", threshold="
-           << ore::NV("Threshold", LoopInterchangeCostThreshold)
-           << ") and it does not improve parallelism.";
+           << "Interchanging loops is too costly and it does not improve "
+              "parallelism.";
   });
   return false;
 }
@@ -1709,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass {
     auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
-    return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
+    std::unique_ptr<CacheCost> CC = nullptr;
+    return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L);
   }
 };
 } // namespace
@@ -1737,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
   Function &F = *LN.getParent();
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  std::unique_ptr<CacheCost> CC =
+      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
   OptimizationRemarkEmitter ORE(&F);
-  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN))
+  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
 }
author	Congzhe Cao <congzhe.cao@huawei.com>	2022-06-28 00:06:16 -0400
committer	CongzheUalberta <congzhecao@gmail.com>	2022-06-28 00:08:37 -0400
commit	b941857b40edd7f3f3a9ec2ec85a26db24739774 (patch)
tree	dda1d8671100817003112aba9ba444545c8b62c6 /llvm/lib
parent	f1dcc6af30d98cef6d0aa9579148fa223dbb5d7c (diff)
download	llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.zip llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.tar.gz llvm-b941857b40edd7f3f3a9ec2ec85a26db24739774.tar.bz2