path: root/llvm/lib/CodeGen/MachineScheduler.cpp
author    QingShan Zhang <qshanz@cn.ibm.com>  2020-08-26 12:26:21 +0000
committer QingShan Zhang <qshanz@cn.ibm.com>  2020-08-26 12:33:59 +0000
commit    ebf3b188c6edcce7e90ddcacbe7c51c90d95b0ac (patch)
tree      b8e8af31aac49efbe18773f16aa8d53415c0e3e9 /llvm/lib/CodeGen/MachineScheduler.cpp
parent    d289a97f91443177b605926668512479c2cee37b (diff)
[Scheduling] Implement a new way to cluster loads/stores
Before calling the target hook to determine whether two loads/stores are
clusterable, we put them into different groups to avoid fake clusters caused
by dependencies. For now, we put loads/stores into the same group if they
have the same predecessor, assuming that two loads/stores with a common
predecessor are unlikely to depend on each other. However, an SUnit may have
several predecessors, and we currently just pick the first one with a
non-data/non-artificial dependency, which is too arbitrary and hard to fix.
This patch proposes a better implementation:

1. Collect all the loads/stores that have memory info first, to reduce the
   complexity.
2. Sort these loads/stores so that the search can stop as early as possible.
3. For each load/store, look in sorted order for the first instruction with
   no dependency on it, and check whether the two can be clustered.

Reviewed By: Jay Foad

Differential Revision: https://reviews.llvm.org/D85517
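[Editor's note] To make the collect/sort/seek strategy concrete, here is a
minimal standalone sketch. It is not the patch's code: MemOp, dependent(),
and shouldCluster() are toy stand-ins for SUnit,
ScheduleDAGInstrs::IsReachable(), and TargetInstrInfo::shouldClusterMemOps(),
and the thresholds are made up.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <tuple>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    struct MemOp {
      unsigned Node;  // stand-in for SUnit::NodeNum
      unsigned Base;  // stand-in for the base register operand
      int64_t Offset;
      unsigned Width;
      bool operator<(const MemOp &O) const {
        // Step 2: order by base, then offset, then node number, so likely
        // cluster partners end up adjacent in the array.
        return std::tie(Base, Offset, Node) <
               std::tie(O.Base, O.Offset, O.Node);
      }
    };

    // Toy dependency oracle; the real code queries the DAG in both
    // directions with IsReachable(). Always false here to keep the sketch
    // self-contained.
    static bool dependent(const MemOp &, const MemOp &) { return false; }

    // Toy target hook; the real decision lives in shouldClusterMemOps().
    static bool shouldCluster(const MemOp &A, const MemOp &B, unsigned Len,
                              unsigned Bytes) {
      return A.Base == B.Base && Len <= 4 && Bytes <= 32;
    }

    static void clusterMemOps(std::vector<MemOp> &Ops) {
      std::sort(Ops.begin(), Ops.end()); // step 2
      // Per-node (length, bytes) of the cluster the node currently ends.
      std::unordered_map<unsigned, std::pair<unsigned, unsigned>> Info;
      for (size_t I = 0; I + 1 < Ops.size(); ++I) {
        // Step 3: scan forward for the first follower that is not already
        // clustered and has no dependency with Ops[I], then stop.
        size_t Next = I + 1;
        while (Next < Ops.size() &&
               (Info.count(Ops[Next].Node) || dependent(Ops[I], Ops[Next]) ||
                dependent(Ops[Next], Ops[I])))
          ++Next;
        if (Next == Ops.size())
          continue;
        unsigned Len = 2, Bytes = Ops[I].Width + Ops[Next].Width;
        if (auto It = Info.find(Ops[I].Node); It != Info.end()) {
          Len = It->second.first + 1; // grow an existing cluster
          Bytes = It->second.second + Ops[Next].Width;
        }
        if (shouldCluster(Ops[I], Ops[Next], Len, Bytes))
          Info[Ops[Next].Node] = {Len, Bytes}; // Ops[Next] ends the cluster
      }
    }

With hypothetical inputs {(Node 0, Base 1, Off 0, W 8), (Node 2, Base 1,
Off 8, W 8), (Node 1, Base 2, Off 0, W 8)}, sorting makes the two Base-1 ops
adjacent, so the very first pair considered is the profitable one; the Base-2
op is never paired because shouldCluster() rejects it. This is why the sorted
order lets the forward scan stop at the first non-dependent candidate.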
Diffstat (limited to 'llvm/lib/CodeGen/MachineScheduler.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp | 136
1 file changed, 72 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index a8ccf26..b6d0d9a 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1530,7 +1530,10 @@ public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
- void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+ void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+ ScheduleDAGInstrs *DAG);
+ void collectMemOpRecords(std::vector<SUnit> &SUnits,
+ SmallVectorImpl<MemOpInfo> &MemOpRecords);
};
class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
} // end namespace llvm
+// Sort all the loads/stores first. Then, for each load/store, check the
+// following loads/stores one by one until reaching the first non-dependent
+// one, and call the target hook to see if they can be clustered.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
- ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
- SmallVector<MemOpInfo, 32> MemOpRecords;
- for (SUnit *SU : MemOps) {
- const MachineInstr &MI = *SU->getInstr();
- SmallVector<const MachineOperand *, 4> BaseOps;
- int64_t Offset;
- bool OffsetIsScalable;
- unsigned Width;
- if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
- OffsetIsScalable, Width, TRI)) {
- MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
- LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
- << Offset << ", OffsetIsScalable: " << OffsetIsScalable
- << ", Width: " << Width << "\n");
- }
-#ifndef NDEBUG
- for (auto *Op : BaseOps)
- assert(Op);
-#endif
- }
- if (MemOpRecords.size() < 2)
- return;
-
- llvm::sort(MemOpRecords);
+ ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+ // Keep track of the current cluster length and bytes for each SUnit.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
- unsigned ClusterLength = 1;
- unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
// Decision to cluster mem ops is taken based on target dependent logic
auto MemOpa = MemOpRecords[Idx];
- auto MemOpb = MemOpRecords[Idx + 1];
- ++ClusterLength;
- CurrentClusterBytes += MemOpb.Width;
- if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
- CurrentClusterBytes)) {
- // Current mem ops pair could not be clustered, reset cluster length, and
- // go to next pair
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+
+ // Seek the next load/store to cluster with.
+ unsigned NextIdx = Idx + 1;
+ for (; NextIdx < End; ++NextIdx)
+ // Skip if MemOpb has already been clustered or has a dependency with
+ // MemOpa.
+ if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+ !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+ !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+ break;
+ if (NextIdx == End)
continue;
+
+ auto MemOpb = MemOpRecords[NextIdx];
+ unsigned ClusterLength = 2;
+ unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+ if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+ ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+ CurrentClusterBytes =
+ SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+ CurrentClusterBytes))
+ continue;
+
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
- if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+ if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;
- }
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
@@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
}
}
+ SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+ CurrentClusterBytes};
+
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}
}
-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
- // Map DAG NodeNum to a set of dependent MemOps in store chain.
- DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
- for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+ std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+ for (auto &SU : SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) ||
(!IsLoad && !SU.getInstr()->mayStore()))
continue;
- unsigned ChainPredID = DAG->SUnits.size();
- for (const SDep &Pred : SU.Preds) {
- // We only want to cluster the mem ops that have the same ctrl(non-data)
- // pred so that they didn't have ctrl dependency for each other. But for
- // store instrs, we can still cluster them if the pred is load instr.
- if ((Pred.isCtrl() &&
- (IsLoad ||
- (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
- !Pred.isArtificial()) {
- ChainPredID = Pred.getSUnit()->NodeNum;
- break;
- }
+ const MachineInstr &MI = *SU.getInstr();
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ unsigned Width;
+ if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+ OffsetIsScalable, Width, TRI)) {
+ MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+ LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+ << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+ << ", Width: " << Width << "\n");
}
- // Insert the SU to corresponding store chain.
- auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
- Chain.push_back(&SU);
+#ifndef NDEBUG
+ for (auto *Op : BaseOps)
+ assert(Op);
+#endif
}
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+ // Collect all the clusterable loads/stores.
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+ if (MemOpRecords.size() < 2)
+ return;
+
+ // Sort the loads/stores so that we can stop the search as early as
+ // possible.
+ llvm::sort(MemOpRecords);
- // Iterate over the store chains.
- for (auto &SCD : StoreChains)
- clusterNeighboringMemOps(SCD.second, DAG);
+ // Try to cluster all the neighboring loads/stores.
+ clusterNeighboringMemOps(MemOpRecords, DAG);
}
//===----------------------------------------------------------------------===//
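[Editor's note] For context beyond this patch: the mutation is opt-in, and
apply() above runs during scheduler DAG post-processing. Targets that want
load/store clustering register it when building their scheduler. The sketch
below follows the in-tree pattern; MyTargetPassConfig is a hypothetical class
name, while createGenericSchedLive(), createLoadClusterDAGMutation(), and
createStoreClusterDAGMutation() are the real APIs declared in
llvm/CodeGen/MachineScheduler.h.

    // Hypothetical target hook-up following the in-tree pattern.
    ScheduleDAGInstrs *
    MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG = createGenericSchedLive(C);
      // Register the load/store clustering mutations discussed above.
      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
      return DAG;
    }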