path: root/llvm/lib/CodeGen/MachineScheduler.cpp
author    QingShan Zhang <qshanz@cn.ibm.com>  2020-08-26 12:26:21 +0000
committer QingShan Zhang <qshanz@cn.ibm.com>  2020-08-26 12:33:59 +0000
commit    ebf3b188c6edcce7e90ddcacbe7c51c90d95b0ac (patch)
tree      b8e8af31aac49efbe18773f16aa8d53415c0e3e9 /llvm/lib/CodeGen/MachineScheduler.cpp
parent    d289a97f91443177b605926668512479c2cee37b (diff)
[Scheduling] Implement a new way to cluster loads/stores
Before calling the target hook to determine whether two loads/stores are
clusterable, we put them into different groups to avoid fake clusters caused
by dependencies. For now, we put loads/stores into the same group if they
have the same predecessor, assuming that two loads/stores with a common
predecessor are unlikely to depend on each other. However, an SUnit may have
several predecessors, and we currently just pick the first one with a
non-data/non-artificial dependency, which is too arbitrary and hard to fix.
This patch proposes a better implementation:

1. Collect all the loads/stores that have memory info first, to reduce the
   complexity.
2. Sort these loads/stores so that the search can stop as early as possible.
3. For each load/store, look in sorted order for the first instruction with
   no dependency on it, and check whether the two can be clustered.

Reviewed By: Jay Foad

Differential Revision: https://reviews.llvm.org/D85517
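[Editor's note] To make the collect/sort/seek strategy concrete, here is a
minimal standalone sketch. It is not the patch's code: MemOp, dependent(),
and shouldCluster() are toy stand-ins for SUnit,
ScheduleDAGInstrs::IsReachable(), and TargetInstrInfo::shouldClusterMemOps(),
and the thresholds are made up.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <tuple>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    struct MemOp {
      unsigned Node;  // stand-in for SUnit::NodeNum
      unsigned Base;  // stand-in for the base register operand
      int64_t Offset;
      unsigned Width;
      bool operator<(const MemOp &O) const {
        // Step 2: order by base, then offset, then node number, so likely
        // cluster partners end up adjacent in the array.
        return std::tie(Base, Offset, Node) <
               std::tie(O.Base, O.Offset, O.Node);
      }
    };

    // Toy dependency oracle; the real code queries the DAG in both
    // directions with IsReachable(). Always false here to keep the sketch
    // self-contained.
    static bool dependent(const MemOp &, const MemOp &) { return false; }

    // Toy target hook; the real decision lives in shouldClusterMemOps().
    static bool shouldCluster(const MemOp &A, const MemOp &B, unsigned Len,
                              unsigned Bytes) {
      return A.Base == B.Base && Len <= 4 && Bytes <= 32;
    }

    static void clusterMemOps(std::vector<MemOp> &Ops) {
      std::sort(Ops.begin(), Ops.end()); // step 2
      // Per-node (length, bytes) of the cluster the node currently ends.
      std::unordered_map<unsigned, std::pair<unsigned, unsigned>> Info;
      for (size_t I = 0; I + 1 < Ops.size(); ++I) {
        // Step 3: scan forward for the first follower that is not already
        // clustered and has no dependency with Ops[I], then stop.
        size_t Next = I + 1;
        while (Next < Ops.size() &&
               (Info.count(Ops[Next].Node) || dependent(Ops[I], Ops[Next]) ||
                dependent(Ops[Next], Ops[I])))
          ++Next;
        if (Next == Ops.size())
          continue;
        unsigned Len = 2, Bytes = Ops[I].Width + Ops[Next].Width;
        if (auto It = Info.find(Ops[I].Node); It != Info.end()) {
          Len = It->second.first + 1; // grow an existing cluster
          Bytes = It->second.second + Ops[Next].Width;
        }
        if (shouldCluster(Ops[I], Ops[Next], Len, Bytes))
          Info[Ops[Next].Node] = {Len, Bytes}; // Ops[Next] ends the cluster
      }
    }

With hypothetical inputs {(Node 0, Base 1, Off 0, W 8), (Node 2, Base 1,
Off 8, W 8), (Node 1, Base 2, Off 0, W 8)}, sorting makes the two Base-1 ops
adjacent, so the very first pair considered is the profitable one; the Base-2
op is never paired because shouldCluster() rejects it. This is why the sorted
order lets the forward scan stop at the first non-dependent candidate.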
Diffstat (limited to 'llvm/lib/CodeGen/MachineScheduler.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp | 136
1 file changed, 72 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index a8ccf26..b6d0d9a 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1530,7 +1530,10 @@ public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
protected:
- void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+ void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+ ScheduleDAGInstrs *DAG);
+ void collectMemOpRecords(std::vector<SUnit> &SUnits,
+ SmallVectorImpl<MemOpInfo> &MemOpRecords);
};
class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
} // end namespace llvm
+// Sort all the loads/stores first. Then, for each load/store, check the
+// following loads/stores one by one until reaching the first non-dependent
+// one, and call the target hook to see if they can be clustered.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
- ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
- SmallVector<MemOpInfo, 32> MemOpRecords;
- for (SUnit *SU : MemOps) {
- const MachineInstr &MI = *SU->getInstr();
- SmallVector<const MachineOperand *, 4> BaseOps;
- int64_t Offset;
- bool OffsetIsScalable;
- unsigned Width;
- if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
- OffsetIsScalable, Width, TRI)) {
- MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
- LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
- << Offset << ", OffsetIsScalable: " << OffsetIsScalable
- << ", Width: " << Width << "\n");
- }
-#ifndef NDEBUG
- for (auto *Op : BaseOps)
- assert(Op);
-#endif
- }
- if (MemOpRecords.size() < 2)
- return;
-
- llvm::sort(MemOpRecords);
+ ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+ // Keep track of the current cluster length and bytes for each SUnit.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
- unsigned ClusterLength = 1;
- unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
// Decision to cluster mem ops is taken based on target dependent logic
auto MemOpa = MemOpRecords[Idx];
- auto MemOpb = MemOpRecords[Idx + 1];
- ++ClusterLength;
- CurrentClusterBytes += MemOpb.Width;
- if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
- CurrentClusterBytes)) {
- // Current mem ops pair could not be clustered, reset cluster length, and
- // go to next pair
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+
+ // Seek the next load/store to cluster with.
+ unsigned NextIdx = Idx + 1;
+ for (; NextIdx < End; ++NextIdx)
+ // Skip if MemOpb has already been clustered or has a dependency with
+ // MemOpa.
+ if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+ !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+ !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+ break;
+ if (NextIdx == End)
continue;
+
+ auto MemOpb = MemOpRecords[NextIdx];
+ unsigned ClusterLength = 2;
+ unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+ if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+ ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+ CurrentClusterBytes =
+ SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
}
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+ CurrentClusterBytes))
+ continue;
+
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
- if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
- ClusterLength = 1;
- CurrentClusterBytes = MemOpb.Width;
+ if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;
- }
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
@@ -1656,42 +1649,57 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
}
}
+ SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+ CurrentClusterBytes};
+
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}
}
-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
- // Map DAG NodeNum to a set of dependent MemOps in store chain.
- DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
- for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+ std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+ for (auto &SU : SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) ||
(!IsLoad && !SU.getInstr()->mayStore()))
continue;
- unsigned ChainPredID = DAG->SUnits.size();
- for (const SDep &Pred : SU.Preds) {
- // We only want to cluster the mem ops that have the same ctrl(non-data)
- // pred so that they didn't have ctrl dependency for each other. But for
- // store instrs, we can still cluster them if the pred is load instr.
- if ((Pred.isCtrl() &&
- (IsLoad ||
- (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
- !Pred.isArtificial()) {
- ChainPredID = Pred.getSUnit()->NodeNum;
- break;
- }
+ const MachineInstr &MI = *SU.getInstr();
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ unsigned Width;
+ if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+ OffsetIsScalable, Width, TRI)) {
+ MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+ LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+ << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+ << ", Width: " << Width << "\n");
}
- // Insert the SU to corresponding store chain.
- auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
- Chain.push_back(&SU);
+#ifndef NDEBUG
+ for (auto *Op : BaseOps)
+ assert(Op);
+#endif
}
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+ // Collect all the clusterable loads/stores.
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+ if (MemOpRecords.size() < 2)
+ return;
+
+ // Sort the loads/stores so that we can stop the search as early as
+ // possible.
+ llvm::sort(MemOpRecords);
- // Iterate over the store chains.
- for (auto &SCD : StoreChains)
- clusterNeighboringMemOps(SCD.second, DAG);
+ // Try to cluster all the neighboring loads/stores.
+ clusterNeighboringMemOps(MemOpRecords, DAG);
}
//===----------------------------------------------------------------------===//
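[Editor's note] For context beyond this patch: the mutation is opt-in, and
apply() above runs during scheduler DAG post-processing. Targets that want
load/store clustering register it when building their scheduler. The sketch
below follows the in-tree pattern; MyTargetPassConfig is a hypothetical class
name, while createGenericSchedLive(), createLoadClusterDAGMutation(), and
createStoreClusterDAGMutation() are the real APIs declared in
llvm/CodeGen/MachineScheduler.h.

    // Hypothetical target hook-up following the in-tree pattern.
    ScheduleDAGInstrs *
    MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG = createGenericSchedLive(C);
      // Register the load/store clustering mutations discussed above.
      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
      return DAG;
    }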