Address Review comments
users/jebyrnes/DS-FIFO-stalls-Rebase3

Change-Id: I6972e887edd5db44ee9bcaed1f79e0c9933f611e
author: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com> 2026-04-24 09:29:10 -0700
committer: Jeffrey Byrnes <Jeffrey.Byrnes@amd.com> 2026-04-24 09:52:30 -0700
commit: 9e4ef195dc3017fa3e4a86092e60b018900c5b53 (patch)
tree: c9a55f3f66b41c11431902e1995a32c6676f4a9b
parent: a18260b1a0d1f7ccb05e64a2a42f550270c0ca0e (diff)
download: llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.gz
llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.bz2
llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.zip
2 files changed, 23 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 093afb08629e..8d550f5078b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -141,6 +141,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
   AllSUs.remove(SU);
   PrioritySUs.remove(SU);
 
+  // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+  // BlockingCycles
   if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0))
     TotalCycles -= BlockingCycles;
 
@@ -170,6 +172,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
 }
 
 void HardwareUnitInfo::finalizeCycles() {
+  // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+  // BlockingCycles
   if (BufferSize <= 1 || AllSUs.empty())
     return;
 
@@ -701,7 +705,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
         *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
     HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
 
-    if (HWUI->getBufferSize() <= 1)
+    // A BufferSize of 0 means "unlimited" buffer, thus we will never fill it.
+    if (HWUI->getBufferSize() == 0)
       return 0;
 
     // getBufferAvailableCycle assumes top-down scheduling.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 198e9b007fa1..fd637e9f8efc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -178,9 +178,23 @@ private:
   /// instructions on other HardwareUnits.
   bool ProducesCoexecWindow = false;
   /// How many instructions can be held simultaneously for this HardwareUnit.
-  /// A value of 0 or 1 means that there is no buffer.
+  /// A value of 0 means there is no limit.
+  ///
+  /// This may approximate the hardware. For example, for LDS instructions
+  /// it is a well-known phenomenon that oversubscribing the LDS unit results in
+  /// longer latency for the LDS instructions. While it is true that there is a
+  /// hard limit to the number of simultaneous in-flight LDS instructions, good
+  /// scheduling would also cool off the LDS to avoid other forms of hardware
+  /// contention and increasing LDS latency. Thus, we limit the number of LDS
+  /// instructions we are willing to schedule close together, though this does
+  /// not correspond 1:1 with a hardware mechanism.
   unsigned BufferSize = 0;
   /// How many cycles it takes for an instruction to clear the buffer.
+  ///
+  /// Again, this may be an approximation. For example, for memory FIFOs, the
+  /// actual number of cycles it will take to clear it is dependent on how
+  /// quickly prior instructions evacuate the FIFO, which is based on runtime
+  /// behavior which is not modelled in the compiler.
   unsigned BufferCycles = 0;
 
 public:
@@ -210,7 +224,7 @@ public:
   /// \returns the next cycle where there is space in the buffer.
   unsigned getBufferAvailableCycle(unsigned CurrCycle) {
     // There is no buffer.
-    if (BufferSize <= 1)
+    if (BufferSize == 0)
       return CurrCycle;
 
     // Buffer is available now.
@@ -265,7 +279,7 @@ public:
   /// the list of PrioritySUs.
   void markScheduled(SUnit *SU, unsigned BlockingCycles);
   /// After we've collected all the region pressure for this HWUI, correct for
-  /// any specifics of the behavior of this resource. For example, if we the
+  /// any specifics of the behavior of this resource. For example, if the
   /// HardwareUnit can hold N instructions simultaneously, then there is no
   /// penalty for scheduling N instructions back to back.
   void finalizeCycles();
author	Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>	2026-04-24 09:29:10 -0700
committer	Jeffrey Byrnes <Jeffrey.Byrnes@amd.com>	2026-04-24 09:52:30 -0700
commit	9e4ef195dc3017fa3e4a86092e60b018900c5b53 (patch)
tree	c9a55f3f66b41c11431902e1995a32c6676f4a9b
parent	a18260b1a0d1f7ccb05e64a2a42f550270c0ca0e (diff)
download	llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.gz llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.bz2 llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.zip