diff options
| author | Jeffrey Byrnes <Jeffrey.Byrnes@amd.com> | 2026-04-24 09:29:10 -0700 |
|---|---|---|
| committer | Jeffrey Byrnes <Jeffrey.Byrnes@amd.com> | 2026-04-24 09:52:30 -0700 |
| commit | 9e4ef195dc3017fa3e4a86092e60b018900c5b53 (patch) | |
| tree | c9a55f3f66b41c11431902e1995a32c6676f4a9b | |
| parent | a18260b1a0d1f7ccb05e64a2a42f550270c0ca0e (diff) | |
| download | llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.gz llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.tar.bz2 llvm-users/jebyrnes/DS-FIFO-stalls-Rebase3.zip | |
Address Review comments (users/jebyrnes/DS-FIFO-stalls-Rebase3)
Change-Id: I6972e887edd5db44ee9bcaed1f79e0c9933f611e
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 20 |
2 files changed, 23 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp index 093afb08629e..8d550f5078b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp @@ -141,6 +141,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) { AllSUs.remove(SU); PrioritySUs.remove(SU); + // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for + // BlockingCycles if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0)) TotalCycles -= BlockingCycles; @@ -170,6 +172,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) { } void HardwareUnitInfo::finalizeCycles() { + // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for + // BlockingCycles if (BufferSize <= 1 || AllSUs.empty()) return; @@ -701,7 +705,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand, *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII)); HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor); - if (HWUI->getBufferSize() <= 1) + // A BufferSize of 0 means "unlimited" buffer, thus we will never fill it. + if (HWUI->getBufferSize() == 0) return 0; // getBufferAvailableCycle assumes top-down scheduling. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h index 198e9b007fa1..fd637e9f8efc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h @@ -178,9 +178,23 @@ private: /// instructions on other HardwareUnits. bool ProducesCoexecWindow = false; /// How many instructions can be held simultaneously for this HardwareUnit. - /// A value of 0 or 1 means that there is no buffer. + /// A value of 0 means there is no limit. + /// + /// This may approximate the hardware. 
For example, for LDS instructions + /// it is a well-known phenomenon that oversubscribing the LDS unit results in + /// longer latency for the LDS instructions. While it is true that there is a + /// hard limit to the amount of simultaneous in-flight LDS instructions, good + /// scheduling would also cool off the LDS to avoid other forms of hardware + /// contention and increasing LDS latency. Thus, we limit the amount of LDS + /// instructions we are willing to schedule close together, though this does + /// not correspond 1:1 with a hardware mechanism. unsigned BufferSize = 0; /// How many cycles it takes for an instruction to clear the buffer. + /// + /// Again, this may be an approximation. For example, for memory FIFOs, the + /// actual amount of cycles it will take to clear it is dependent on how + /// quickly prior instructions evacuate the FIFO, which is based on runtime + /// behavior which is not modelled in the compiler. unsigned BufferCycles = 0; public: @@ -210,7 +224,7 @@ public: /// \returns the next cycle where there is space in the buffer. unsigned getBufferAvailableCycle(unsigned CurrCycle) { // There is no buffer. - if (BufferSize <= 1) + if (BufferSize == 0) return CurrCycle; // Buffer is available now. @@ -265,7 +279,7 @@ public: /// the list of PrioritySUs. void markScheduled(SUnit *SU, unsigned BlockingCycles); /// After we've collected all the region pressure for this HWUI, correct for - /// any specifics of the behavior of this resource. For example, if we the + /// any specifics of the behavior of this resource. For example, if the /// HardwareUnit can hold N instructions simultaneously, then there is no /// penalty for scheduling N instructions back to back. void finalizeCycles(); |
