aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Halkenhaeuser <MichaelGerald.Halkenhauser@amd.com>2023-08-07 06:41:55 -0400
committerMichael Halkenhaeuser <MichaelGerald.Halkenhauser@amd.com>2023-08-07 10:48:02 -0400
commit7eba3e58d5a383e47973c3d3945277f0115c86f3 (patch)
treeb51909fd5f1eb82f12eb211c78e5c97e14d9334e
parent999ac10d7649e41755a9624dbb508c2db8bf3ddd (diff)
downloadllvm-7eba3e58d5a383e47973c3d3945277f0115c86f3.zip
llvm-7eba3e58d5a383e47973c3d3945277f0115c86f3.tar.gz
llvm-7eba3e58d5a383e47973c3d3945277f0115c86f3.tar.bz2
[OpenMP][AMDGPU] Add Envar for controlling HSA busy queue tracking
If the Envar is set to true (default), busy HSA queues will be actively avoided when assigning a queue to a Stream. Otherwise, we will initialize a new HSA queue for each requested Stream, then default to round robin once the set maximum has been reached. Reviewed By: jdoerfert, kevinsala Differential Revision: https://reviews.llvm.org/D156996
-rw-r--r--openmp/docs/design/Runtimes.rst12
-rw-r--r--openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp68
2 files changed, 51 insertions, 29 deletions
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index e681252..44c29fd 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1175,6 +1175,7 @@ There are several environment variables to change the behavior of the plugins:
* ``LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS``
* ``LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES``
* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE``
+* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING``
* ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU``
* ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES``
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
@@ -1231,6 +1232,17 @@ plugin. The size is the number of AQL packets an HSA queue is expected to hold.
It is also the number of AQL packets that can be pushed into each queue without
waiting for the driver to process them. The default value is ``512``.
+LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING
+"""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable controls whether idle HSA queues are preferentially
+assigned to streams, for example when they are requested for a kernel launch.
+If all initialized queues are busy, a new queue is initialized and returned
+until the set maximum is reached; after that, the least utilized queue is
+selected. If this is disabled, utilization is ignored: a new HSA queue is
+initialized for each requested stream until the maximum is reached, and
+queues are then selected in round-robin order. The default value is ``true``.
+
.. _libomptarget_amdgpu_teams_per_cu:
LIBOMPTARGET_AMDGPU_TEAMS_PER_CU
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 3d38e0f..8da4b1f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -594,13 +594,16 @@ struct AMDGPUQueueTy {
return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
}
- /// Returns if this queue is considered busy
- bool isBusy() const { return NumUsers > 0; }
+ /// Returns the number of streams this queue is currently assigned to.
+ uint32_t getUserCount() const { return NumUsers; }
- /// Decrement user count of the queue object
+ /// Returns whether the underlying HSA queue is initialized.
+ bool isInitialized() const { return Queue != nullptr; }
+
+ /// Decrement user count of the queue object.
void removeUser() { --NumUsers; }
- /// Increase user count of the queue object
+ /// Increase user count of the queue object.
void addUser() { ++NumUsers; }
/// Push a kernel launch to the queue. The kernel launch requires an output
@@ -784,8 +787,9 @@ private:
/// atomic operations. We can further investigate it if this is a bottleneck.
std::mutex Mutex;
- /// Indicates that the queue is busy when > 0
- int NumUsers;
+ /// The number of streams this queue is currently assigned to. A queue is
+ /// considered idle when this is zero, and busy otherwise.
+ uint32_t NumUsers;
};
/// Struct that implements a stream of asynchronous operations for AMDGPU
@@ -1451,7 +1455,9 @@ struct AMDGPUStreamManagerTy final
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
- : GenericDeviceResourceManagerTy(Device), NextQueue(0), Agent(HSAAgent) {}
+ : GenericDeviceResourceManagerTy(Device),
+ OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
+ NextQueue(0), Agent(HSAAgent) {}
Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
@@ -1493,35 +1499,39 @@ struct AMDGPUStreamManagerTy final
private:
/// Search for and assign a preferably idle queue to the given Stream. If
- /// there is no queue without current users, resort to round robin selection.
+ /// there is no queue without current users, choose the queue with the lowest
+ /// user count. If utilization is ignored: use round robin selection.
inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
- uint32_t StartIndex = NextQueue % MaxNumQueues;
- AMDGPUQueueTy *Q = nullptr;
-
- for (int i = 0; i < MaxNumQueues; ++i) {
- Q = &Queues[StartIndex++];
- if (StartIndex == MaxNumQueues)
- StartIndex = 0;
-
- if (Q->isBusy())
- continue;
- else {
- if (auto Err = Q->init(Agent, QueueSize))
- return Err;
-
- Q->addUser();
- Stream->Queue = Q;
- return Plugin::success();
+ // Start from zero when tracking utilization, otherwise: round robin policy.
+ uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
+
+ if (OMPX_QueueTracking) {
+ // Find the least used queue.
+ for (uint32_t I = 0; I < MaxNumQueues; ++I) {
+ // Early exit when an initialized queue is idle.
+ if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
+ Index = I;
+ break;
+ }
+
+ // Update the least used queue.
+ if (Queues[Index].getUserCount() > Queues[I].getUserCount())
+ Index = I;
}
}
- // All queues busy: Round robin (StartIndex has the initial value again)
- Queues[StartIndex].addUser();
- Stream->Queue = &Queues[StartIndex];
- ++NextQueue;
+ // Make sure the queue is initialized, then add user & assign.
+ if (auto Err = Queues[Index].init(Agent, QueueSize))
+ return Err;
+ Queues[Index].addUser();
+ Stream->Queue = &Queues[Index];
+
return Plugin::success();
}
+ /// Envar for controlling the tracking of busy HSA queues.
+ BoolEnvar OMPX_QueueTracking;
+
/// The next queue index to use for round robin selection.
uint32_t NextQueue;