about summary refs log tree commit diff
path: root/offload/plugins-nextgen/amdgpu/src/rtl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'offload/plugins-nextgen/amdgpu/src/rtl.cpp')
-rw-r--r-- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 70
1 file changed, 57 insertions(+), 13 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b7bfa89..7961820 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2232,16 +2232,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Get the stream of the asynchronous info structure or get a new one.
Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
AMDGPUStreamTy *&Stream) {
- // Get the stream (if any) from the async info.
- Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
- if (!Stream) {
- // There was no stream; get an idle one.
- if (auto Err = AMDGPUStreamManager.getResource(Stream))
- return Err;
-
- // Modify the async info's stream.
- AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
- }
+ auto WrapperStream =
+ AsyncInfoWrapper.getOrInitQueue<AMDGPUStreamTy *>(AMDGPUStreamManager);
+ if (!WrapperStream)
+ return WrapperStream.takeError();
+ Stream = *WrapperStream;
return Plugin::success();
}
@@ -2296,7 +2291,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
/// Synchronize current thread with the pending operations on the async info.
- Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+ Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+ bool ReleaseQueue) override {
AMDGPUStreamTy *Stream =
reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
assert(Stream && "Invalid stream");
@@ -2307,8 +2303,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
// own tasks.
- AsyncInfo.Queue = nullptr;
- return AMDGPUStreamManager.returnResource(Stream);
+ if (ReleaseQueue) {
+ AsyncInfo.Queue = nullptr;
+ return AMDGPUStreamManager.returnResource(Stream);
+ }
+ return Plugin::success();
}
/// Query for the completion of the pending operations on the async info.
@@ -2591,6 +2590,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Event->wait(*Stream);
}
+ Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
+ auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
+ if (!Stream)
+ return false;
+
+ auto Query = Stream->query();
+ if (Query)
+ return !*Query;
+ return Query.takeError();
+ }
+
/// Synchronize the current thread with the event.
Error syncEventImpl(void *EventPtr) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
@@ -2945,6 +2955,40 @@ private:
return Plugin::success();
}
+ bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ if (!Pool->isGlobal() || !Pool->isCoarseGrained())
+ continue;
+ uint64_t Value;
+ hsa_status_t Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+ if (Status != HSA_STATUS_SUCCESS)
+ continue;
+ constexpr uint64_t Almost64Gig = 0xFF0000000;
+ if (Value >= Almost64Gig)
+ return true;
+ }
+ return false; // CoarseGrain pool w/ 64GB or more capacity not found
+ }
+
+ size_t getMemoryManagerSizeThreshold() override {
+ // Targeting high memory capacity GPUs such as
+ // data center GPUs.
+ if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
+ // Set GenericDeviceTy::MemoryManager's Threshold to 3GiB,
+ // if threshold is not already set by ENV var
+ // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+ // This MemoryManager is used for omp_target_alloc(), OpenMP
+ // (non-usm) map clause, etc.
+ //
+ // Ideally, this kind of pooling is best performed at
+ // a common level (e.g, user side of HSA) between OpenMP and HIP
+ // but that feature does not exist (yet).
+ return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
+ }
+ return 0;
+ }
+
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;