Diffstat (limited to 'offload/plugins-nextgen/amdgpu/src')
-rw-r--r--  offload/plugins-nextgen/amdgpu/src/rtl.cpp | 160
1 file changed, 144 insertions, 16 deletions
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b7bfa89..b07086d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+  /// Return the maximum block size that still allows maximum occupancy.
+  ///
+  /// TODO: This needs to be implemented for AMDGPU.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations for AMDGPU are not yet implemented");
+  }
+
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
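Until the TODO above is resolved, callers of maxGroupSize must be prepared to consume the UNSUPPORTED error and fall back. A minimal consumer sketch (DefaultGroupSize and the surrounding Kernel/Device objects are assumptions, not part of this patch):

    uint64_t GroupSize = DefaultGroupSize;
    if (auto SizeOrErr = Kernel.maxGroupSize(Device, /*DynamicMemSize=*/0))
      GroupSize = *SizeOrErr;
    else
      consumeError(SizeOrErr.takeError()); // Fall back until implemented.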
@@ -1063,6 +1073,20 @@ private:
/// Indicate to spread data transfers across all available SDMAs
bool UseMultipleSdmaEngines;
+  /// Wrapper function for implementing host callbacks.
+  static void CallbackWrapper(AMDGPUSignalTy *InputSignal,
+                              AMDGPUSignalTy *OutputSignal,
+                              void (*Callback)(void *), void *UserData) {
+    // The wait should not fail in this context; treat any failure as fatal.
+    if (InputSignal)
+      if (auto Err = InputSignal->wait())
+        reportFatalInternalError(std::move(Err));
+
+    Callback(UserData);
+
+    OutputSignal->signal();
+  }
+
/// Return the current number of asynchronous operations on the stream.
uint32_t size() const { return NextSlot; }
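CallbackWrapper gives host callbacks the same chaining contract as device work: block on the input signal (the previous slot's output), run the callback, then fire the output signal so later operations can proceed. A standalone analogy with std::future/std::promise standing in for HSA signals (illustrative only; AMDGPUSignalTy is the real type):

    #include <future>

    static void runHostCallback(std::future<void> Input,
                                std::promise<void> Output,
                                void (*Callback)(void *), void *UserData) {
      Input.wait();       // The previous stream slot has completed.
      Callback(UserData); // Run the user-provided host function.
      Output.set_value(); // Satisfy the next slot's input dependency.
    }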
@@ -1495,6 +1519,31 @@ public:
OutputSignal->get());
}
+  Error pushHostCallback(void (*Callback)(void *), void *UserData) {
+    // Retrieve an available signal for the operation's output.
+    AMDGPUSignalTy *OutputSignal = nullptr;
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    AMDGPUSignalTy *InputSignal;
+    {
+      std::lock_guard<std::mutex> Lock(Mutex);
+
+      // Consume stream slot and compute dependencies.
+      InputSignal = consume(OutputSignal).second;
+    }
+
+    // "Leaking" the thread here is consistent with other work added to the
+    // queue. The input and output signals will remain valid until the output
+    // is signaled.
+    std::thread(CallbackWrapper, InputSignal, OutputSignal, Callback, UserData)
+        .detach();
+
+    return Plugin::success();
+  }
+
/// Synchronize with the stream. The current thread waits until all operations
/// are finalized and it performs the pending post actions (i.e., releasing
/// intermediate buffers).
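A hypothetical use of pushHostCallback, flipping a host-side flag once all previously enqueued stream operations have finished (notifyDone and Done are illustrative, not part of the patch):

    static void notifyDone(void *Data) {
      static_cast<std::atomic<bool> *>(Data)->store(true);
    }

    std::atomic<bool> Done{false};
    if (auto Err = Stream->pushHostCallback(notifyDone, &Done))
      return Err; // E.g., no output signal could be acquired.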
@@ -2232,16 +2281,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Get the stream of the asynchronous info structure or get a new one.
Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
AMDGPUStreamTy *&Stream) {
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = AMDGPUStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
-    }
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<AMDGPUStreamTy *>(AMDGPUStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
return Plugin::success();
}
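The removed lines show what the generic helper is now expected to do. A sketch of the presumed behavior (the real getOrInitQueue lives in the common AsyncInfoWrapperTy layer):

    template <typename Ty, typename ManagerTy>
    Expected<Ty> getOrInitQueue(ManagerTy &Manager) {
      Ty Queue = getQueueAs<Ty>();
      if (!Queue) {
        // There was no queue; get an idle one and cache it in the async info.
        if (auto Err = Manager.getResource(Queue))
          return std::move(Err);
        setQueueAs<Ty>(Queue);
      }
      return Queue;
    }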
@@ -2296,7 +2340,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
/// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool ReleaseQueue) override {
AMDGPUStreamTy *Stream =
reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
assert(Stream && "Invalid stream");
@@ -2307,8 +2352,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
// own tasks.
-    AsyncInfo.Queue = nullptr;
-    return AMDGPUStreamManager.returnResource(Stream);
+    if (ReleaseQueue) {
+      AsyncInfo.Queue = nullptr;
+      return AMDGPUStreamManager.returnResource(Stream);
+    }
+    return Plugin::success();
}
/// Query for the completion of the pending operations on the async info.
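With ReleaseQueue=false the queue survives the synchronization, so the same async info can accept and synchronize further work; passing true restores the old behavior. A hypothetical caller:

    if (auto Err = Device.synchronizeImpl(AsyncInfo, /*ReleaseQueue=*/false))
      return Err;
    // ... enqueue more operations on the still-valid AsyncInfo.Queue ...
    if (auto Err = Device.synchronizeImpl(AsyncInfo, /*ReleaseQueue=*/true))
      return Err; // Final sync; the stream returns to the pool.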
@@ -2538,6 +2586,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for AMDGPU devices, since operations
+  /// inserted into a queue execute in order.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
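Even as a no-op, dataFence preserves the portable contract that callers may rely on: data operations enqueued before the fence are visible to operations enqueued after it. A sketch of such a caller (dataSubmit stands in for the plugin's data-transfer entry point; names are approximate):

    if (auto Err = Device.dataSubmit(DevPtr, HstPtr, Size, AsyncInfo))
      return Err;
    if (auto Err = Device.dataFence(AsyncInfo)) // No-op here: in-order queues.
      return Err;
    // Operations enqueued from here on observe the submitted data.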
@@ -2554,6 +2609,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    return Stream->pushHostCallback(Callback, UserData);
+  }
+
/// Create an event.
Error createEventImpl(void **EventPtrStorage) override {
AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
@@ -2591,6 +2655,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Event->wait(*Stream);
}
+  Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
+    auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
+    if (!Stream)
+      return false;
+
+    auto Query = Stream->query();
+    if (Query)
+      return !*Query;
+    return Query.takeError();
+  }
+
/// Synchronize the current thread with the event.
Error syncEventImpl(void *EventPtr) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
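A missing queue means nothing was ever enqueued, and a completed stream query is the negation of "pending", hence the `!*Query`. A hypothetical consumer, deciding whether an async info can be torn down (releaseResources is illustrative):

    auto Pending = Device.hasPendingWorkImpl(AsyncInfo);
    if (!Pending)
      return Pending.takeError();
    if (!*Pending)
      releaseResources(AsyncInfo); // Nothing in flight; safe to clean up.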
@@ -2632,6 +2707,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (Status == HSA_STATUS_SUCCESS)
Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR);
+ Info.add("Vendor ID", uint64_t{4130}, "", DeviceInfo::VENDOR_ID);
+
+ hsa_machine_model_t MachineModel;
+ Status = getDeviceAttrRaw(HSA_AGENT_INFO_MACHINE_MODEL, MachineModel);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Memory Address Size",
+ uint64_t{MachineModel == HSA_MACHINE_MODEL_SMALL ? 32u : 64u},
+ "bits", DeviceInfo::ADDRESS_BITS);
+
hsa_device_type_t DevType;
Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
if (Status == HSA_STATUS_SUCCESS) {
@@ -2682,11 +2766,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Max Clock Freq", TmpUInt, "MHz");
+ Info.add("Max Clock Freq", TmpUInt, "MHz",
+ DeviceInfo::MAX_CLOCK_FREQUENCY);
+
+ Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY, TmpUInt);
+ if (Status == HSA_STATUS_SUCCESS)
+ Info.add("Max Memory Clock Freq", TmpUInt, "MHz",
+ DeviceInfo::MEMORY_CLOCK_RATE);
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
- Info.add("Compute Units", TmpUInt);
+ Info.add("Compute Units", TmpUInt, "", DeviceInfo::NUM_COMPUTE_UNITS);
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
@@ -2768,7 +2858,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
if (Status == HSA_STATUS_SUCCESS)
-      PoolNode.add("Size", TmpSt, "bytes");
+      PoolNode.add(
+          "Size", TmpSt, "bytes",
+          (Pool->isGlobal() && Pool->isCoarseGrained())
+              ? std::optional<DeviceInfo>{DeviceInfo::GLOBAL_MEM_SIZE}
+              : std::nullopt);
Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
TmpBool);
@@ -2945,6 +3039,40 @@ private:
return Plugin::success();
}
+  bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
+    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+      if (!Pool->isGlobal() || !Pool->isCoarseGrained())
+        continue;
+      uint64_t Value;
+      hsa_status_t Status =
+          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+      if (Status != HSA_STATUS_SUCCESS)
+        continue;
+      constexpr uint64_t Almost64Gig = 0xFF0000000; // 63.75 GiB
+      if (Value >= Almost64Gig)
+        return true;
+    }
+    return false; // No coarse-grained pool with ~64 GiB or more was found.
+  }
+
+  size_t getMemoryManagerSizeThreshold() override {
+    // Target high-memory-capacity GPUs, such as
+    // data center GPUs.
+    if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
+      // Set GenericDeviceTy::MemoryManager's threshold to 3 GiB unless the
+      // threshold was already set via the environment variable
+      // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+      // This MemoryManager is used for omp_target_alloc(), the OpenMP
+      // (non-USM) map clause, etc.
+      //
+      // Ideally, this kind of pooling is best performed at a common level
+      // (e.g., the user side of HSA) shared between OpenMP and HIP, but that
+      // feature does not exist (yet).
+      return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
+    }
+    return 0;
+  }
+
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;
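Two details of the memory-manager hunk are easy to verify by hand: Almost64Gig is 63.75 GiB (hence "near or above" 64 GB), and the 3 GiB default only applies when no threshold was set through the environment. A sketch (the static_asserts are exact arithmetic; EnvarThreshold and the gating glue are approximate stand-ins for the generic layer):

    // 0xFF0000000 == 255 * 256 MiB == 64 GiB - 256 MiB, i.e. 63.75 GiB.
    static_assert(0xFF0000000ull == 255ull * (1ull << 28), "63.75 GiB");
    static_assert(0xFF0000000ull == (64ull << 30) - (1ull << 28),
                  "just under 64 GiB");

    // The environment variable wins; otherwise the plugin default is used.
    size_t Threshold = EnvarThreshold ? EnvarThreshold
                                      : Device.getMemoryManagerSizeThreshold();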