about summary refs log tree commit diff
path: root/offload/plugins-nextgen/cuda/src/rtl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'offload/plugins-nextgen/cuda/src/rtl.cpp')
-rw-r--r-- offload/plugins-nextgen/cuda/src/rtl.cpp | 105
1 file changed, 81 insertions(+), 24 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index c5f3167..6aba6f8 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+  /// Return the block size that maximizes occupancy for this kernel, given
+  /// \p DynamicMemSize bytes of dynamic shared memory per block. Queries the
+  /// CUDA occupancy calculator; the suggested grid size is ignored.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
+    int MinGridSize;
+    int MaxBlockSize;
+    // No per-block-size shared-memory callback; INT_MAX places no upper bound
+    // on the block size, so the driver is free to pick the hardware limit.
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &MinGridSize, &MaxBlockSize, Func,
+        /*blockSizeToDynamicSMemSize=*/nullptr, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s"))
+      return Err;
+    return MaxBlockSize;
+  }
+
private:
/// The CUDA kernel function to execute.
CUfunction Func;
@@ -522,16 +536,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = CUDAStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
-    }
+    // Reuse the queue already attached to the async info, or atomically
+    // initialize it with an idle stream taken from the pool.
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<CUstream>(CUDAStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
    return Plugin::success();
  }
@@ -642,17 +651,20 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
  /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool ReleaseQueue) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res;
    Res = cuStreamSynchronize(Stream);
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    AsyncInfo.Queue = nullptr;
-    if (auto Err = CUDAStreamManager.returnResource(Stream))
-      return Err;
+    // Once the stream is synchronized and we want to release the queue, return
+    // it to stream pool and reset AsyncInfo. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (ReleaseQueue) {
+      // Note: the stream is recycled even if cuStreamSynchronize failed; the
+      // synchronization error is still reported below.
+      AsyncInfo.Queue = nullptr;
+      if (auto Err = CUDAStreamManager.returnResource(Stream))
+        return Err;
+    }
    return Plugin::check(Res, "error in cuStreamSynchronize: %s");
  }
@@ -858,6 +870,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for CUDA devices as operations inserted into
+  /// a queue are in-order.
+  Error dataFence(__tgt_async_info *) override { return Plugin::success(); }
+
/// Initialize the device info for interoperability purposes.
Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
assert(Context && "Context is null");
@@ -875,6 +894,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+  /// Enqueue the host function \p Callback with argument \p UserData on the
+  /// queue's stream; it runs on the host once all previously enqueued stream
+  /// operations complete.
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    CUresult Res = cuLaunchHostFunc(Stream, Callback, UserData);
+    // The error message must name the API actually called: cuLaunchHostFunc
+    // (there is no cuStreamLaunchHostFunc in the CUDA driver API).
+    return Plugin::check(Res, "error in cuLaunchHostFunc: %s");
+  }
+
/// Create an event.
Error createEventImpl(void **EventPtrStorage) override {
CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
@@ -916,6 +948,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
}
+  /// Report whether the queue's stream still has unfinished operations.
+  Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    // cuStreamQuery yields CUDA_SUCCESS when all enqueued work has completed
+    // and CUDA_ERROR_NOT_READY while operations are still in flight; any
+    // other result is a genuine driver error.
+    CUresult Ret = cuStreamQuery(Stream);
+    if (Ret == CUDA_SUCCESS)
+      return false;
+
+    if (Ret == CUDA_ERROR_NOT_READY)
+      return true;
+
+    return Plugin::check(Ret, "error in cuStreamQuery: %s");
+  }
+
/// Synchronize the current thread with the event.
Error syncEventImpl(void *EventPtr) override {
CUevent Event = reinterpret_cast<CUevent>(EventPtr);
@@ -946,13 +993,20 @@ struct CUDADeviceTy : public GenericDeviceTy {
Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR);
+ Info.add("Vendor ID", uint64_t{4318}, "", DeviceInfo::VENDOR_ID);
+
+ Info.add("Memory Address Size", std::numeric_limits<CUdeviceptr>::digits,
+ "bits", DeviceInfo::ADDRESS_BITS);
+
Res = cuDeviceTotalMem(&TmpSt, Device);
if (Res == CUDA_SUCCESS)
- Info.add("Global Memory Size", TmpSt, "bytes");
+ Info.add("Global Memory Size", TmpSt, "bytes",
+ DeviceInfo::GLOBAL_MEM_SIZE);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
if (Res == CUDA_SUCCESS)
- Info.add("Number of Multiprocessors", TmpInt);
+ Info.add("Number of Multiprocessors", TmpInt, "",
+ DeviceInfo::NUM_COMPUTE_UNITS);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
if (Res == CUDA_SUCCESS)
@@ -1013,7 +1067,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
if (Res == CUDA_SUCCESS)
- Info.add("Clock Rate", TmpInt, "kHz");
+ Info.add("Clock Rate", TmpInt / 1000, "MHz",
+ DeviceInfo::MAX_CLOCK_FREQUENCY);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
if (Res == CUDA_SUCCESS)
@@ -1050,7 +1105,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
if (Res == CUDA_SUCCESS)
- Info.add("Memory Clock Rate", TmpInt, "kHz");
+ Info.add("Memory Clock Rate", TmpInt / 1000, "MHz",
+ DeviceInfo::MEMORY_CLOCK_RATE);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
if (Res == CUDA_SUCCESS)
@@ -1314,9 +1370,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
CUresult AttrResult = cuFuncSetAttribute(
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
- return Plugin::check(
- AttrResult,
- "Error in cuLaunchKernel while setting the memory limits: %s");
+ if (auto Err = Plugin::check(
+ AttrResult,
+ "Error in cuLaunchKernel while setting the memory limits: %s"))
+ return Err;
MaxDynCGroupMemLimit = MaxDynCGroupMem;
}