aboutsummaryrefslogtreecommitdiff
path: root/openmp/libomptarget
diff options
context:
space:
mode:
Diffstat (limited to 'openmp/libomptarget')
-rw-r--r--openmp/libomptarget/include/omptarget.h4
-rw-r--r--openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp2
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp7
-rw-r--r--openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp3
-rw-r--r--openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h2
-rw-r--r--openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp27
-rw-r--r--openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp1
-rw-r--r--openmp/libomptarget/test/libc/malloc.c10
8 files changed, 50 insertions, 6 deletions
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 476a158..d5602ee 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
- TARGET_ALLOC_DEFAULT
+ TARGET_ALLOC_DEFAULT,
+ /// The allocation will not block on other streams.
+ TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index fe435a3..0411c67 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
@@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
index 60e0540..54aced1 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
@@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to initialize RPC server for device %d: %d", DeviceId, Err);
// Register a custom opcode handler to perform plugin specific allocation.
- // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
auto MallocHandler = [](rpc_port_t Port, void *Data) {
rpc_recv_and_send(
Port,
[](rpc_buffer_t *Buffer, void *Data) {
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Buffer->data[0] = reinterpret_cast<uintptr_t>(
- Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+ Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
},
Data);
};
@@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE);
+ TARGET_ALLOC_DEVICE_NON_BLOCKING);
},
Data);
};
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 56c4404..5ec3adb 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11)
DLWRAP(cuMemAlloc, 2)
DLWRAP(cuMemAllocHost, 2)
DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
DLWRAP(cuMemcpyDtoDAsync, 4)
DLWRAP(cuMemcpyDtoH, 3)
@@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
DLWRAP(cuModuleGetFunction, 3)
DLWRAP(cuModuleGetGlobal, 4)
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e03077..32031c2 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index b0dff91..0005bff 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option) {}
#endif
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
/// Create the CUDA image with the id and the target image pointer.
@@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+ break;
+ cuStreamSynchronize(Stream);
+ Res = cuStreamDestroy(Stream);
+ MemAlloc = (void *)DevicePtr;
+ }
}
if (auto Err =
@@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ cuStreamSynchronize(Stream);
+ if ((Res = cuStreamDestroy(Stream)))
+ break;
+ }
}
if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 88b5236..43569f2 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
index c18a724..b587b61 100644
--- a/openmp/libomptarget/test/libc/malloc.c
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@ int main() {
unsigned *d_x;
#pragma omp target map(from : d_x)
{
- d_x = malloc(sizeof(unsigned));
+ d_x = (unsigned *)malloc(sizeof(unsigned));
*d_x = 1;
}
@@ -23,6 +23,14 @@ int main() {
#pragma omp target is_device_ptr(d_x)
{ free(d_x); }
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+ {
+ int *ptr = (int *)malloc(sizeof(int));
+ *ptr = 42;
+ free(ptr);
+ }
+
// CHECK: PASS
if (h_x == 1)
fputs("PASS\n", stdout);