author     Tobias Burnus <tobias@codesourcery.com>  2022-10-24 16:58:43 +0200
committer  Tobias Burnus <tobias@codesourcery.com>  2022-10-24 17:04:08 +0200
commit     131d18e928a3ea1ab2d3bf61aa92d68a8a254609 (patch)
tree       9c379ef9c639a56d0b1146aada7cef937328a89e /libgomp/plugin
parent     a096036589d82175a0f729c2dab73c9a527d075d (diff)
libgomp/nvptx: Prepare for reverse-offload callback handling
This patch adds a stub 'gomp_target_rev' in the host's target.c, which will
later handle the reverse offload.
For nvptx, it adds support for forwarding the offload gomp_target_ext call
to the host by setting values in a struct on the device and querying it on
the host - invoking gomp_target_rev on the result.

include/ChangeLog:

        * cuda/cuda.h (enum CUdevice_attribute): Add
        CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
        (CU_MEMHOSTALLOC_DEVICEMAP): Define.
        (cuMemHostAlloc): Add prototype.

libgomp/ChangeLog:

        * config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Remove
        'static' for this variable.
        * config/nvptx/libgomp-nvptx.h: New file.
        * config/nvptx/target.c: Include it.
        (GOMP_ADDITIONAL_ICVS): Declare extern var.
        (GOMP_REV_OFFLOAD_VAR): Declare var.
        (GOMP_target_ext): Handle reverse offload.
        * libgomp-plugin.h (GOMP_PLUGIN_target_rev): New prototype.
        * libgomp-plugin.c (GOMP_PLUGIN_target_rev): New, call ...
        * target.c (gomp_target_rev): ... this new stub function.
        * libgomp.h (gomp_target_rev): Declare.
        * libgomp.map (GOMP_PLUGIN_1.4): New; add GOMP_PLUGIN_target_rev.
        * plugin/cuda-lib.def (cuMemHostAlloc): Add.
        * plugin/plugin-nvptx.c: Include libgomp-nvptx.h.
        (struct ptx_device): Add rev_data member.
        (nvptx_open_device): Remove async_engines query, last used in
        r10-304-g1f4c5b9b; add unified-address assert check.
        (GOMP_OFFLOAD_get_num_devices): Claim unified address support.
        (GOMP_OFFLOAD_load_image): Free rev_fn_table if no offload
        functions exist.  Make offload var available on host and device.
        (rev_off_dev_to_host_cpy, rev_off_host_to_dev_cpy): New.
        (GOMP_OFFLOAD_run): Handle reverse offload.
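[Editor's note] The handshake the commit message describes relies on a small
descriptor shared between device and host.  The new
config/nvptx/libgomp-nvptx.h lies outside this diffstat, but from the fields
that GOMP_OFFLOAD_run accesses below (fn, mapnum, addrs, sizes, kinds,
dev_num), a plausible sketch of the struct is the following; the exact types
are assumptions, not the header's verbatim contents:

/* Plausible sketch of the shared descriptor in config/nvptx/libgomp-nvptx.h;
   field names follow the accesses in GOMP_OFFLOAD_run below, the exact
   types are assumptions.  */
#include <stdint.h>

struct rev_offload
{
  uint64_t fn;      /* Device stores the host function address here to signal
                       a request; the host resets it to 0 when finished.  */
  uint64_t mapnum;  /* Number of map entries.  */
  uint64_t addrs;   /* Address of the addresses array.  */
  uint64_t sizes;   /* Address of the sizes array.  */
  uint64_t kinds;   /* Address of the map-kinds array.  */
  int32_t dev_num;  /* Device number of the requesting device.  */
};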
Diffstat (limited to 'libgomp/plugin')
-rw-r--r--  libgomp/plugin/cuda-lib.def   |   1
-rw-r--r--  libgomp/plugin/plugin-nvptx.c | 107
2 files changed, 98 insertions, 10 deletions
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index cd91b39..dff42d6 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -29,6 +29,7 @@ CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
CUDA_ONE_CALL (cuLinkDestroy)
CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
+CUDA_ONE_CALL (cuMemHostAlloc)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
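[Editor's note] cuda-lib.def is an X-macro list: plugin-nvptx.c includes it
several times with different definitions of CUDA_ONE_CALL and
CUDA_ONE_CALL_MAYBE_NULL, so the one-line addition of cuMemHostAlloc is all
that is needed to route the new entry point through the plugin's loading
machinery.  A minimal sketch of the pattern follows; the struct and loader
bodies are illustrative, not the plugin's actual definitions:

/* Minimal sketch of the X-macro pattern behind cuda-lib.def; the struct and
   loader below are illustrative, not the plugin's actual definitions.  */
#include <dlfcn.h>
#include <stdbool.h>
#include "cuda/cuda.h"

struct cuda_lib_s
{
#define CUDA_ONE_CALL(call) __typeof (call) *call;
#define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL (call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL
#undef CUDA_ONE_CALL_MAYBE_NULL
} cuda_lib;

/* Resolve every listed entry point from the CUDA driver library; only the
   MAYBE_NULL ones are allowed to be missing.  */
static bool
cuda_lib_init (void *handle)
{
#define CUDA_ONE_CALL(call) \
  if (!(cuda_lib.call = (__typeof (call) *) dlsym (handle, #call))) \
    return false;
#define CUDA_ONE_CALL_MAYBE_NULL(call) \
  cuda_lib.call = (__typeof (call) *) dlsym (handle, #call);
#include "cuda-lib.def"
#undef CUDA_ONE_CALL
#undef CUDA_ONE_CALL_MAYBE_NULL
  return true;
}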
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index ba6b229..ad057ed 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -40,6 +40,9 @@
#include "gomp-constants.h"
#include "oacc-int.h"
+/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
+#include "config/nvptx/libgomp-nvptx.h"
+
#include <pthread.h>
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
# include "cuda/cuda.h"
@@ -329,6 +332,7 @@ struct ptx_device
pthread_mutex_t lock;
} omp_stacks;
+ struct rev_offload *rev_data;
struct ptx_device *next;
};
@@ -423,7 +427,7 @@ nvptx_open_device (int n)
struct ptx_device *ptx_dev;
CUdevice dev, ctx_dev;
CUresult r;
- int async_engines, pi;
+ int pi;
CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
@@ -519,10 +523,12 @@ nvptx_open_device (int n)
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
ptx_dev->max_threads_per_multiprocessor = pi;
- r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
- CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
- if (r != CUDA_SUCCESS)
- async_engines = 1;
+  /* Required below for reverse offload as implemented, but with compute
+     capability >= 2.0 and 64-bit device processes, this should universally
+     be the case; hence, an assert.  */
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+ assert (r == CUDA_SUCCESS && pi);
for (int i = 0; i != GOMP_DIM_MAX; i++)
ptx_dev->default_dims[i] = 0;
@@ -1179,8 +1185,10 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
int num_devices = nvptx_get_num_devices ();
   /* Return -1 if the omp_requires_mask cannot be fulfilled but
-     devices were present.  */
-  if (num_devices > 0 && omp_requires_mask != 0)
+     devices were present.  Unified-shared address: see comment in
+     nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
+  if (num_devices > 0
+      && (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
return -1;
return num_devices;
}
@@ -1380,7 +1388,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
else if (rev_fn_table)
{
CUdeviceptr var;
- size_t bytes;
+ size_t bytes, i;
r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
"$offload_func_table");
if (r != CUDA_SUCCESS)
@@ -1390,6 +1398,37 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
+ /* Free the table if it contains only NULL entries. */
+ for (i = 0; i < fn_entries; ++i)
+ if ((*rev_fn_table)[i] != 0)
+ break;
+ if (i == fn_entries)
+ {
+ free (*rev_fn_table);
+ *rev_fn_table = NULL;
+ }
+ }
+
+ if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
+ {
+ /* Memory allocated with cuMemHostAlloc is accessible on the device if
+ unified addressing is supported; this is assumed here - see the comment
+ in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
+ CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
+ sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
+ CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
+ CUdeviceptr device_rev_offload_var;
+ size_t device_rev_offload_size;
+ CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
+ &device_rev_offload_var,
+ &device_rev_offload_size, module,
+ XSTRING (GOMP_REV_OFFLOAD_VAR));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r));
+ r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
+ sizeof (dp));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
nvptx_set_clocktick (module, dev);
@@ -2001,6 +2040,23 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
return (void *) ptx_dev->omp_stacks.ptr;
}
+
+void
+rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
+ CUstream stream)
+{
+ CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
+ CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
+void
+rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
+ CUstream stream)
+{
+ CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
+ CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
@@ -2035,6 +2091,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
size_t stack_size = nvptx_stacks_size ();
+ bool reverse_offload = ptx_dev->rev_data != NULL;
+ CUstream copy_stream = NULL;
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2048,12 +2106,41 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
+ if (reverse_offload)
+ CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
32, threads, 1, 0, NULL, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
-
- r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+ if (reverse_offload)
+ while (true)
+ {
+ r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
+ if (r == CUDA_SUCCESS)
+ break;
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_ERROR_NOT_READY)
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+ if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+ {
+ struct rev_offload *rev_data = ptx_dev->rev_data;
+ GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
+ rev_data->addrs, rev_data->sizes,
+ rev_data->kinds, rev_data->dev_num,
+ rev_off_dev_to_host_cpy,
+ rev_off_host_to_dev_cpy, copy_stream);
+ CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+ __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+ }
+ usleep (1);
+ }
+ else
+ r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+ if (reverse_offload)
+ CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
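
[Editor's note] The device side of this handshake lives in
config/nvptx/target.c and is therefore not part of this diffstat.  Mirroring
the host-side polling loop above, and using the struct rev_offload sketched
after the commit message, a simplified sketch of what the nvptx
GOMP_target_ext does is given below; this is not the verbatim implementation,
and the host-fallback paths, locking, and the exact ICV access are omitted or
assumed:

/* Simplified sketch of the device-side handshake in the nvptx
   GOMP_target_ext (config/nvptx/target.c).  */
#include <stdint.h>
#include <stddef.h>

extern struct rev_offload *GOMP_REV_OFFLOAD_VAR;  /* Set by the plugin via
                                                     cuMemcpyHtoD above.  */
extern int GOMP_DEVICE_NUM_VAR;  /* Per the ChangeLog, no longer static;
                                    assumed to expand to an int variable.  */

void
GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
                 void **hostaddrs, size_t *sizes, unsigned short *kinds,
                 unsigned int flags, void **depend, void **args)
{
  (void) device; (void) flags; (void) depend; (void) args;  /* Unused here. */

  struct rev_offload *rev = GOMP_REV_OFFLOAD_VAR;

  rev->mapnum = mapnum;
  rev->addrs = (uint64_t) (uintptr_t) hostaddrs;
  rev->sizes = (uint64_t) (uintptr_t) sizes;
  rev->kinds = (uint64_t) (uintptr_t) kinds;
  rev->dev_num = GOMP_DEVICE_NUM_VAR;

  /* Publish the request; a nonzero 'fn' is what GOMP_OFFLOAD_run polls for
     on the host.  */
  __atomic_store_n (&rev->fn, (uint64_t) (uintptr_t) fn, __ATOMIC_RELEASE);

  /* Spin until the host has executed the region and reset 'fn' to zero.  */
  while (__atomic_load_n (&rev->fn, __ATOMIC_ACQUIRE) != 0)
    ;  /* Busy-wait; the host meanwhile polls via cuStreamQuery.  */
}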