author    Tobias Burnus <tobias@codesourcery.com>    2022-10-24 16:58:43 +0200
committer Tobias Burnus <tobias@codesourcery.com>    2022-10-24 17:04:08 +0200
commit    131d18e928a3ea1ab2d3bf61aa92d68a8a254609 (patch)
tree      9c379ef9c639a56d0b1146aada7cef937328a89e /libgomp/plugin
parent    a096036589d82175a0f729c2dab73c9a527d075d (diff)
libgomp/nvptx: Prepare for reverse-offload callback handling
This patch adds a stub 'gomp_target_rev' in the host's target.c, which will
later handle the reverse offload.
For nvptx, it adds support for forwarding the offloaded GOMP_target_ext call
to the host: the device sets values in a shared struct, the host polls that
struct and invokes gomp_target_rev on the result.
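
The struct used for this handshake lives in the new file
config/nvptx/libgomp-nvptx.h, which is outside the 'libgomp/plugin' slice of
the diff shown below. The following is only a sketch of its layout, inferred
from the field accesses in GOMP_OFFLOAD_run; field order and exact types are
an assumption:

```c
/* Sketch only: the authoritative definition is in the new
   config/nvptx/libgomp-nvptx.h (not part of this diff).  The struct is
   allocated with cuMemHostAlloc (CU_MEMHOSTALLOC_DEVICEMAP), so host and
   device access the same memory.  */
#include <stdint.h>

struct rev_offload
{
  uint64_t fn;      /* Host function; nonzero doubles as the "request
                       pending" flag: set on the device, cleared by the
                       host once gomp_target_rev has run.  */
  uint64_t mapnum;  /* Number of map entries.  */
  uint64_t addrs;   /* Address of the addresses array.  */
  uint64_t sizes;   /* Address of the sizes array.  */
  uint64_t kinds;   /* Address of the map-kinds array.  */
  int32_t dev_num;  /* Device number of the requesting device.  */
};
```

Because the struct sits in device-mapped pinned host memory, a plain store on
one side is visible to the other without an explicit copy; ordering is
enforced by the acquire/release atomics visible in the diff below.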
include/ChangeLog:
* cuda/cuda.h (enum CUdevice_attribute): Add
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
(CU_MEMHOSTALLOC_DEVICEMAP): Define.
(cuMemHostAlloc): Add prototype.
libgomp/ChangeLog:
* config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Remove
'static' for this variable.
* config/nvptx/libgomp-nvptx.h: New file.
* config/nvptx/target.c: Include it.
(GOMP_ADDITIONAL_ICVS): Declare extern var.
(GOMP_REV_OFFLOAD_VAR): Declare var.
(GOMP_target_ext): Handle reverse offload.
* libgomp-plugin.h (GOMP_PLUGIN_target_rev): New prototype.
* libgomp-plugin.c (GOMP_PLUGIN_target_rev): New, call ...
* target.c (gomp_target_rev): ... this new stub function.
* libgomp.h (gomp_target_rev): Declare.
* libgomp.map (GOMP_PLUGIN_1.4): New; add GOMP_PLUGIN_target_rev.
* plugin/cuda-lib.def (cuMemHostAlloc): Add.
* plugin/plugin-nvptx.c: Include libgomp-nvptx.h.
(struct ptx_device): Add rev_data member.
(nvptx_open_device): Remove async_engines query, last used in
r10-304-g1f4c5b9b; add unified-address assert check.
(GOMP_OFFLOAD_get_num_devices): Claim unified address
support.
(GOMP_OFFLOAD_load_image): Free rev_fn_table if no
offload functions exist. Make offload var available
on host and device.
(rev_off_dev_to_host_cpy, rev_off_host_to_dev_cpy): New.
(GOMP_OFFLOAD_run): Handle reverse offload.
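
The device side of this handshake (the config/nvptx/target.c entries above)
is also outside the diff shown below. As a rough sketch of what "Handle
reverse offload" in GOMP_target_ext amounts to, assuming the struct layout
sketched earlier; the helper name is hypothetical, and the lock that
serializes concurrent requests is omitted for brevity:

```c
#include <stdint.h>
#include <stddef.h>

struct rev_offload  /* As sketched above.  */
{
  uint64_t fn, mapnum, addrs, sizes, kinds;
  int32_t dev_num;
};

/* Filled in by the plugin via cuMemcpyHtoD; see GOMP_OFFLOAD_load_image.  */
extern struct rev_offload *GOMP_REV_OFFLOAD_VAR;

/* Hypothetical helper showing the shape of the device-side forwarding.  */
static void
request_reverse_offload (void (*fn) (void *), size_t mapnum, void **hostaddrs,
                         size_t *sizes, unsigned short *kinds, int dev_num)
{
  GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
  GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
  GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
  GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
  GOMP_REV_OFFLOAD_VAR->dev_num = dev_num;

  /* Store 'fn' last: the release store publishes the request to the host,
     which polls for fn != 0 in GOMP_OFFLOAD_run.  */
  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->fn, (uint64_t) fn,
                    __ATOMIC_RELEASE);

  /* The host resets fn to 0 after gomp_target_rev has finished.  */
  while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
    ;  /* Spin; the host checks between cuStreamQuery calls.  */
}
```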
Diffstat (limited to 'libgomp/plugin')
 libgomp/plugin/cuda-lib.def   |   1
 libgomp/plugin/plugin-nvptx.c | 107
 2 files changed, 98 insertions(+), 10 deletions(-)
```diff
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index cd91b39..dff42d6 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -29,6 +29,7 @@ CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
 CUDA_ONE_CALL (cuLinkDestroy)
 CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
+CUDA_ONE_CALL (cuMemHostAlloc)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index ba6b229..ad057ed 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -40,6 +40,9 @@
 #include "gomp-constants.h"
 #include "oacc-int.h"
 
+/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR.  */
+#include "config/nvptx/libgomp-nvptx.h"
+
 #include <pthread.h>
 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
 # include "cuda/cuda.h"
@@ -329,6 +332,7 @@ struct ptx_device
     pthread_mutex_t lock;
   } omp_stacks;
 
+  struct rev_offload *rev_data;
   struct ptx_device *next;
 };
 
@@ -423,7 +427,7 @@ nvptx_open_device (int n)
   struct ptx_device *ptx_dev;
   CUdevice dev, ctx_dev;
   CUresult r;
-  int async_engines, pi;
+  int pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 
@@ -519,10 +523,12 @@ nvptx_open_device (int n)
                     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
   ptx_dev->max_threads_per_multiprocessor = pi;
 
-  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
-                         CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
-  if (r != CUDA_SUCCESS)
-    async_engines = 1;
+  /* Required below for reverse offload as implemented, but with compute
+     capability >= 2.0 and 64-bit device processes, this should universally
+     be the case; hence, an assert.  */
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+                         CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+  assert (r == CUDA_SUCCESS && pi);
 
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
@@ -1179,8 +1185,10 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
 {
   int num_devices = nvptx_get_num_devices ();
   /* Return -1 if the omp_requires_mask cannot be fulfilled but
-     devices were present.  */
-  if (num_devices > 0 && omp_requires_mask != 0)
+     devices were present.  Unified-shared address: see comment in
+     nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
+  if (num_devices > 0
+      && (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
     return -1;
   return num_devices;
 }
@@ -1380,7 +1388,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   else if (rev_fn_table)
     {
       CUdeviceptr var;
-      size_t bytes;
+      size_t bytes, i;
       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
                              "$offload_func_table");
       if (r != CUDA_SUCCESS)
@@ -1390,6 +1398,37 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
+      /* Free if only NULL entries.  */
+      for (i = 0; i < fn_entries; ++i)
+        if ((*rev_fn_table)[i] != 0)
+          break;
+      if (i == fn_entries)
+        {
+          free (*rev_fn_table);
+          *rev_fn_table = NULL;
+        }
+    }
+
+  if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
+    {
+      /* cuMemHostAlloc memory is accessible on the device, if unified-shared
+         address is supported; this is assumed - see comment in
+         nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
+      CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
+                        sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
+      CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
+      CUdeviceptr device_rev_offload_var;
+      size_t device_rev_offload_size;
+      CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
+                                      &device_rev_offload_var,
+                                      &device_rev_offload_size, module,
+                                      XSTRING (GOMP_REV_OFFLOAD_VAR));
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
+                             sizeof (dp));
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
     }
 
   nvptx_set_clocktick (module, dev);
@@ -2001,6 +2040,23 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
   return (void *) ptx_dev->omp_stacks.ptr;
 }
 
+
+void
+rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
+                         CUstream stream)
+{
+  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
+  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
+void
+rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
+                         CUstream stream)
+{
+  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
+  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
 void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
@@ -2035,6 +2091,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
+  bool reverse_offload = ptx_dev->rev_data != NULL;
+  CUstream copy_stream = NULL;
 
   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2048,12 +2106,41 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                      " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                      __FUNCTION__, fn_name, teams, threads);
+  if (reverse_offload)
+    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                          32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
-
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+  if (reverse_offload)
+    while (true)
+      {
+        r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
+        if (r == CUDA_SUCCESS)
+          break;
+        if (r == CUDA_ERROR_LAUNCH_FAILED)
+          GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
+                             maybe_abort_msg);
+        else if (r != CUDA_ERROR_NOT_READY)
+          GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+        if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+          {
+            struct rev_offload *rev_data = ptx_dev->rev_data;
+            GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
+                                    rev_data->addrs, rev_data->sizes,
+                                    rev_data->kinds, rev_data->dev_num,
+                                    rev_off_dev_to_host_cpy,
+                                    rev_off_host_to_dev_cpy, copy_stream);
+            CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+          }
+        usleep (1);
+      }
+  else
+    r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+  if (reverse_offload)
+    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                        maybe_abort_msg);
```
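
For orientation, this is the kind of OpenMP construct the machinery above
prepares for. The example is illustrative and not part of the commit; with
this patch alone it does not execute anything yet, since the host-side
gomp_target_rev is still a stub:

```c
/* Illustrative only: relies on the later gomp_target_rev implementation.  */
#include <stdio.h>

#pragma omp requires reverse_offload

int
main ()
{
  #pragma omp target  /* Runs on the device (e.g. nvptx).  */
  for (int i = 0; i < 4; i++)
    {
      /* Reverse offload: the nested region runs back on the host.  The
         device fills GOMP_REV_OFFLOAD_VAR; the polling loop in
         GOMP_OFFLOAD_run sees fn != 0 and calls GOMP_PLUGIN_target_rev.  */
      #pragma omp target device (ancestor : 1) firstprivate (i)
        printf ("executed on the host: %d\n", i);
    }
  return 0;
}
```

Note that a single shared struct can carry only one request at a time, which
is why the device side must spin until the host clears 'fn' before issuing
the next reverse-offload region.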