aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Thomas Schwinge <thomas@codesourcery.com> 2023-03-21 16:14:16 +0100
committer Thomas Schwinge <thomas@codesourcery.com> 2023-03-24 16:40:22 +0100
commit c276fa0616eb79ddc4d0245e775a841e84cbb7dd (patch)
tree 8df4bea545719399fb38c1d8c6950b13798e8aea
parent 65037818987ffce7d6f466fa8bde13e9f59a3218 (diff)
download gcc-c276fa0616eb79ddc4d0245e775a841e84cbb7dd.zip
download gcc-c276fa0616eb79ddc4d0245e775a841e84cbb7dd.tar.gz
download gcc-c276fa0616eb79ddc4d0245e775a841e84cbb7dd.tar.bz2
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling", and
commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

	libgomp/
	* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
	'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
	* libgomp.h (gomp_target_rev): Adjust.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
	* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
	* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
	(rev_off_host_to_dev_cpy): Remove.
	(GOMP_OFFLOAD_run): Adjust.
-rw-r--r-- libgomp/ChangeLog.omp 10
-rw-r--r-- libgomp/libgomp-plugin.c 7
-rw-r--r-- libgomp/libgomp-plugin.h 6
-rw-r--r-- libgomp/libgomp.h 5
-rw-r--r-- libgomp/plugin/plugin-gcn.c 2
-rw-r--r-- libgomp/plugin/plugin-nvptx.c 77
-rw-r--r-- libgomp/target.c 102
7 files changed, 106 insertions, 103 deletions
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 9360db6..fb352b3 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,5 +1,15 @@
2023-03-24 Thomas Schwinge <thomas@codesourcery.com>
+ * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
+ 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
+ * libgomp.h (gomp_target_rev): Adjust.
+ * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
+ * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
+ * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
+ * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
+ (rev_off_host_to_dev_cpy): Remove.
+ (GOMP_OFFLOAD_run): Adjust.
+
* target.c (gomp_unmap_vars_internal): Queue splay-tree keys for
removal after main loop.
diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 316de74..c76fa63 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
void
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- void (*dev_to_host_cpy) (void *, const void *, size_t,
- void *),
- void (*host_to_dev_cpy) (void *, const void *, size_t,
- void *), void *token)
+ struct goacc_asyncqueue *aq)
{
gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
- dev_to_host_cpy, host_to_dev_cpy, token);
+ aq);
}
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 66d995f..ca557a7 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -122,11 +122,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
- uint64_t, int,
- void (*) (void *, const void *, size_t,
- void *),
- void (*) (void *, const void *, size_t,
- void *), void *);
+ uint64_t, int, struct goacc_asyncqueue *);
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 92f6f14..3b2b4aa 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1127,10 +1127,7 @@ extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
- int,
- void (*) (void *, const void *, size_t, void *),
- void (*) (void *, const void *, size_t, void *),
- void *);
+ int, struct goacc_asyncqueue *);
extern void * gomp_usm_alloc (size_t size, int device_num);
extern void gomp_usm_free (void *device_ptr, int device_num);
extern bool gomp_page_locked_host_alloc (void **, size_t);
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 64694cd..82f5940 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -2008,7 +2008,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
{
int dev_num = dev_num64;
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
- NULL, NULL, NULL);
+ NULL);
}
/* Output any data written to console output from the kernel. It is expected
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6ade34b..23f89b6 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -56,6 +56,7 @@
#include <unistd.h>
#include <assert.h>
#include <errno.h>
+#include <stdlib.h>
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
block to cache between kernel invocations. For soft-stacks blocks bigger
@@ -1837,11 +1838,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
return 1;
}
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
CUstream stream = NULL;
- CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
struct goacc_asyncqueue *aq
= GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1849,14 +1850,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
return aq;
}
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+ return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
free (aq);
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
@@ -1870,14 +1883,20 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
return -1;
}
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
return true;
}
bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
+bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
struct goacc_asyncqueue *aq2)
{
@@ -2137,22 +2156,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
struct targ_fn_descriptor *tgt_fn_desc
@@ -2185,9 +2188,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
}
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
- size_t stack_size = nvptx_stacks_size ();
bool reverse_offload = ptx_dev->rev_data != NULL;
- CUstream copy_stream = NULL;
+ struct goacc_asyncqueue *reverse_offload_aq = NULL;
+ if (reverse_offload)
+ {
+ reverse_offload_aq
+ = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+ if (!reverse_offload_aq)
+ exit (EXIT_FAILURE);
+ }
+
+ size_t stack_size = nvptx_stacks_size ();
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2201,8 +2212,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
32, threads, 1, lowlat_pool_size, NULL, NULL, config);
if (r != CUDA_SUCCESS)
@@ -2225,17 +2234,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
rev_data->addrs, rev_data->sizes,
rev_data->kinds, rev_data->dev_num,
- rev_off_dev_to_host_cpy,
- rev_off_host_to_dev_cpy, copy_stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+ reverse_offload_aq);
+ if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+ exit (EXIT_FAILURE);
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
}
usleep (1);
}
else
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@@ -2243,6 +2250,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+ if (reverse_offload)
+ {
+ if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+ exit (EXIT_FAILURE);
+ }
}
/* TODO: Implement GOMP_OFFLOAD_async_run. */
diff --git a/libgomp/target.c b/libgomp/target.c
index 107c356..2f53f05 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -3527,9 +3527,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
void
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
- void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
- void *token)
+ struct goacc_asyncqueue *aq)
{
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3571,26 +3569,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
- if (dev_to_host_cpy)
- {
- dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
- mapnum * sizeof (uint64_t), token);
- dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
- mapnum * sizeof (uint64_t), token);
- dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
- mapnum * sizeof (unsigned short), token);
- }
- else
- {
- gomp_copy_dev2host (devicep, NULL, devaddrs,
- (const void *) (uintptr_t) devaddrs_ptr,
- mapnum * sizeof (uint64_t));
- gomp_copy_dev2host (devicep, NULL, sizes,
- (const void *) (uintptr_t) sizes_ptr,
- mapnum * sizeof (uint64_t));
- gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
- mapnum * sizeof (unsigned short));
- }
+ gomp_copy_dev2host (devicep, aq, devaddrs,
+ (const void *) (uintptr_t) devaddrs_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, aq, sizes,
+ (const void *) (uintptr_t) sizes_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, aq, kinds,
+ (const void *) (uintptr_t) kinds_ptr,
+ mapnum * sizeof (unsigned short));
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
}
size_t tgt_align = 0, tgt_size = 0;
@@ -3617,13 +3606,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
(size_t) sizes[i]);
- else if (dev_to_host_cpy)
- dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
- (size_t) sizes[i], token);
else
- gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
- (void *) (uintptr_t) devaddrs[i],
- (size_t) sizes[i]);
+ {
+ gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
+ (void *) (uintptr_t) devaddrs[i],
+ (size_t) sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
+ }
devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
tgt_size = tgt_size + sizes[i];
if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -3735,15 +3725,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|| kind == GOMP_MAP_FORCE_TOFROM
|| GOMP_MAP_ALWAYS_TO_P (kind))
{
- if (dev_to_host_cpy)
- dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i], token);
- else
- gomp_copy_dev2host (devicep, NULL,
- (void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i]);
+ gomp_copy_dev2host (devicep, aq,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
}
if (struct_cpy)
struct_cpy--;
@@ -3810,15 +3800,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
devaddrs[i]
= (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
sizes[i]);
- if (dev_to_host_cpy)
- dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i], token);
- else
- gomp_copy_dev2host (devicep, NULL,
- (void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i]);
+ gomp_copy_dev2host (devicep, aq,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
}
for (j = i + 1; j < mapnum; j++)
{
@@ -3926,15 +3916,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
/* FALLTHRU */
case GOMP_MAP_FROM:
case GOMP_MAP_TOFROM:
- if (copy && host_to_dev_cpy)
- host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
- (void *) (uintptr_t) devaddrs[i],
- sizes[i], token);
- else if (copy)
- gomp_copy_host2dev (devicep, NULL,
- (void *) (uintptr_t) cdata[i].devaddr,
- (void *) (uintptr_t) devaddrs[i],
- sizes[i], false, NULL);
+ if (copy)
+ {
+ gomp_copy_host2dev (devicep, aq,
+ (void *) (uintptr_t) cdata[i].devaddr,
+ (void *) (uintptr_t) devaddrs[i],
+ sizes[i], false, NULL);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
+ }
default:
break;
}