Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r--   libgomp/plugin/plugin-nvptx.c   243
1 file changed, 212 insertions, 31 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 822c6a4..dd8bcf9 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -60,6 +60,14 @@
#include <errno.h>
#include <stdlib.h>
+/* Create hash-table for declare target's indirect clause on the host;
+ see build-target-indirect-htab.h for details. */
+#define USE_HASHTAB_LOOKUP_FOR_INDIRECT
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+static void* create_target_indirect_map (size_t *, size_t,
+ uint64_t *, uint64_t *);
+#endif
+
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
block to cache between kernel invocations. For soft-stacks blocks bigger
than this, we will free the block before attempting another GPU memory
@@ -1125,11 +1133,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool managed)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (managed ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
@@ -1238,6 +1248,24 @@ nvptx_get_current_cuda_context (void)
return nvthd->ptx_dev->ctx;
}
+#if 0 /* TODO: Use to enable self-mapping/USM automatically. */
+/* FIXME: The auto-self-map feature depends on still mapping 'declare target'
+ variables, even if ignoring all other mappings. Cf. PR 115279. */
+
+/* Return TRUE if the GPU is integrated with host memory, i.e. GPU and
+ host share the same memory controller. As of Oct 2025, no such
+ Nvidia GPU seems to exist. */
+static bool
+is_integrated_apu (struct ptx_device *ptx_dev)
+{
+ int pi;
+ CUresult r;
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_INTEGRATED, ptx_dev->dev);
+ return (r == CUDA_SUCCESS && pi == 1);
+}
+#endif
+
/* Plugin entry points. */
const char *
@@ -1626,39 +1654,71 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
- /* Build host->target address map for indirect functions. */
- uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
- for (unsigned k = 0; k < ind_fn_entries; k++)
- {
- ind_fn_map[k * 2] = host_ind_fn_table[k];
- ind_fn_map[k * 2 + 1] = ind_fn_table[k];
- GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
- k, host_ind_fn_table[k], ind_fn_table[k]);
- }
- ind_fn_map[ind_fn_entries * 2] = 0;
+ /* For newer binaries, the hash table for 'indirect' is created on the
+ host.  Older binaries do not have GOMP_INDIRECT_ADDR_HMAP on the
+ device side and have to build the table themselves from
+ GOMP_INDIRECT_ADDR_MAP.  */
- /* Write the map onto the target. */
- void *map_target_addr
- = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
- GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
-
- GOMP_OFFLOAD_host2dev (ord, map_target_addr,
- (void*) ind_fn_map,
- sizeof (ind_fn_map));
-
- /* Write address of the map onto the target. */
CUdeviceptr varptr;
size_t varsize;
+ bool host_init_htab = true;
+ #ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
- module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
+ module, XSTRING (GOMP_INDIRECT_ADDR_HMAP));
+ if (r != CUDA_SUCCESS)
+ #endif
+ {
+ host_init_htab = false;
+ r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
+ module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
+ }
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
cuda_error (r));
-
GOMP_PLUGIN_debug (0,
- "Indirect map variable found at %llx with size %ld\n",
+ "%s-style indirect map variable found at %llx with "
+ "size %ld\n", host_init_htab ? "New" : "Old",
varptr, varsize);
+ void *map_target_addr;
+ if (!host_init_htab)
+ {
+ /* Build host->target address map for indirect functions. */
+ uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
+ for (unsigned k = 0; k < ind_fn_entries; k++)
+ {
+ ind_fn_map[k * 2] = host_ind_fn_table[k];
+ ind_fn_map[k * 2 + 1] = ind_fn_table[k];
+ GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
+ k, host_ind_fn_table[k], ind_fn_table[k]);
+ }
+ ind_fn_map[ind_fn_entries * 2] = 0;
+ /* Write the map onto the target. */
+ map_target_addr = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
+ GOMP_OFFLOAD_host2dev (ord, map_target_addr,
+ (void *) ind_fn_map, sizeof (ind_fn_map));
+ }
+ #ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+ else
+ {
+ /* FIXME: Handle multi-kernel load and unload, cf. PR 114690. */
+ size_t host_map_size;
+ void *host_map;
+ host_map = create_target_indirect_map (&host_map_size, ind_fn_entries,
+ host_ind_fn_table,
+ ind_fn_table);
+ for (unsigned k = 0; k < ind_fn_entries; k++)
+ GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
+ k, host_ind_fn_table[k], ind_fn_table[k]);
+ /* Write the map onto the target. */
+ map_target_addr = GOMP_OFFLOAD_alloc (ord, host_map_size);
+ GOMP_OFFLOAD_host2dev (ord, map_target_addr, host_map, host_map_size);
+ }
+ #endif
+
+ GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
+
+ /* Write address of the map onto the target. */
GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
sizeof (map_target_addr));
}
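
[Editorial note, not part of the patch: the old-style GOMP_INDIRECT_ADDR_MAP
written by the hunk above is a flat array of (host address, device address)
uint64_t pairs terminated by a single 0 entry.  The sketch below shows one
possible device-side lookup over that layout; the actual lookup lives in
libgomp's target-indirect support rather than in this plugin, and the
fallback behaviour shown is an assumption.]

#include <stdint.h>

/* Illustrative sketch only: map a host function address to its device
   counterpart by scanning the 0-terminated pair array.  */
static uint64_t
lookup_indirect_addr (const uint64_t *map, uint64_t host_fn_addr)
{
  for (; map[0] != 0; map += 2)
    if (map[0] == host_fn_addr)
      return map[1];		/* Matching device-side address.  */
  return host_fn_addr;		/* Assumed fallback: return input unchanged.  */
}
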
@@ -1785,8 +1845,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+cleanup_and_alloc (int ord, size_t size, bool managed)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
@@ -1809,7 +1869,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, managed);
if (d)
return d;
else
@@ -1817,10 +1877,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, managed);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return cleanup_and_alloc (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_managed_alloc (int ord, size_t size)
+{
+ return cleanup_and_alloc (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
@@ -1828,6 +1900,45 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_managed_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+ if (size == 0)
+ {
+ /* Special case to ensure omp_alloc specification compliance. */
+ *ptr = NULL;
+ return true;
+ }
+
+ CUresult r;
+
+ unsigned int flags = 0;
+ /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+ 'flags |= CU_MEMHOSTALLOC_PORTABLE;' here. */
+ r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+ if (r == CUDA_ERROR_OUT_OF_MEMORY)
+ *ptr = NULL;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+ return false;
+ }
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+ CUDA_CALL (cuMemFreeHost, ptr);
+ return true;
+}
+
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
size_t mapnum __attribute__((unused)),
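
[Editorial note, not part of the patch: a minimal caller-side sketch of the
page-locked entry points added above.  The function pinned_buffer_demo is
hypothetical; in libgomp these hooks would be driven by the allocator
support behind omp_alloc (cf. the omp_alloc comment above), not by user
code.]

/* Illustrative sketch only.  */
static int
pinned_buffer_demo (size_t size)
{
  void *buf;
  if (!GOMP_OFFLOAD_page_locked_host_alloc (&buf, size))
    return 0;	/* cuMemHostAlloc reported an error.  */
  if (buf == NULL)
    return 1;	/* size == 0, or pinned memory was exhausted.  */
  /* ... use 'buf' as a staging area for fast host<->device copies ... */
  return GOMP_OFFLOAD_page_locked_host_free (buf);
}
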
@@ -2019,6 +2130,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
}
static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+ CUdeviceptr pb1, pb2;
+ size_t ps1, ps2;
+ if (!s)
+ return true;
+ if (!d1 || !d2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+ CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+ if (!pb1 || !pb2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+ || (void *)(d2 + s) > (void *)(pb2 + ps2))
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
+ return true;
+}
+
+static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
CUdeviceptr pb;
@@ -2077,6 +2216,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
return true;
}
@@ -2267,6 +2409,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
}
bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+ if (!nvptx_attach_host_thread_to_device (ord))
+ return false;
+ CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+ return true;
+}
+
+bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
@@ -2288,6 +2439,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+ aq->cuda_stream);
+ return true;
+}
+
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
@@ -2483,12 +2646,26 @@ GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
break;
}
- obj->device_data = ptx_devices[ord];
+ struct ptx_device *ptx_dev = obj->device_data = ptx_devices[ord];
if (targetsync)
{
CUstream stream = NULL;
- CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ CUdevice cur_ctx_dev;
+ CUresult res = CUDA_CALL_NOCHECK (cuCtxGetDevice, &cur_ctx_dev);
+ if (res != CUDA_SUCCESS && res != CUDA_ERROR_INVALID_CONTEXT)
+ GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (res));
+ if (res != CUDA_ERROR_INVALID_CONTEXT && ptx_dev->dev == cur_ctx_dev)
+ CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ else
+ {
+ CUcontext old_ctx;
+ assert (ptx_dev->ctx);
+ CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
+ CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ if (res != CUDA_ERROR_INVALID_CONTEXT)
+ CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
+ }
obj->stream = stream;
}
}
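
[Editorial note, not part of the patch: cuStreamCreate binds the new stream
to whatever CUDA context is current on the calling thread, which is why the
hunk above only pushes ptx_dev->ctx when the current context (if any)
belongs to a different device.  Condensed, the push/create/pop pattern used
there is:]

/* Illustrative sketch only: create a stream bound to ptx_dev->ctx without
   permanently disturbing the context the thread had before.  */
CUcontext old_ctx;
CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
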
@@ -2832,3 +3009,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
}
/* TODO: Implement GOMP_OFFLOAD_async_run. */
+
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+ #include "build-target-indirect-htab.h"
+#endif