Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r--    libgomp/plugin/plugin-nvptx.c    243
1 file changed, 212 insertions(+), 31 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 822c6a4..dd8bcf9 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -60,6 +60,14 @@
 #include <errno.h>
 #include <stdlib.h>

+/* Create hash-table for declare target's indirect clause on the host;
+   see build-target-indirect-htab.h for details.  */
+#define USE_HASHTAB_LOOKUP_FOR_INDIRECT
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+static void* create_target_indirect_map (size_t *, size_t,
+                                         uint64_t *, uint64_t *);
+#endif
+
 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
    block to cache between kernel invocations.  For soft-stacks blocks bigger
    than this, we will free the block before attempting another GPU memory
@@ -1125,11 +1133,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
 }

 static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool managed)
 {
   CUdeviceptr d;

-  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+  CUresult r = (managed ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+                                             CU_MEM_ATTACH_GLOBAL)
+                        : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
   if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
     return NULL;
   else if (r != CUDA_SUCCESS)
@@ -1238,6 +1248,24 @@ nvptx_get_current_cuda_context (void)
   return nvthd->ptx_dev->ctx;
 }

+#if 0 /* TODO: Use to enable self-mapping/USM automatically.  */
+/* FIXME: The auto-self-map feature depends on still mapping 'declare target'
+   variables, even if ignoring all other mappings.  Cf. PR 115279.  */
+
+/* Return TRUE if the GPU is integrated with host memory, i.e. GPU and
+   host share the same memory controller.  As of Oct 2025, no such
+   Nvidia GPU seems to exist.  */
+static bool
+is_integrated_apu (struct ptx_device *ptx_dev)
+{
+  int pi;
+  CUresult r;
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+                         CU_DEVICE_ATTRIBUTE_INTEGRATED, ptx_dev->dev);
+  return (r == CUDA_SUCCESS && pi == 1);
+}
+#endif
+
 /* Plugin entry points.  */

 const char *
@@ -1626,39 +1654,71 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));

-      /* Build host->target address map for indirect functions.  */
-      uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
-      for (unsigned k = 0; k < ind_fn_entries; k++)
-       {
-         ind_fn_map[k * 2] = host_ind_fn_table[k];
-         ind_fn_map[k * 2 + 1] = ind_fn_table[k];
-         GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
-                            k, host_ind_fn_table[k], ind_fn_table[k]);
-       }
-      ind_fn_map[ind_fn_entries * 2] = 0;
+      /* For newer binaries, the hash table for 'indirect' is created on the
+        host.  Older binaries don't have GOMP_INDIRECT_ADDR_HMAP on the
+        device side - and have to create the table themselves using
+        GOMP_INDIRECT_ADDR_MAP.  */

-      /* Write the map onto the target.  */
-      void *map_target_addr
-       = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
-      GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
-
-      GOMP_OFFLOAD_host2dev (ord, map_target_addr,
-                            (void*) ind_fn_map,
-                            sizeof (ind_fn_map));
-
-      /* Write address of the map onto the target.  */
       CUdeviceptr varptr;
       size_t varsize;
+      bool host_init_htab = true;
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
-                            module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
+                            module, XSTRING (GOMP_INDIRECT_ADDR_HMAP));
+      if (r != CUDA_SUCCESS)
+#endif
+       {
+         host_init_htab = false;
+         r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
+                                module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
+       }
       if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
                           cuda_error (r));

       GOMP_PLUGIN_debug (0,
-                        "Indirect map variable found at %llx with size %ld\n",
+                        "%s-style indirect map variable found at %llx with "
+                        "size %ld\n", host_init_htab ? "New" : "Old",
                         varptr, varsize);

+      void *map_target_addr;
+      if (!host_init_htab)
+       {
+         /* Build host->target address map for indirect functions.  */
+         uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
+         for (unsigned k = 0; k < ind_fn_entries; k++)
+           {
+             ind_fn_map[k * 2] = host_ind_fn_table[k];
+             ind_fn_map[k * 2 + 1] = ind_fn_table[k];
+             GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
+                                k, host_ind_fn_table[k], ind_fn_table[k]);
+           }
+         ind_fn_map[ind_fn_entries * 2] = 0;
+
+         /* Write the map onto the target.  */
+         map_target_addr = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
+         GOMP_OFFLOAD_host2dev (ord, map_target_addr,
+                                (void *) ind_fn_map, sizeof (ind_fn_map));
+       }
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+      else
+       {
+         /* FIXME: Handle multi-kernel load and unload, cf. PR 114690.  */
+         size_t host_map_size;
+         void *host_map;
+         host_map = create_target_indirect_map (&host_map_size,
+                                                ind_fn_entries,
+                                                host_ind_fn_table,
+                                                ind_fn_table);
+         for (unsigned k = 0; k < ind_fn_entries; k++)
+           GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
+                              k, host_ind_fn_table[k], ind_fn_table[k]);
+
+         /* Write the map onto the target.  */
+         map_target_addr = GOMP_OFFLOAD_alloc (ord, host_map_size);
+         GOMP_OFFLOAD_host2dev (ord, map_target_addr, host_map,
+                                host_map_size);
+       }
+#endif
+
+      GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
+
+      /* Write address of the map onto the target.  */
       GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
                             sizeof (map_target_addr));
     }
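For reference, the old-style GOMP_INDIRECT_ADDR_MAP payload written above is a flat array of (host, device) address pairs terminated by a zero host entry, so the device side has to scan it linearly on every indirect call. Below is a minimal sketch of such a lookup; the function name is hypothetical (the real consumer lives in libgomp's device-side code), and the new GOMP_INDIRECT_ADDR_HMAP path exists precisely to replace this linear scan with a host-built hash table.

#include <stdint.h>

/* Hypothetical sketch of the lookup the old flat map forces on the
   device: MAP holds (host, device) address pairs back to back and is
   terminated by a zero host address.  Returns HOST_PTR unchanged when
   no entry matches, i.e. the pointer was not an indirect target.  */
static uint64_t
lookup_indirect_ptr (const uint64_t *map, uint64_t host_ptr)
{
  for (; map[0] != 0; map += 2)
    if (map[0] == host_ptr)
      return map[1];  /* Device-side address of the function.  */
  return host_ptr;
}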
@@ -1785,8 +1845,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
   return ret;
 }

-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+cleanup_and_alloc (int ord, size_t size, bool managed)
 {
   if (!nvptx_attach_host_thread_to_device (ord))
     return NULL;
@@ -1809,7 +1869,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
       blocks = tmp;
     }

-  void *d = nvptx_alloc (size, true);
+  void *d = nvptx_alloc (size, true, managed);
   if (d)
     return d;
   else
@@ -1817,10 +1877,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
       /* Memory allocation failed.  Try freeing the stacks block, and
        retrying.  */
       nvptx_stacks_free (ptx_dev, true);
-      return nvptx_alloc (size, false);
+      return nvptx_alloc (size, false, managed);
     }
 }

+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+  return cleanup_and_alloc (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_managed_alloc (int ord, size_t size)
+{
+  return cleanup_and_alloc (ord, size, true);
+}
+
 bool
 GOMP_OFFLOAD_free (int ord, void *ptr)
 {
@@ -1828,6 +1900,45 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
          && nvptx_free (ptr, ptx_devices[ord]));
 }

+bool
+GOMP_OFFLOAD_managed_free (int ord, void *ptr)
+{
+  return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+  if (size == 0)
+    {
+      /* Special case to ensure omp_alloc specification compliance.  */
+      *ptr = NULL;
+      return true;
+    }
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+  CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
                           size_t mapnum __attribute__((unused)),
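The new GOMP_OFFLOAD_managed_alloc entry point bottoms out in cuMemAllocManaged, which returns memory addressable from both host and device, while GOMP_OFFLOAD_page_locked_host_alloc wraps cuMemHostAlloc for pinned host memory. The following is a minimal standalone sketch (not libgomp code) of the two device-side allocation flavours cleanup_and_alloc now selects between, assuming a CUDA context is already current; error handling is reduced to an exit.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: cuMemAlloc for plain device memory, cuMemAllocManaged for
   unified memory that host and device can both dereference.  */
static CUdeviceptr
alloc_sketch (size_t size, int managed)
{
  CUdeviceptr d;
  CUresult r = managed
    ? cuMemAllocManaged (&d, size, CU_MEM_ATTACH_GLOBAL)
    : cuMemAlloc (&d, size);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "allocation failed (CUresult %d)\n", (int) r);
      exit (EXIT_FAILURE);
    }
  return d;
}

With the managed variant, the returned CUdeviceptr can be cast to void * and dereferenced on the host under unified addressing, which is what lets the plugin hand managed buffers straight back to libgomp.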
@@ -2019,6 +2130,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 }

 static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  CUdeviceptr pb1, pb2;
+  size_t ps1, ps2;
+  if (!s)
+    return true;
+  if (!d1 || !d2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+  CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if ((void *) (d1 + s) > (void *) (pb1 + ps1)
+      || (void *) (d2 + s) > (void *) (pb2 + ps2))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
@@ -2077,6 +2216,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2267,6 +2409,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
 }

 bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return false;
+  CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+  return true;
+}
+
+bool
 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
 {
@@ -2288,6 +2439,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }

+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+                                   size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+            aq->cuda_stream);
+  return true;
+}
+
 union goacc_property_value
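GOMP_OFFLOAD_memset is the plugin hook that a host-side call such as omp_target_memset (an OpenMP 6.0 addition) can bottom out in. A hedged usage sketch, assuming a libgomp recent enough to provide that API:

#include <omp.h>
#include <stdlib.h>

int
main (void)
{
  int dev = omp_get_default_device ();
  size_t n = 1024;
  void *p = omp_target_alloc (n, dev);
  if (p == NULL)
    return EXIT_FAILURE;
  /* On an nvptx device this reaches GOMP_OFFLOAD_memset, i.e. a
     single cuMemsetD8 call, rather than a host-side round trip.  */
  omp_target_memset (p, 0, n, dev);
  omp_target_free (p, dev);
  return EXIT_SUCCESS;
}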
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {
@@ -2483,12 +2646,26 @@ GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
       break;
     }

-  obj->device_data = ptx_devices[ord];
+  struct ptx_device *ptx_dev = obj->device_data = ptx_devices[ord];

   if (targetsync)
     {
       CUstream stream = NULL;
-      CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+      CUdevice cur_ctx_dev;
+      CUresult res = CUDA_CALL_NOCHECK (cuCtxGetDevice, &cur_ctx_dev);
+      if (res != CUDA_SUCCESS && res != CUDA_ERROR_INVALID_CONTEXT)
+       GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (res));
+      if (res != CUDA_ERROR_INVALID_CONTEXT && ptx_dev->dev == cur_ctx_dev)
+       CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+      else
+       {
+         CUcontext old_ctx;
+         assert (ptx_dev->ctx);
+         CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
+         CUDA_CALL_ASSERT (cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+         if (res != CUDA_ERROR_INVALID_CONTEXT)
+           CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
+       }
       obj->stream = stream;
     }
 }
@@ -2832,3 +3009,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 }

 /* TODO: Implement GOMP_OFFLOAD_async_run.  */
+
+#ifdef USE_HASHTAB_LOOKUP_FOR_INDIRECT
+#include "build-target-indirect-htab.h"
+#endif
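The interop change makes GOMP_OFFLOAD_interop create the CUstream in the device's own CUDA context even when a different context (or none) is current, pushing and popping ptx_dev->ctx as needed. From the application side, that stream is retrieved through the OpenMP 5.1 interop API; a minimal sketch, assuming a compiler with -fopenmp and nvptx offloading configured:

#include <omp.h>
#include <cuda.h>
#include <stdio.h>

int
main (void)
{
  omp_interop_t iobj = omp_interop_none;
#pragma omp interop init (targetsync : iobj)
  int err;
  /* For the CUDA foreign runtime, the targetsync property is the
     stream the plugin created in GOMP_OFFLOAD_interop.  */
  CUstream stream
    = (CUstream) omp_get_interop_ptr (iobj, omp_ipr_targetsync, &err);
  if (err == omp_irc_success)
    printf ("CUDA stream from the plugin: %p\n", (void *) stream);
#pragma omp interop destroy (iobj)
  return 0;
}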
