diff options
author | Julian Brown <julian@codesourcery.com> | 2015-04-08 15:58:33 +0000 |
---|---|---|
committer | Julian Brown <jules@gcc.gnu.org> | 2015-04-08 15:58:33 +0000 |
commit | d93bdab53b8de8677bca3af17fe8072458ea3f6b (patch) | |
tree | 60d71a75181c79b311aaa199f917cb19ed8c06f7 /libgomp/plugin | |
parent | a6330e856f5f2a0c6c92e54cec42ff1798bf0ea8 (diff) | |
download | gcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.zip gcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.tar.gz gcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.tar.bz2 |
mkoffload.c (process): Support variable mapping.
gcc/
* config/nvptx/mkoffload.c (process): Support variable mapping.
libgomp/
* libgomp.h (target_mem_desc): Remove mem_map field.
(acc_dispatch_t): Remove open_device_func, close_device_func,
get_device_num_func, set_device_num_func, target_data members.
Change create_thread_data_func argument to device number instead of
generic pointer.
* oacc-async.c (assert.h): Include.
(acc_async_test, acc_async_test_all, acc_wait, acc_wait_async)
(acc_wait_all, acc_wait_all_async): Use current host thread's
active device, not base_dev.
* oacc-cuda.c (acc_get_current_cuda_device)
(acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Likewise.
* oacc-host.c (host_dispatch): Don't set open_device_func,
close_device_func, get_device_num_func or set_device_num_func.
* oacc-init.c (base_dev, init_key): Remove.
(cached_base_dev): New.
(name_of_acc_device_t): New.
(acc_init_1): Initialise default-numbered device, not zeroth.
(acc_shutdown_1): Close all devices of a given type.
(goacc_destroy_thread): Don't use base_dev.
(lazy_open, lazy_init, lazy_init_and_open): Remove.
(goacc_attach_host_thread_to_device): New.
(acc_init): Reimplement with goacc_attach_host_thread_to_device.
(acc_get_num_devices): Don't use base_dev.
(acc_set_device_type): Reimplement.
(acc_get_device_type): Don't use base_dev.
(acc_get_device_num): Tweak logic.
(acc_set_device_num): Likewise.
(acc_on_device): Use acc_get_device_type.
(goacc_runtime_initialize): Initialize cached_base_dev not base_dev.
(goacc_lazy_initialize): Reimplement with acc_init and
goacc_attach_host_thread_to_device.
* oacc-int.h (goacc_thread): Add base_dev field.
(base_dev): Remove extern declaration.
(goacc_attach_host_thread_to_device): Add prototype.
* oacc-mem.c (acc_malloc): Use current thread's device instead of
base_dev.
(acc_free): Likewise.
(acc_memcpy_to_device): Likewise.
(acc_memcpy_from_device): Likewise.
* oacc-parallel.c (select_acc_device): Remove. Replace calls with
goacc_lazy_initialize (throughout).
(GOACC_parallel): Use tgt_offset to locate target functions.
* target.c (gomp_map_vars): Don't set tgt->mem_map.
(gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map.
(gomp_load_plugin_for_device): Remove open_device, close_device,
get_device_num, set_device_num openacc hook initialisation. Don't set
openacc.target_data.
* plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device)
(GOMP_OFFLOAD_openacc_close_device)
(GOMP_OFFLOAD_openacc_get_device_num)
(GOMP_OFFLOAD_openacc_set_device_num): Remove.
(GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument
to int.
* plugin/plugin-nvptx.c (ptx_inited): Remove.
(instantiated_devices, ptx_dev_lock): New.
(struct ptx_image_data): New.
(ptx_devices, ptx_images, ptx_image_lock): New.
(fini_streams_for_device): Reorder cuStreamDestroy call.
(nvptx_get_num_devices): Remove forward declaration.
(nvptx_init): Change return type to bool.
(nvptx_fini): Remove.
(nvptx_attach_host_thread_to_device): New.
(nvptx_open_device): Return struct ptx_device* instead of void*.
(nvptx_close_device): Change argument type to struct ptx_device*,
return type to void.
(nvptx_get_num_devices): Use instantiated_devices not ptx_inited.
(kernel_target_data, kernel_host_table): Remove static globals.
(GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove.
(GOMP_OFFLOAD_init_device): Reimplement.
(GOMP_OFFLOAD_fini_device): Likewise.
(GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New.
(GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host)
(GOMP_OFFLOAD_host2dev): Use ORD argument.
(GOMP_OFFLOAD_openacc_open_device)
(GOMP_OFFLOAD_openacc_close_device)
(GOMP_OFFLOAD_openacc_set_device_num)
(GOMP_OFFLOAD_openacc_get_device_num): Remove.
(GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int
(device number).
libgomp/testsuite/
* libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test.
From-SVN: r221922
Diffstat (limited to 'libgomp/plugin')
-rw-r--r-- | libgomp/plugin/plugin-host.c | 27 | ||||
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 318 |
2 files changed, 210 insertions, 135 deletions
diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c index bc60f72..1faf5bc 100644 --- a/libgomp/plugin/plugin-host.c +++ b/libgomp/plugin/plugin-host.c @@ -119,31 +119,6 @@ GOMP_OFFLOAD_unload_image (int n __attribute__ ((unused)), } STATIC void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ - return (void *) (intptr_t) n; -} - -STATIC int -GOMP_OFFLOAD_openacc_close_device (void *hnd) -{ - return 0; -} - -STATIC int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ - return 0; -} - -STATIC void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ - if (n > 0) - GOMP (fatal) ("device number %u out of range for host execution", n); -} - -STATIC void * GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s) { return GOMP (malloc) (s); @@ -254,7 +229,7 @@ GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused))) } STATIC void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data +GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__ ((unused))) { return NULL; diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 483cb75..583ec87 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -133,7 +133,8 @@ struct targ_fn_descriptor const char *name; }; -static bool ptx_inited = false; +static unsigned int instantiated_devices = 0; +static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; struct ptx_stream { @@ -331,9 +332,21 @@ struct ptx_event struct ptx_event *next; }; +struct ptx_image_data +{ + void *target_data; + CUmodule module; + struct ptx_image_data *next; +}; + static pthread_mutex_t ptx_event_lock; static struct ptx_event *ptx_events; +static struct ptx_device **ptx_devices; + +static struct ptx_image_data *ptx_images = NULL; +static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER; + #define _XSTR(s) _STR(s) #define _STR(s) #s @@ -450,8 +463,8 @@ fini_streams_for_device (struct ptx_device *ptx_dev) struct ptx_stream *s = 
ptx_dev->active_streams; ptx_dev->active_streams = ptx_dev->active_streams->next; - cuStreamDestroy (s->stream); map_fini (s); + cuStreamDestroy (s->stream); free (s); } @@ -575,21 +588,21 @@ select_stream_for_async (int async, pthread_t thread, bool create, return stream; } -static int nvptx_get_num_devices (void); - -/* Initialize the device. */ -static int +/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK + should be locked on entry and remains locked on exit. */ +static bool nvptx_init (void) { CUresult r; int rc; + int ndevs; - if (ptx_inited) - return nvptx_get_num_devices (); + if (instantiated_devices != 0) + return true; rc = verify_device_library (); if (rc < 0) - return -1; + return false; r = cuInit (0); if (r != CUDA_SUCCESS) @@ -599,22 +612,64 @@ nvptx_init (void) pthread_mutex_init (&ptx_event_lock, NULL); - ptx_inited = true; + r = cuDeviceGetCount (&ndevs); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r)); - return nvptx_get_num_devices (); + ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *) + * ndevs); + + return true; } +/* Select the N'th PTX device for the current host thread. The device must + have been previously opened before calling this function. */ + static void -nvptx_fini (void) +nvptx_attach_host_thread_to_device (int n) { - ptx_inited = false; + CUdevice dev; + CUresult r; + struct ptx_device *ptx_dev; + CUcontext thd_ctx; + + r = cuCtxGetDevice (&dev); + if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) + GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + + if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n) + return; + else + { + CUcontext old_ctx; + + ptx_dev = ptx_devices[n]; + assert (ptx_dev); + + r = cuCtxGetCurrent (&thd_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); + + /* We don't necessarily have a current context (e.g. if it has been + destroyed. 
Pop it if we do though. */ + if (thd_ctx != NULL) + { + r = cuCtxPopCurrent (&old_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); + } + + r = cuCtxPushCurrent (ptx_dev->ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r)); + } } -static void * +static struct ptx_device * nvptx_open_device (int n) { struct ptx_device *ptx_dev; - CUdevice dev; + CUdevice dev, ctx_dev; CUresult r; int async_engines, pi; @@ -628,6 +683,21 @@ nvptx_open_device (int n) ptx_dev->dev = dev; ptx_dev->ctx_shared = false; + r = cuCtxGetDevice (&ctx_dev); + if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) + GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + + if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev) + { + /* The current host thread has an active context for a different device. + Detach it. */ + CUcontext old_ctx; + + r = cuCtxPopCurrent (&old_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); + } + r = cuCtxGetCurrent (&ptx_dev->ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); @@ -678,17 +748,16 @@ nvptx_open_device (int n) init_streams_for_device (ptx_dev, async_engines); - return (void *) ptx_dev; + return ptx_dev; } -static int -nvptx_close_device (void *targ_data) +static void +nvptx_close_device (struct ptx_device *ptx_dev) { CUresult r; - struct ptx_device *ptx_dev = targ_data; if (!ptx_dev) - return 0; + return; fini_streams_for_device (ptx_dev); @@ -700,8 +769,6 @@ nvptx_close_device (void *targ_data) } free (ptx_dev); - - return 0; } static int @@ -714,7 +781,7 @@ nvptx_get_num_devices (void) order to enumerate available devices, but CUDA API routines can't be used until cuInit has been called. Just call it now (but don't yet do any further initialization). 
*/ - if (!ptx_inited) + if (instantiated_devices == 0) cuInit (0); r = cuDeviceGetCount (&n); @@ -1507,64 +1574,84 @@ GOMP_OFFLOAD_get_num_devices (void) return nvptx_get_num_devices (); } -static void **kernel_target_data; -static void **kernel_host_table; - void -GOMP_OFFLOAD_register_image (void *host_table, void *target_data) +GOMP_OFFLOAD_init_device (int n) { - kernel_target_data = target_data; - kernel_host_table = host_table; -} + pthread_mutex_lock (&ptx_dev_lock); -void -GOMP_OFFLOAD_init_device (int n __attribute__ ((unused))) -{ - (void) nvptx_init (); + if (!nvptx_init () || ptx_devices[n] != NULL) + { + pthread_mutex_unlock (&ptx_dev_lock); + return; + } + + ptx_devices[n] = nvptx_open_device (n); + instantiated_devices++; + + pthread_mutex_unlock (&ptx_dev_lock); } void -GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused))) +GOMP_OFFLOAD_fini_device (int n) { - nvptx_fini (); + pthread_mutex_lock (&ptx_dev_lock); + + if (ptx_devices[n] != NULL) + { + nvptx_attach_host_thread_to_device (n); + nvptx_close_device (ptx_devices[n]); + ptx_devices[n] = NULL; + instantiated_devices--; + } + + pthread_mutex_unlock (&ptx_dev_lock); } int -GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)), - struct mapping_table **tablep) +GOMP_OFFLOAD_load_image (int ord, void *target_data, + struct addr_pair **target_table) { CUmodule module; - void **fn_table; - char **fn_names; - int fn_entries, i; + char **fn_names, **var_names; + unsigned int fn_entries, var_entries, i, j; CUresult r; struct targ_fn_descriptor *targ_fns; + void **img_header = (void **) target_data; + struct ptx_image_data *new_image; - if (nvptx_init () <= 0) - return 0; + GOMP_OFFLOAD_init_device (ord); - /* This isn't an error, because an image may legitimately have no offloaded - regions and so will not call GOMP_offload_register. 
*/ - if (kernel_target_data == NULL) - return 0; + nvptx_attach_host_thread_to_device (ord); + + link_ptx (&module, img_header[0]); - link_ptx (&module, kernel_target_data[0]); + pthread_mutex_lock (&ptx_image_lock); + new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data)); + new_image->target_data = target_data; + new_image->module = module; + new_image->next = ptx_images; + ptx_images = new_image; + pthread_mutex_unlock (&ptx_image_lock); - /* kernel_target_data[0] -> ptx code - kernel_target_data[1] -> variable mappings - kernel_target_data[2] -> array of kernel names in ascii + /* The mkoffload utility emits a table of pointers/integers at the start of + each offload image: - kernel_host_table[0] -> start of function addresses (__offload_func_table) - kernel_host_table[1] -> end of function addresses (__offload_funcs_end) + img_header[0] -> ptx code + img_header[1] -> number of variables + img_header[2] -> array of variable names (pointers to strings) + img_header[3] -> number of kernels + img_header[4] -> array of kernel names (pointers to strings) The array of kernel names and the functions addresses form a one-to-one correspondence. 
*/ - fn_table = kernel_host_table[0]; - fn_names = (char **) kernel_target_data[2]; - fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *); + var_entries = (uintptr_t) img_header[1]; + var_names = (char **) img_header[2]; + fn_entries = (uintptr_t) img_header[3]; + fn_names = (char **) img_header[4]; - *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries); + *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) + * (fn_entries + var_entries)); targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) * fn_entries); @@ -1579,38 +1666,86 @@ GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)), targ_fns[i].fn = function; targ_fns[i].name = (const char *) fn_names[i]; - (*tablep)[i].host_start = (uintptr_t) fn_table[i]; - (*tablep)[i].host_end = (*tablep)[i].host_start + 1; - (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i]; - (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1; + (*target_table)[i].start = (uintptr_t) &targ_fns[i]; + (*target_table)[i].end = (*target_table)[i].start + 1; } - return fn_entries; + for (j = 0; j < var_entries; j++, i++) + { + CUdeviceptr var; + size_t bytes; + + r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + + (*target_table)[i].start = (uintptr_t) var; + (*target_table)[i].end = (*target_table)[i].start + bytes; + } + + return i; +} + +void +GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data) +{ + void **img_header = (void **) target_data; + struct targ_fn_descriptor *targ_fns + = (struct targ_fn_descriptor *) img_header[0]; + struct ptx_image_data *image, *prev = NULL, *newhd = NULL; + + free (targ_fns); + + pthread_mutex_lock (&ptx_image_lock); + for (image = ptx_images; image != NULL;) + { + struct ptx_image_data *next = image->next; + + if (image->target_data == target_data) + { + cuModuleUnload (image->module); + free (image); + if (prev) 
+ prev->next = next; + } + else + { + prev = image; + if (!newhd) + newhd = image; + } + + image = next; + } + ptx_images = newhd; + pthread_mutex_unlock (&ptx_image_lock); } void * -GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size) +GOMP_OFFLOAD_alloc (int ord, size_t size) { + nvptx_attach_host_thread_to_device (ord); return nvptx_alloc (size); } void -GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr) +GOMP_OFFLOAD_free (int ord, void *ptr) { + nvptx_attach_host_thread_to_device (ord); nvptx_free (ptr); } void * -GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst, - const void *src, size_t n) +GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) { + nvptx_attach_host_thread_to_device (ord); return nvptx_dev2host (dst, src, n); } void * -GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst, - const void *src, size_t n) +GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) { + nvptx_attach_host_thread_to_device (ord); return nvptx_host2dev (dst, src, n); } @@ -1627,45 +1762,6 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, num_workers, vector_length, async, targ_mem_desc); } -void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ - return nvptx_open_device (n); -} - -int -GOMP_OFFLOAD_openacc_close_device (void *h) -{ - return nvptx_close_device (h); -} - -void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - assert (n >= 0); - - if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n) - (void) nvptx_open_device (n); -} - -/* This can be called before the device is "opened" for the current thread, in - which case we can't tell which device number should be returned. We don't - actually want to open the device here, so just return -1 and let the caller - (oacc-init.c:acc_get_device_num) handle it. 
*/ - -int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (nvthd && nvthd->ptx_dev) - return nvthd->ptx_dev->ord; - else - return -1; -} - void GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) { @@ -1729,14 +1825,18 @@ GOMP_OFFLOAD_openacc_async_set_async (int async) } void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data) +GOMP_OFFLOAD_openacc_create_thread_data (int ord) { - struct ptx_device *ptx_dev = (struct ptx_device *) targ_data; + struct ptx_device *ptx_dev; struct nvptx_thread *nvthd = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); CUresult r; CUcontext thd_ctx; + ptx_dev = ptx_devices[ord]; + + assert (ptx_dev); + r = cuCtxGetCurrent (&thd_ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); |