aboutsummaryrefslogtreecommitdiff
path: root/libgomp/plugin
diff options
context:
space:
mode:
authorJulian Brown <julian@codesourcery.com>2015-04-08 15:58:33 +0000
committerJulian Brown <jules@gcc.gnu.org>2015-04-08 15:58:33 +0000
commitd93bdab53b8de8677bca3af17fe8072458ea3f6b (patch)
tree60d71a75181c79b311aaa199f917cb19ed8c06f7 /libgomp/plugin
parenta6330e856f5f2a0c6c92e54cec42ff1798bf0ea8 (diff)
downloadgcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.zip
gcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.tar.gz
gcc-d93bdab53b8de8677bca3af17fe8072458ea3f6b.tar.bz2
mkoffload.c (process): Support variable mapping.
gcc/ * config/nvptx/mkoffload.c (process): Support variable mapping. libgomp/ * libgomp.h (target_mem_desc): Remove mem_map field. (acc_dispatch_t): Remove open_device_func, close_device_func, get_device_num_func, set_device_num_func, target_data members. Change create_thread_data_func argument to device number instead of generic pointer. * oacc-async.c (assert.h): Include. (acc_async_test, acc_async_test_all, acc_wait, acc_wait_async) (acc_wait_all, acc_wait_all_async): Use current host thread's active device, not base_dev. * oacc-cuda.c (acc_get_current_cuda_device) (acc_get_current_cuda_context, acc_get_cuda_stream) (acc_set_cuda_stream): Likewise. * oacc-host.c (host_dispatch): Don't set open_device_func, close_device_func, get_device_num_func or set_device_num_func. * oacc-init.c (base_dev, init_key): Remove. (cached_base_dev): New. (name_of_acc_device_t): New. (acc_init_1): Initialise default-numbered device, not zeroth. (acc_shutdown_1): Close all devices of a given type. (goacc_destroy_thread): Don't use base_dev. (lazy_open, lazy_init, lazy_init_and_open): Remove. (goacc_attach_host_thread_to_device): New. (acc_init): Reimplement with goacc_attach_host_thread_to_device. (acc_get_num_devices): Don't use base_dev. (acc_set_device_type): Reimplement. (acc_get_device_type): Don't use base_dev. (acc_get_device_num): Tweak logic. (acc_set_device_num): Likewise. (acc_on_device): Use acc_get_device_type. (goacc_runtime_initialize): Initialize cached_base_dev not base_dev. (goacc_lazy_initialize): Reimplement with acc_init and goacc_attach_host_thread_to_device. * oacc-int.h (goacc_thread): Add base_dev field. (base_dev): Remove extern declaration. (goacc_attach_host_thread_to_device): Add prototype. * oacc-mem.c (acc_malloc): Use current thread's device instead of base_dev. (acc_free): Likewise. (acc_memcpy_to_device): Likewise. (acc_memcpy_from_device): Likewise. * oacc-parallel.c (select_acc_device): Remove. Replace calls with goacc_lazy_initialize (throughout). 
(GOACC_parallel): Use tgt_offset to locate target functions. * target.c (gomp_map_vars): Don't set tgt->mem_map. (gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map. (gomp_load_plugin_for_device): Remove open_device, close_device, get_device_num, set_device_num openacc hook initialisation. Don't set openacc.target_data. * plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_get_device_num) (GOMP_OFFLOAD_openacc_set_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument to int. * plugin/plugin-nvptx.c (ptx_inited): Remove. (instantiated_devices, ptx_dev_lock): New. (struct ptx_image_data): New. (ptx_devices, ptx_images, ptx_image_lock): New. (fini_streams_for_device): Reorder cuStreamDestroy call. (nvptx_get_num_devices): Remove forward declaration. (nvptx_init): Change return type to bool. (nvptx_fini): Remove. (nvptx_attach_host_thread_to_device): New. (nvptx_open_device): Return struct ptx_device* instead of void*. (nvptx_close_device): Change argument type to struct ptx_device*, return type to void. (nvptx_get_num_devices): Use instantiated_devices not ptx_inited. (kernel_target_data, kernel_host_table): Remove static globals. (GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove. (GOMP_OFFLOAD_init_device): Reimplement. (GOMP_OFFLOAD_fini_device): Likewise. (GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New. (GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host) (GOMP_OFFLOAD_host2dev): Use ORD argument. (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_set_device_num) (GOMP_OFFLOAD_openacc_get_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int (device number). libgomp/testsuite/ * libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test. From-SVN: r221922
Diffstat (limited to 'libgomp/plugin')
-rw-r--r--libgomp/plugin/plugin-host.c27
-rw-r--r--libgomp/plugin/plugin-nvptx.c318
2 files changed, 210 insertions, 135 deletions
diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c
index bc60f72..1faf5bc 100644
--- a/libgomp/plugin/plugin-host.c
+++ b/libgomp/plugin/plugin-host.c
@@ -119,31 +119,6 @@ GOMP_OFFLOAD_unload_image (int n __attribute__ ((unused)),
}
STATIC void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
- return (void *) (intptr_t) n;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_close_device (void *hnd)
-{
- return 0;
-}
-
-STATIC int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
- return 0;
-}
-
-STATIC void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
- if (n > 0)
- GOMP (fatal) ("device number %u out of range for host execution", n);
-}
-
-STATIC void *
GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s)
{
return GOMP (malloc) (s);
@@ -254,7 +229,7 @@ GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused)))
}
STATIC void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__ ((unused)))
{
return NULL;
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 483cb75..583ec87 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -133,7 +133,8 @@ struct targ_fn_descriptor
const char *name;
};
-static bool ptx_inited = false;
+static unsigned int instantiated_devices = 0;
+static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
@@ -331,9 +332,21 @@ struct ptx_event
struct ptx_event *next;
};
+struct ptx_image_data
+{
+ void *target_data;
+ CUmodule module;
+ struct ptx_image_data *next;
+};
+
static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;
+static struct ptx_device **ptx_devices;
+
+static struct ptx_image_data *ptx_images = NULL;
+static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
+
#define _XSTR(s) _STR(s)
#define _STR(s) #s
@@ -450,8 +463,8 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
struct ptx_stream *s = ptx_dev->active_streams;
ptx_dev->active_streams = ptx_dev->active_streams->next;
- cuStreamDestroy (s->stream);
map_fini (s);
+ cuStreamDestroy (s->stream);
free (s);
}
@@ -575,21 +588,21 @@ select_stream_for_async (int async, pthread_t thread, bool create,
return stream;
}
-static int nvptx_get_num_devices (void);
-
-/* Initialize the device. */
-static int
+/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
+ should be locked on entry and remains locked on exit. */
+static bool
nvptx_init (void)
{
CUresult r;
int rc;
+ int ndevs;
- if (ptx_inited)
- return nvptx_get_num_devices ();
+ if (instantiated_devices != 0)
+ return true;
rc = verify_device_library ();
if (rc < 0)
- return -1;
+ return false;
r = cuInit (0);
if (r != CUDA_SUCCESS)
@@ -599,22 +612,64 @@ nvptx_init (void)
pthread_mutex_init (&ptx_event_lock, NULL);
- ptx_inited = true;
+ r = cuDeviceGetCount (&ndevs);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
- return nvptx_get_num_devices ();
+ ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
+ * ndevs);
+
+ return true;
}
+/* Select the N'th PTX device for the current host thread. The device must
+ have been opened before calling this function. */
+
static void
-nvptx_fini (void)
+nvptx_attach_host_thread_to_device (int n)
{
- ptx_inited = false;
+ CUdevice dev;
+ CUresult r;
+ struct ptx_device *ptx_dev;
+ CUcontext thd_ctx;
+
+ r = cuCtxGetDevice (&dev);
+ if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+ GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+
+ if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
+ return;
+ else
+ {
+ CUcontext old_ctx;
+
+ ptx_dev = ptx_devices[n];
+ assert (ptx_dev);
+
+ r = cuCtxGetCurrent (&thd_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ /* We don't necessarily have a current context (e.g. if it has been
+ destroyed). Pop it if we do though. */
+ if (thd_ctx != NULL)
+ {
+ r = cuCtxPopCurrent (&old_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+ }
+
+ r = cuCtxPushCurrent (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+ }
}
-static void *
+static struct ptx_device *
nvptx_open_device (int n)
{
struct ptx_device *ptx_dev;
- CUdevice dev;
+ CUdevice dev, ctx_dev;
CUresult r;
int async_engines, pi;
@@ -628,6 +683,21 @@ nvptx_open_device (int n)
ptx_dev->dev = dev;
ptx_dev->ctx_shared = false;
+ r = cuCtxGetDevice (&ctx_dev);
+ if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
+ GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+
+ if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
+ {
+ /* The current host thread has an active context for a different device.
+ Detach it. */
+ CUcontext old_ctx;
+
+ r = cuCtxPopCurrent (&old_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+ }
+
r = cuCtxGetCurrent (&ptx_dev->ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
@@ -678,17 +748,16 @@ nvptx_open_device (int n)
init_streams_for_device (ptx_dev, async_engines);
- return (void *) ptx_dev;
+ return ptx_dev;
}
-static int
-nvptx_close_device (void *targ_data)
+static void
+nvptx_close_device (struct ptx_device *ptx_dev)
{
CUresult r;
- struct ptx_device *ptx_dev = targ_data;
if (!ptx_dev)
- return 0;
+ return;
fini_streams_for_device (ptx_dev);
@@ -700,8 +769,6 @@ nvptx_close_device (void *targ_data)
}
free (ptx_dev);
-
- return 0;
}
static int
@@ -714,7 +781,7 @@ nvptx_get_num_devices (void)
order to enumerate available devices, but CUDA API routines can't be used
until cuInit has been called. Just call it now (but don't yet do any
further initialization). */
- if (!ptx_inited)
+ if (instantiated_devices == 0)
cuInit (0);
r = cuDeviceGetCount (&n);
@@ -1507,64 +1574,84 @@ GOMP_OFFLOAD_get_num_devices (void)
return nvptx_get_num_devices ();
}
-static void **kernel_target_data;
-static void **kernel_host_table;
-
void
-GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+GOMP_OFFLOAD_init_device (int n)
{
- kernel_target_data = target_data;
- kernel_host_table = host_table;
-}
+ pthread_mutex_lock (&ptx_dev_lock);
-void
-GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
-{
- (void) nvptx_init ();
+ if (!nvptx_init () || ptx_devices[n] != NULL)
+ {
+ pthread_mutex_unlock (&ptx_dev_lock);
+ return;
+ }
+
+ ptx_devices[n] = nvptx_open_device (n);
+ instantiated_devices++;
+
+ pthread_mutex_unlock (&ptx_dev_lock);
}
void
-GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
+GOMP_OFFLOAD_fini_device (int n)
{
- nvptx_fini ();
+ pthread_mutex_lock (&ptx_dev_lock);
+
+ if (ptx_devices[n] != NULL)
+ {
+ nvptx_attach_host_thread_to_device (n);
+ nvptx_close_device (ptx_devices[n]);
+ ptx_devices[n] = NULL;
+ instantiated_devices--;
+ }
+
+ pthread_mutex_unlock (&ptx_dev_lock);
}
int
-GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
- struct mapping_table **tablep)
+GOMP_OFFLOAD_load_image (int ord, void *target_data,
+ struct addr_pair **target_table)
{
CUmodule module;
- void **fn_table;
- char **fn_names;
- int fn_entries, i;
+ char **fn_names, **var_names;
+ unsigned int fn_entries, var_entries, i, j;
CUresult r;
struct targ_fn_descriptor *targ_fns;
+ void **img_header = (void **) target_data;
+ struct ptx_image_data *new_image;
- if (nvptx_init () <= 0)
- return 0;
+ GOMP_OFFLOAD_init_device (ord);
- /* This isn't an error, because an image may legitimately have no offloaded
- regions and so will not call GOMP_offload_register. */
- if (kernel_target_data == NULL)
- return 0;
+ nvptx_attach_host_thread_to_device (ord);
+
+ link_ptx (&module, img_header[0]);
- link_ptx (&module, kernel_target_data[0]);
+ pthread_mutex_lock (&ptx_image_lock);
+ new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
+ new_image->target_data = target_data;
+ new_image->module = module;
+ new_image->next = ptx_images;
+ ptx_images = new_image;
+ pthread_mutex_unlock (&ptx_image_lock);
- /* kernel_target_data[0] -> ptx code
- kernel_target_data[1] -> variable mappings
- kernel_target_data[2] -> array of kernel names in ascii
+ /* The mkoffload utility emits a table of pointers/integers at the start of
+ each offload image:
- kernel_host_table[0] -> start of function addresses (__offload_func_table)
- kernel_host_table[1] -> end of function addresses (__offload_funcs_end)
+ img_header[0] -> ptx code
+ img_header[1] -> number of variables
+ img_header[2] -> array of variable names (pointers to strings)
+ img_header[3] -> number of kernels
+ img_header[4] -> array of kernel names (pointers to strings)
The array of kernel names and the functions addresses form a
one-to-one correspondence. */
- fn_table = kernel_host_table[0];
- fn_names = (char **) kernel_target_data[2];
- fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+ var_entries = (uintptr_t) img_header[1];
+ var_names = (char **) img_header[2];
+ fn_entries = (uintptr_t) img_header[3];
+ fn_names = (char **) img_header[4];
- *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+ *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
+ * (fn_entries + var_entries));
targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
* fn_entries);
@@ -1579,38 +1666,86 @@ GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
targ_fns[i].fn = function;
targ_fns[i].name = (const char *) fn_names[i];
- (*tablep)[i].host_start = (uintptr_t) fn_table[i];
- (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
- (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
- (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+ (*target_table)[i].start = (uintptr_t) &targ_fns[i];
+ (*target_table)[i].end = (*target_table)[i].start + 1;
}
- return fn_entries;
+ for (j = 0; j < var_entries; j++, i++)
+ {
+ CUdeviceptr var;
+ size_t bytes;
+
+ r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+
+ (*target_table)[i].start = (uintptr_t) var;
+ (*target_table)[i].end = (*target_table)[i].start + bytes;
+ }
+
+ return i;
+}
+
+void
+GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
+{
+ void **img_header = (void **) target_data;
+ struct targ_fn_descriptor *targ_fns
+ = (struct targ_fn_descriptor *) img_header[0];
+ struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
+
+ free (targ_fns);
+
+ pthread_mutex_lock (&ptx_image_lock);
+ for (image = ptx_images; image != NULL;)
+ {
+ struct ptx_image_data *next = image->next;
+
+ if (image->target_data == target_data)
+ {
+ cuModuleUnload (image->module);
+ free (image);
+ if (prev)
+ prev->next = next;
+ }
+ else
+ {
+ prev = image;
+ if (!newhd)
+ newhd = image;
+ }
+
+ image = next;
+ }
+ ptx_images = newhd;
+ pthread_mutex_unlock (&ptx_image_lock);
}
void *
-GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
+GOMP_OFFLOAD_alloc (int ord, size_t size)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_alloc (size);
}
void
-GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
+GOMP_OFFLOAD_free (int ord, void *ptr)
{
+ nvptx_attach_host_thread_to_device (ord);
nvptx_free (ptr);
}
void *
-GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
- const void *src, size_t n)
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_dev2host (dst, src, n);
}
void *
-GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
- const void *src, size_t n)
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
+ nvptx_attach_host_thread_to_device (ord);
return nvptx_host2dev (dst, src, n);
}
@@ -1627,45 +1762,6 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
num_workers, vector_length, async, targ_mem_desc);
}
-void *
-GOMP_OFFLOAD_openacc_open_device (int n)
-{
- return nvptx_open_device (n);
-}
-
-int
-GOMP_OFFLOAD_openacc_close_device (void *h)
-{
- return nvptx_close_device (h);
-}
-
-void
-GOMP_OFFLOAD_openacc_set_device_num (int n)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (n >= 0);
-
- if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
- (void) nvptx_open_device (n);
-}
-
-/* This can be called before the device is "opened" for the current thread, in
- which case we can't tell which device number should be returned. We don't
- actually want to open the device here, so just return -1 and let the caller
- (oacc-init.c:acc_get_device_num) handle it. */
-
-int
-GOMP_OFFLOAD_openacc_get_device_num (void)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (nvthd && nvthd->ptx_dev)
- return nvthd->ptx_dev->ord;
- else
- return -1;
-}
-
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
@@ -1729,14 +1825,18 @@ GOMP_OFFLOAD_openacc_async_set_async (int async)
}
void *
-GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
- struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
+ struct ptx_device *ptx_dev;
struct nvptx_thread *nvthd
= GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
CUresult r;
CUcontext thd_ctx;
+ ptx_dev = ptx_devices[ord];
+
+ assert (ptx_dev);
+
r = cuCtxGetCurrent (&thd_ctx);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));