aboutsummaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
authorNathan Sidwell <nathan@gcc.gnu.org>2015-09-28 19:37:33 +0000
committerNathan Sidwell <nathan@gcc.gnu.org>2015-09-28 19:37:33 +0000
commit3e32ee19a56d9defea32f54788e1ef12657bc307 (patch)
tree82bea03a2a53289a91a90b899f03a9be503a60df /libgomp
parent4e671509d92cd4dc7d28d7bb92e46092afcd7370 (diff)
downloadgcc-3e32ee19a56d9defea32f54788e1ef12657bc307.zip
gcc-3e32ee19a56d9defea32f54788e1ef12657bc307.tar.gz
gcc-3e32ee19a56d9defea32f54788e1ef12657bc307.tar.bz2
gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment.
include/ * gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment. (GOMP_DIM_GANG, GOMP_DIM_WORKER, GOMP_DIM_VECTOR, GOMP_DIM_MAX, GOMP_DIM_MASK): New. (GOMP_LAUNCH_DIM, GOMP_LAUNCH_ASYNC, GOMP_LAUNCH_WAIT): New. (GOMP_LAUNCH_CODE_SHIFT, GOMP_LAUNCH_DEVICE_SHIFT, GOMP_LAUNCH_OP_SHIFT): New. (GOMP_LAUNCH_PACK, GOMP_LAUNCH_CODE, GOMP_LAUNCH_DEVICE, GOMP_LAUNCH_OP): New. (GOMP_LAUNCH_OP_MAX): New. libgomp/ * libgomp.h (acc_dispatch_t): Replace separate geometry args with array. * libgomp.map (GOACC_parallel_keyed): New. * oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust all callers. (GOACC_parallel_keyed): New interface. Lose geometry arguments and take keyed varargs list. Adjust call to exec_func. (GOACC_parallel): Force host fallback. * libgomp_g.h (GOACC_parallel): Remove. (GOACC_parallel_keyed): Declare. * plugin/plugin-nvptx.c (struct targ_fn_launch): New struct. (struct targ_fn_descriptor): Replace name field with launch field. (nvptx_exec): Lose separate geometry args, take array. Process dynamic dimensions and adjust. (struct nvptx_tdata): Replace fn_names field with fn_descs. (GOMP_OFFLOAD_load_image): Adjust for change in function table data. (GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension passing. * oacc-host.c (host_openacc_exec): Adjust for change in dimension passing. gcc/ * config/nvptx/nvptx.c: Include omp-low.h and gomp-constants.h. (nvptx_record_offload_symbol): Record function execution geometry. * config/nvptx/mkoffload.c (process): Include launch geometry in function data. * omp-low.c (oacc_launch_pack): New. (replace_oacc_fn_attrib): New. (set_oacc_fn_attrib): New. (get_oacc_fn_attrib): New. (expand_omp_target): Create keyed varargs for GOACC_parallel call generation. * omp-low.h (get_oacc_fn_attrib): Declare. * builtin-types.def (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. * tree.h (OMP_CLAUSE_EXPR): New. * omp-builtins.def (BUILT_IN_GOACC_PARALLEL): Change target fn name. 
gcc/lto/ * lto-lang.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/c-family/ * c-common.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/fortran/ * f95-lang.c (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. * types.def (DEF_FUNCTION_TYPE_VAR_6): New. (DEF_FUNCTION_TYPE_VAR_11): Delete. gcc/ada/ * gcc-interface/utils.c (DEF_FUNCTION_TYPE_VAR_6): Define From-SVN: r228220
Diffstat (limited to 'libgomp')
-rw-r--r--libgomp/ChangeLog24
-rw-r--r--libgomp/libgomp.h2
-rw-r--r--libgomp/libgomp.map5
-rw-r--r--libgomp/libgomp_g.h5
-rw-r--r--libgomp/oacc-host.c4
-rw-r--r--libgomp/oacc-parallel.c114
-rw-r--r--libgomp/plugin/plugin-nvptx.c85
7 files changed, 161 insertions, 78 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 8e5655d..7c1b20f 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,27 @@
+2015-09-28 Nathan Sidwell <nathan@codesourcery.com>
+
+ * libgomp.h (acc_dispatch_t): Replace separate geometry args with
+ array.
+ * libgomp.map (GOACC_parallel_keyed): New.
+ * oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust
+ all callers.
+ (GOACC_parallel_keyed): New interface. Lose geometry arguments
+ and take keyed varargs list. Adjust call to exec_func.
+ (GOACC_parallel): Force host fallback.
+ * libgomp_g.h (GOACC_parallel): Remove.
+ (GOACC_parallel_keyed): Declare.
+ * plugin/plugin-nvptx.c (struct targ_fn_launch): New struct.
+ (struct targ_fn_descriptor): Replace name field with launch field.
+ (nvptx_exec): Lose separate geometry args, take array. Process
+ dynamic dimensions and adjust.
+ (struct nvptx_tdata): Replace fn_names field with fn_descs.
+ (GOMP_OFFLOAD_load_image): Adjust for change in function table
+ data.
+ (GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension
+ passing.
+ * oacc-host.c (host_openacc_exec): Adjust for change in dimension
+ passing.
+
2015-09-22 Chung-Lin Tang <cltang@codesourcery.com>
PR libgomp/67141
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 2ea1c5d..04262c4 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -695,7 +695,7 @@ typedef struct acc_dispatch_t
/* Execute. */
void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *,
- unsigned short *, int, int, int, int, void *);
+ unsigned short *, int, unsigned *, void *);
/* Async cleanup callback registration. */
void (*register_async_cleanup_func) (void *);
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index ec3c3c1..3b3e0c2 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -332,6 +332,11 @@ GOACC_2.0 {
GOACC_get_num_threads;
};
+GOACC_2.0.1 {
+ global:
+ GOACC_parallel_keyed;
+} GOACC_2.0;
+
GOMP_PLUGIN_1.0 {
global:
GOMP_PLUGIN_malloc;
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 5e88d45..e7f4eff 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -222,9 +222,8 @@ extern void GOACC_data_start (int, size_t, void **, size_t *,
extern void GOACC_data_end (void);
extern void GOACC_enter_exit_data (int, size_t, void **,
size_t *, unsigned short *, int, int, ...);
-extern void GOACC_parallel (int, void (*) (void *), size_t,
- void **, size_t *, unsigned short *, int, int, int,
- int, int, ...);
+extern void GOACC_parallel_keyed (int, void (*) (void *), size_t,
+ void **, size_t *, unsigned short *, ...);
extern void GOACC_update (int, size_t, void **, size_t *,
unsigned short *, int, int, ...);
extern void GOACC_wait (int, int, ...);
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 284bb0e..8e4ba04 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -137,10 +137,8 @@ host_openacc_exec (void (*fn) (void *),
void **devaddrs __attribute__ ((unused)),
size_t *sizes __attribute__ ((unused)),
unsigned short *kinds __attribute__ ((unused)),
- int num_gangs __attribute__ ((unused)),
- int num_workers __attribute__ ((unused)),
- int vector_length __attribute__ ((unused)),
int async __attribute__ ((unused)),
+ unsigned *dims __attribute__ ((unused)),
void *targ_mem_desc __attribute__ ((unused)))
{
fn (hostaddrs);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index d0e5f94..e31bc0a 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -49,14 +49,18 @@ find_pset (int pos, size_t mapnum, unsigned short *kinds)
return kind == GOMP_MAP_TO_PSET;
}
-static void goacc_wait (int async, int num_waits, va_list ap);
+static void goacc_wait (int async, int num_waits, va_list *ap);
+
+
+/* Launch a possibly offloaded function on DEVICE. FN is the host fn
+ address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory
+ blocks to be copied to/from the device. Variadic arguments are
+ keyed optional parameters terminated with a zero. */
void
-GOACC_parallel (int device, void (*fn) (void *),
- size_t mapnum, void **hostaddrs, size_t *sizes,
- unsigned short *kinds,
- int num_gangs, int num_workers, int vector_length,
- int async, int num_waits, ...)
+GOACC_parallel_keyed (int device, void (*fn) (void *),
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds, ...)
{
bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
va_list ap;
@@ -68,22 +72,16 @@ GOACC_parallel (int device, void (*fn) (void *),
struct splay_tree_key_s k;
splay_tree_key tgt_fn_key;
void (*tgt_fn);
-
- if (num_gangs != 1)
- gomp_fatal ("num_gangs (%d) different from one is not yet supported",
- num_gangs);
- if (num_workers != 1)
- gomp_fatal ("num_workers (%d) different from one is not yet supported",
- num_workers);
+ int async = GOMP_ASYNC_SYNC;
+ unsigned dims[GOMP_DIM_MAX];
+ unsigned tag;
#ifdef HAVE_INTTYPES_H
- gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p, "
- "async = %d\n",
- __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds, async);
+ gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
+ __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds);
#else
- gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p, async=%d\n",
- __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds,
- async);
+ gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n",
+ __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
#endif
goacc_lazy_initialize ();
@@ -105,12 +103,51 @@ GOACC_parallel (int device, void (*fn) (void *),
return;
}
- if (num_waits)
+ va_start (ap, kinds);
+ /* TODO: This will need amending when device_type is implemented. */
+ while ((tag = va_arg (ap, unsigned)) != 0)
{
- va_start (ap, num_waits);
- goacc_wait (async, num_waits, ap);
- va_end (ap);
+ if (GOMP_LAUNCH_DEVICE (tag))
+ gomp_fatal ("device_type '%d' offload parameters, libgomp is too old",
+ GOMP_LAUNCH_DEVICE (tag));
+
+ switch (GOMP_LAUNCH_CODE (tag))
+ {
+ case GOMP_LAUNCH_DIM:
+ {
+ unsigned mask = GOMP_LAUNCH_OP (tag);
+
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ if (mask & GOMP_DIM_MASK (i))
+ dims[i] = va_arg (ap, unsigned);
+ }
+ break;
+
+ case GOMP_LAUNCH_ASYNC:
+ {
+ /* Small constant values are encoded in the operand. */
+ async = GOMP_LAUNCH_OP (tag);
+
+ if (async == GOMP_LAUNCH_OP_MAX)
+ async = va_arg (ap, unsigned);
+ break;
+ }
+
+ case GOMP_LAUNCH_WAIT:
+ {
+ unsigned num_waits = GOMP_LAUNCH_OP (tag);
+
+ if (num_waits)
+ goacc_wait (async, num_waits, &ap);
+ break;
+ }
+
+ default:
+ gomp_fatal ("unrecognized offload code '%d',"
+ " libgomp is too old", GOMP_LAUNCH_CODE (tag));
+ }
}
+ va_end (ap);
acc_dev->openacc.async_set_async_func (async);
@@ -138,9 +175,8 @@ GOACC_parallel (int device, void (*fn) (void *),
devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
+ tgt->list[i]->tgt_offset);
- acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
- num_gangs, num_workers, vector_length, async,
- tgt);
+ acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes,
+ kinds, async, dims, tgt);
/* If running synchronously, unmap immediately. */
if (async < acc_async_noval)
@@ -154,6 +190,20 @@ GOACC_parallel (int device, void (*fn) (void *),
acc_dev->openacc.async_set_async_func (acc_async_sync);
}
+/* Legacy entry point; only provides host execution. */
+
+void
+GOACC_parallel (int device, void (*fn) (void *),
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds,
+ int num_gangs, int num_workers, int vector_length,
+ int async, int num_waits, ...)
+{
+ goacc_save_and_set_bind (acc_device_host);
+ fn (hostaddrs);
+ goacc_restore_bind ();
+}
+
void
GOACC_data_start (int device, size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds)
@@ -230,7 +280,7 @@ GOACC_enter_exit_data (int device, size_t mapnum,
va_list ap;
va_start (ap, num_waits);
- goacc_wait (async, num_waits, ap);
+ goacc_wait (async, num_waits, &ap);
va_end (ap);
}
@@ -344,15 +394,15 @@ GOACC_enter_exit_data (int device, size_t mapnum,
}
static void
-goacc_wait (int async, int num_waits, va_list ap)
+goacc_wait (int async, int num_waits, va_list *ap)
{
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
while (num_waits--)
{
- int qid = va_arg (ap, int);
-
+ int qid = va_arg (*ap, int);
+
if (acc_async_test (qid))
continue;
@@ -389,7 +439,7 @@ GOACC_update (int device, size_t mapnum,
va_list ap;
va_start (ap, num_waits);
- goacc_wait (async, num_waits, ap);
+ goacc_wait (async, num_waits, &ap);
va_end (ap);
}
@@ -430,7 +480,7 @@ GOACC_wait (int async, int num_waits, ...)
va_list ap;
va_start (ap, num_waits);
- goacc_wait (async, num_waits, ap);
+ goacc_wait (async, num_waits, &ap);
va_end (ap);
}
else if (async == acc_async_sync)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a2f950d..0c4e1af 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -282,12 +282,20 @@ map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
return;
}
+/* Target data function launch information. */
+
+struct targ_fn_launch
+{
+ const char *fn;
+ unsigned short dim[3];
+};
+
/* Descriptor of a loaded function. */
struct targ_fn_descriptor
{
CUfunction fn;
- const char *name;
+ const struct targ_fn_launch *launch;
};
/* A loaded PTX image. */
@@ -929,8 +937,8 @@ event_add (enum ptx_event_type type, CUevent *e, void *h)
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
- int vector_length, int async, void *targ_mem_desc)
+ size_t *sizes, unsigned short *kinds, int async, unsigned *dims,
+ void *targ_mem_desc)
{
struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
CUfunction function;
@@ -939,7 +947,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
struct ptx_stream *dev_str;
void *kargs[1];
void *hp, *dp;
- unsigned int nthreads_in_block;
struct nvptx_thread *nvthd = nvptx_thread ();
const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -948,6 +955,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
assert (dev_str == nvthd->current_stream);
+ /* Initialize the launch dimensions. Typically this is constant,
+ provided by the device compiler, but we must permit runtime
+ values. */
+ for (i = 0; i != 3; i++)
+ if (targ_fn->launch->dim[i])
+ dims[i] = targ_fn->launch->dim[i];
+
+ if (dims[GOMP_DIM_GANG] != 1)
+ GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported",
+ dims[GOMP_DIM_GANG]);
+ if (dims[GOMP_DIM_WORKER] != 1)
+ GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported",
+ dims[GOMP_DIM_WORKER]);
+
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
the corresponding device pointer. */
@@ -965,35 +986,21 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
- GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+ GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
+ " gangs=%u, workers=%u, vectors=%u\n",
+ __FUNCTION__, targ_fn->launch->fn,
+ dims[0], dims[1], dims[2]);
// OpenACC CUDA
//
- // num_gangs blocks
- // num_workers warps (where a warp is equivalent to 32 threads)
- // vector length threads
- //
-
- /* The openacc vector_length clause 'determines the vector length to use for
- vector or SIMD operations'. The question is how to map this to CUDA.
-
- In CUDA, the warp size is the vector length of a CUDA device. However, the
- CUDA interface abstracts away from that, and only shows us warp size
- indirectly in maximum number of threads per block, which is a product of
- warp size and the number of hyperthreads of a multiprocessor.
-
- We choose to map openacc vector_length directly onto the number of threads
- in a block, in the x dimension. This is reflected in gcc code generation
- that uses ThreadIdx.x to access vector elements.
-
- Attempting to use an openacc vector_length of more than the maximum number
- of threads per block will result in a cuda error. */
- nthreads_in_block = vector_length;
+ // num_gangs nctaid.x
+ // num_workers ntid.y
+ // vector length ntid.x
kargs[0] = &dp;
r = cuLaunchKernel (function,
- num_gangs, 1, 1,
- nthreads_in_block, 1, 1,
+ dims[GOMP_DIM_GANG], 1, 1,
+ dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
0, dev_str->stream, kargs, 0);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
@@ -1039,7 +1046,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
#endif
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
- targ_fn->name);
+ targ_fn->launch->fn);
#ifndef DISABLE_ASYNC
if (async < acc_async_noval)
@@ -1567,7 +1574,7 @@ typedef struct nvptx_tdata
const char *const *var_names;
size_t var_num;
- const char *const *fn_names;
+ const struct targ_fn_launch *fn_descs;
size_t fn_num;
} nvptx_tdata_t;
@@ -1588,7 +1595,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
struct addr_pair **target_table)
{
CUmodule module;
- const char *const *fn_names, *const *var_names;
+ const char *const *var_names;
+ const struct targ_fn_launch *fn_descs;
unsigned int fn_entries, var_entries, i, j;
CUresult r;
struct targ_fn_descriptor *targ_fns;
@@ -1617,7 +1625,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
var_entries = img_header->var_num;
var_names = img_header->var_names;
fn_entries = img_header->fn_num;
- fn_names = img_header->fn_names;
+ fn_descs = img_header->fn_descs;
targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
* (fn_entries + var_entries));
@@ -1640,12 +1648,12 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
{
CUfunction function;
- r = cuModuleGetFunction (&function, module, fn_names[i]);
+ r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
targ_fns->fn = function;
- targ_fns->name = (const char *) fn_names[i];
+ targ_fns->launch = &fn_descs[i];
targ_tbl->start = (uintptr_t) targ_fns;
targ_tbl->end = targ_tbl->start + 1;
@@ -1724,13 +1732,12 @@ void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
- void **hostaddrs, void **devaddrs, size_t *sizes,
- unsigned short *kinds, int num_gangs,
- int num_workers, int vector_length, int async,
- void *targ_mem_desc)
+ void **hostaddrs, void **devaddrs,
+ size_t *sizes, unsigned short *kinds,
+ int async, unsigned *dims, void *targ_mem_desc)
{
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
- num_workers, vector_length, async, targ_mem_desc);
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
+ async, dims, targ_mem_desc);
}
void