diff options
author | Nathan Sidwell <nathan@gcc.gnu.org> | 2015-09-28 19:37:33 +0000 |
---|---|---|
committer | Nathan Sidwell <nathan@gcc.gnu.org> | 2015-09-28 19:37:33 +0000 |
commit | 3e32ee19a56d9defea32f54788e1ef12657bc307 (patch) | |
tree | 82bea03a2a53289a91a90b899f03a9be503a60df /libgomp | |
parent | 4e671509d92cd4dc7d28d7bb92e46092afcd7370 (diff) | |
download | gcc-3e32ee19a56d9defea32f54788e1ef12657bc307.zip gcc-3e32ee19a56d9defea32f54788e1ef12657bc307.tar.gz gcc-3e32ee19a56d9defea32f54788e1ef12657bc307.tar.bz2 |
gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment.
include/
* gomp-constants.h (GOMP_VERSION_NVIDIA_PTX): Increment.
(GOMP_DIM_GANG, GOMP_DIM_WORKER, GOMP_DIM_VECTOR, GOMP_DIM_MAX,
GOMP_DIM_MASK): New.
(GOMP_LAUNCH_DIM, GOMP_LAUNCH_ASYNC, GOMP_LAUNCH_WAIT): New.
(GOMP_LAUNCH_CODE_SHIFT, GOMP_LAUNCH_DEVICE_SHIFT,
GOMP_LAUNCH_OP_SHIFT): New.
(GOMP_LAUNCH_PACK, GOMP_LAUNCH_CODE, GOMP_LAUNCH_DEVICE,
GOMP_LAUNCH_OP): New.
(GOMP_LAUNCH_OP_MAX): New.
libgomp/
* libgomp.h (acc_dispatch_t): Replace separate geometry args with
array.
* libgomp.map (GOACC_parallel_keyed): New.
* oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust
all callers.
(GOACC_parallel_keyed): New interface. Lose geometry arguments
and take keyed varargs list. Adjust call to exec_func.
(GOACC_parallel): Force host fallback.
* libgomp_g.h (GOACC_parallel): Remove.
(GOACC_parallel_keyed): Declare.
* plugin/plugin-nvptx.c (struct targ_fn_launch): New struct.
(struct targ_fn_descriptor): Replace name field with launch field.
(nvptx_exec): Lose separate geometry args, take array. Process
dynamic dimensions and adjust.
(struct nvptx_tdata): Replace fn_names field with fn_descs.
(GOMP_OFFLOAD_load_image): Adjust for change in function table
data.
(GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension
passing.
* oacc-host.c (host_openacc_exec): Adjust for change in dimension
passing.
gcc/
* config/nvptx/nvptx.c: Include omp-low.h and gomp-constants.h.
(nvptx_record_offload_symbol): Record function execution geometry.
* config/nvptx/mkoffload.c (process): Include launch geometry in
function data.
* omp-low.c (oacc_launch_pack): New.
(replace_oacc_fn_attrib): New.
(set_oacc_fn_attrib): New.
(get_oacc_fn_attrib): New.
(expand_omp_target): Create keyed varargs for GOACC_parallel call
generation.
* omp-low.h (get_oacc_fn_attrib): Declare.
* builtin-types.def (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
* tree.h (OMP_CLAUSE_EXPR): New.
* omp-builtins.def (BUILT_IN_GOACC_PARALLEL): Change target fn name.
gcc/lto/
* lto-lang.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/c-family/
* c-common.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/fortran/
* f95-lang.c (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
* types.def (DEF_FUNCTION_TYPE_VAR_6): New.
(DEF_FUNCTION_TYPE_VAR_11): Delete.
gcc/ada/
* gcc-interface/utils.c (DEF_FUNCTION_TYPE_VAR_6): Define.
From-SVN: r228220
Diffstat (limited to 'libgomp')
-rw-r--r-- | libgomp/ChangeLog | 24 | ||||
-rw-r--r-- | libgomp/libgomp.h | 2 | ||||
-rw-r--r-- | libgomp/libgomp.map | 5 | ||||
-rw-r--r-- | libgomp/libgomp_g.h | 5 | ||||
-rw-r--r-- | libgomp/oacc-host.c | 4 | ||||
-rw-r--r-- | libgomp/oacc-parallel.c | 114 | ||||
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 85 |
7 files changed, 161 insertions, 78 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 8e5655d..7c1b20f 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,27 @@ +2015-09-28 Nathan Sidwell <nathan@codesourcery.com> + + * libgomp.h (acc_dispatch_t): Replace separate geometry args with + array. + * libgomp.map (GOACC_parallel_keyed): New. + * oacc-parallel.c (goacc_wait): Take pointer to va_list. Adjust + all callers. + (GOACC_parallel_keyed): New interface. Lose geometry arguments + and take keyed varargs list. Adjust call to exec_func. + (GOACC_parallel): Force host fallback. + * libgomp_g.h (GOACC_parallel): Remove. + (GOACC_parallel_keyed): Declare. + * plugin/plugin-nvptx.c (struct targ_fn_launch): New struct. + (struct targ_fn_descriptor): Replace name field with launch field. + (nvptx_exec): Lose separate geometry args, take array. Process + dynamic dimensions and adjust. + (struct nvptx_tdata): Replace fn_names field with fn_descs. + (GOMP_OFFLOAD_load_image): Adjust for change in function table + data. + (GOMP_OFFLOAD_openacc_parallel): Adjust for change in dimension + passing. + * oacc-host.c (host_openacc_exec): Adjust for change in dimension + passing. + 2015-09-22 Chung-Lin Tang <cltang@codesourcery.com> PR libgomp/67141 diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 2ea1c5d..04262c4 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -695,7 +695,7 @@ typedef struct acc_dispatch_t /* Execute. */ void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *, - unsigned short *, int, int, int, int, void *); + unsigned short *, int, unsigned *, void *); /* Async cleanup callback registration. 
*/ void (*register_async_cleanup_func) (void *); diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index ec3c3c1..3b3e0c2 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -332,6 +332,11 @@ GOACC_2.0 { GOACC_get_num_threads; }; +GOACC_2.0.1 { + global: + GOACC_parallel_keyed; +} GOACC_2.0; + GOMP_PLUGIN_1.0 { global: GOMP_PLUGIN_malloc; diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h index 5e88d45..e7f4eff 100644 --- a/libgomp/libgomp_g.h +++ b/libgomp/libgomp_g.h @@ -222,9 +222,8 @@ extern void GOACC_data_start (int, size_t, void **, size_t *, extern void GOACC_data_end (void); extern void GOACC_enter_exit_data (int, size_t, void **, size_t *, unsigned short *, int, int, ...); -extern void GOACC_parallel (int, void (*) (void *), size_t, - void **, size_t *, unsigned short *, int, int, int, - int, int, ...); +extern void GOACC_parallel_keyed (int, void (*) (void *), size_t, + void **, size_t *, unsigned short *, ...); extern void GOACC_update (int, size_t, void **, size_t *, unsigned short *, int, int, ...); extern void GOACC_wait (int, int, ...); diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index 284bb0e..8e4ba04 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -137,10 +137,8 @@ host_openacc_exec (void (*fn) (void *), void **devaddrs __attribute__ ((unused)), size_t *sizes __attribute__ ((unused)), unsigned short *kinds __attribute__ ((unused)), - int num_gangs __attribute__ ((unused)), - int num_workers __attribute__ ((unused)), - int vector_length __attribute__ ((unused)), int async __attribute__ ((unused)), + unsigned *dims __attribute ((unused)), void *targ_mem_desc __attribute__ ((unused))) { fn (hostaddrs); diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index d0e5f94..e31bc0a 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -49,14 +49,18 @@ find_pset (int pos, size_t mapnum, unsigned short *kinds) return kind == GOMP_MAP_TO_PSET; } -static void goacc_wait (int async, int 
num_waits, va_list ap); +static void goacc_wait (int async, int num_waits, va_list *ap); + + +/* Launch a possibly offloaded function on DEVICE. FN is the host fn + address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory + blocks to be copied to/from the device. Variadic arguments are + keyed optional parameters terminated with a zero. */ void -GOACC_parallel (int device, void (*fn) (void *), - size_t mapnum, void **hostaddrs, size_t *sizes, - unsigned short *kinds, - int num_gangs, int num_workers, int vector_length, - int async, int num_waits, ...) +GOACC_parallel_keyed (int device, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, ...) { bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; va_list ap; @@ -68,22 +72,16 @@ GOACC_parallel (int device, void (*fn) (void *), struct splay_tree_key_s k; splay_tree_key tgt_fn_key; void (*tgt_fn); - - if (num_gangs != 1) - gomp_fatal ("num_gangs (%d) different from one is not yet supported", - num_gangs); - if (num_workers != 1) - gomp_fatal ("num_workers (%d) different from one is not yet supported", - num_workers); + int async = GOMP_ASYNC_SYNC; + unsigned dims[GOMP_DIM_MAX]; + unsigned tag; #ifdef HAVE_INTTYPES_H - gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p, " - "async = %d\n", - __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds, async); + gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n", + __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds); #else - gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p, async=%d\n", - __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds, - async); + gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n", + __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds); #endif goacc_lazy_initialize (); @@ -105,12 +103,51 @@ GOACC_parallel (int device, void (*fn) (void *), return; } - if (num_waits) + va_start (ap, kinds); + /* TODO: 
This will need amending when device_type is implemented. */ + while ((tag = va_arg (ap, unsigned)) != 0) { - va_start (ap, num_waits); - goacc_wait (async, num_waits, ap); - va_end (ap); + if (GOMP_LAUNCH_DEVICE (tag)) + gomp_fatal ("device_type '%d' offload parameters, libgomp is too old", + GOMP_LAUNCH_DEVICE (tag)); + + switch (GOMP_LAUNCH_CODE (tag)) + { + case GOMP_LAUNCH_DIM: + { + unsigned mask = GOMP_LAUNCH_OP (tag); + + for (i = 0; i != GOMP_DIM_MAX; i++) + if (mask & GOMP_DIM_MASK (i)) + dims[i] = va_arg (ap, unsigned); + } + break; + + case GOMP_LAUNCH_ASYNC: + { + /* Small constant values are encoded in the operand. */ + async = GOMP_LAUNCH_OP (tag); + + if (async == GOMP_LAUNCH_OP_MAX) + async = va_arg (ap, unsigned); + break; + } + + case GOMP_LAUNCH_WAIT: + { + unsigned num_waits = GOMP_LAUNCH_OP (tag); + + if (num_waits) + goacc_wait (async, num_waits, &ap); + break; + } + + default: + gomp_fatal ("unrecognized offload code '%d'," + " libgomp is too old", GOMP_LAUNCH_CODE (tag)); + } } + va_end (ap); acc_dev->openacc.async_set_async_func (async); @@ -138,9 +175,8 @@ GOACC_parallel (int device, void (*fn) (void *), devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start + tgt->list[i]->tgt_offset); - acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, kinds, - num_gangs, num_workers, vector_length, async, - tgt); + acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes, + kinds, async, dims, tgt); /* If running synchronously, unmap immediately. */ if (async < acc_async_noval) @@ -154,6 +190,20 @@ GOACC_parallel (int device, void (*fn) (void *), acc_dev->openacc.async_set_async_func (acc_async_sync); } +/* Legacy entry point, only provide host execution. */ + +void +GOACC_parallel (int device, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, + int num_gangs, int num_workers, int vector_length, + int async, int num_waits, ...) 
+{ + goacc_save_and_set_bind (acc_device_host); + fn (hostaddrs); + goacc_restore_bind (); +} + void GOACC_data_start (int device, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned short *kinds) @@ -230,7 +280,7 @@ GOACC_enter_exit_data (int device, size_t mapnum, va_list ap; va_start (ap, num_waits); - goacc_wait (async, num_waits, ap); + goacc_wait (async, num_waits, &ap); va_end (ap); } @@ -344,15 +394,15 @@ GOACC_enter_exit_data (int device, size_t mapnum, } static void -goacc_wait (int async, int num_waits, va_list ap) +goacc_wait (int async, int num_waits, va_list *ap) { struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; while (num_waits--) { - int qid = va_arg (ap, int); - + int qid = va_arg (*ap, int); + if (acc_async_test (qid)) continue; @@ -389,7 +439,7 @@ GOACC_update (int device, size_t mapnum, va_list ap; va_start (ap, num_waits); - goacc_wait (async, num_waits, ap); + goacc_wait (async, num_waits, &ap); va_end (ap); } @@ -430,7 +480,7 @@ GOACC_wait (int async, int num_waits, ...) va_list ap; va_start (ap, num_waits); - goacc_wait (async, num_waits, ap); + goacc_wait (async, num_waits, &ap); va_end (ap); } else if (async == acc_async_sync) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index a2f950d..0c4e1af 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -282,12 +282,20 @@ map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) return; } +/* Target data function launch information. */ + +struct targ_fn_launch +{ + const char *fn; + unsigned short dim[3]; +}; + /* Descriptor of a loaded function. */ struct targ_fn_descriptor { CUfunction fn; - const char *name; + const struct targ_fn_launch *launch; }; /* A loaded PTX image. 
*/ @@ -929,8 +937,8 @@ event_add (enum ptx_event_type type, CUevent *e, void *h) void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, - int vector_length, int async, void *targ_mem_desc) + size_t *sizes, unsigned short *kinds, int async, unsigned *dims, + void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; @@ -939,7 +947,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; - unsigned int nthreads_in_block; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -948,6 +955,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); + /* Initialize the launch dimensions. Typically this is constant, + provided by the device compiler, but we must permit runtime + values. */ + for (i = 0; i != 3; i++) + if (targ_fn->launch->dim[i]) + dims[i] = targ_fn->launch->dim[i]; + + if (dims[GOMP_DIM_GANG] != 1) + GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported", + dims[GOMP_DIM_GANG]); + if (dims[GOMP_DIM_WORKER] != 1) + GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported", + dims[GOMP_DIM_WORKER]); + /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. 
*/ @@ -965,35 +986,21 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); - GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name); + GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" + " gangs=%u, workers=%u, vectors=%u\n", + __FUNCTION__, targ_fn->launch->fn, + dims[0], dims[1], dims[2]); // OpenACC CUDA // - // num_gangs blocks - // num_workers warps (where a warp is equivalent to 32 threads) - // vector length threads - // - - /* The openacc vector_length clause 'determines the vector length to use for - vector or SIMD operations'. The question is how to map this to CUDA. - - In CUDA, the warp size is the vector length of a CUDA device. However, the - CUDA interface abstracts away from that, and only shows us warp size - indirectly in maximum number of threads per block, which is a product of - warp size and the number of hyperthreads of a multiprocessor. - - We choose to map openacc vector_length directly onto the number of threads - in a block, in the x dimension. This is reflected in gcc code generation - that uses ThreadIdx.x to access vector elements. - - Attempting to use an openacc vector_length of more than the maximum number - of threads per block will result in a cuda error. 
*/ - nthreads_in_block = vector_length; + // num_gangs nctaid.x + // num_workers ntid.y + // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, - num_gangs, 1, 1, - nthreads_in_block, 1, 1, + dims[GOMP_DIM_GANG], 1, 1, + dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); @@ -1039,7 +1046,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, - targ_fn->name); + targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) @@ -1567,7 +1574,7 @@ typedef struct nvptx_tdata const char *const *var_names; size_t var_num; - const char *const *fn_names; + const struct targ_fn_launch *fn_descs; size_t fn_num; } nvptx_tdata_t; @@ -1588,7 +1595,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, struct addr_pair **target_table) { CUmodule module; - const char *const *fn_names, *const *var_names; + const char *const *var_names; + const struct targ_fn_launch *fn_descs; unsigned int fn_entries, var_entries, i, j; CUresult r; struct targ_fn_descriptor *targ_fns; @@ -1617,7 +1625,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, var_entries = img_header->var_num; var_names = img_header->var_names; fn_entries = img_header->fn_num; - fn_names = img_header->fn_names; + fn_descs = img_header->fn_descs; targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) * (fn_entries + var_entries)); @@ -1640,12 +1648,12 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, { CUfunction function; - r = cuModuleGetFunction (&function, module, fn_names[i]); + r = cuModuleGetFunction (&function, module, fn_descs[i].fn); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r)); targ_fns->fn = function; - targ_fns->name = (const char *) 
fn_names[i]; + targ_fns->launch = &fn_descs[i]; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; @@ -1724,13 +1732,12 @@ void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; void GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, - void **hostaddrs, void **devaddrs, size_t *sizes, - unsigned short *kinds, int num_gangs, - int num_workers, int vector_length, int async, - void *targ_mem_desc) + void **hostaddrs, void **devaddrs, + size_t *sizes, unsigned short *kinds, + int async, unsigned *dims, void *targ_mem_desc) { - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs, - num_workers, vector_length, async, targ_mem_desc); + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, + async, dims, targ_mem_desc); } void |