aboutsummaryrefslogtreecommitdiff
path: root/libgomp/plugin/plugin-nvptx.c
diff options
context:
space:
mode:
authorAlexander Monakov <amonakov@ispras.ru>2016-11-23 21:36:41 +0300
committerAlexander Monakov <amonakov@gcc.gnu.org>2016-11-23 21:36:41 +0300
commit6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15 (patch)
treeafbc9d83e9693712185a8afd9d6593f23eef2734 /libgomp/plugin/plugin-nvptx.c
parent6251fe936f3eab1542c1581646254e4bae0f6688 (diff)
downloadgcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.zip
gcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.tar.gz
gcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.tar.bz2
OpenMP offloading to NVPTX: libgomp changes
* Makefile.am (libgomp_la_SOURCES): Add atomic.c, icv.c, icv-device.c. * Makefile.in. Regenerate. * configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it... (LIBGOMP_USE_PTHREADS): ...here; new define. * configure: Regenerate. * config.h.in: Likewise. * config/posix/affinity.c: Move to... * affinity.c: ...here (new file). Guard use of Pthreads-specific interface by LIBGOMP_USE_PTHREADS. * critical.c: Split out GOMP_atomic_{start,end} into... * atomic.c: ...here (new file). * env.c: Split out ICV definitions into... * icv.c: ...here (new file) and... * icv-device.c: ...here. New file. * config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c. (gomp_destroy_lock_30): Ditto. (gomp_set_lock_30): Ditto. (gomp_unset_lock_30): Ditto. (gomp_test_lock_30): Ditto. (gomp_init_nest_lock_30): Ditto. (gomp_destroy_nest_lock_30): Ditto. (gomp_set_nest_lock_30): Ditto. (gomp_unset_nest_lock_30): Ditto. (gomp_test_nest_lock_30): Ditto. * lock.c: New. * config/nvptx/lock.c: New. * config/nvptx/bar.c: New. * config/nvptx/bar.h: New. * config/nvptx/doacross.h: New. * config/nvptx/error.c: New. * config/nvptx/icv-device.c: New. * config/nvptx/mutex.h: New. * config/nvptx/pool.h: New. * config/nvptx/proc.c: New. * config/nvptx/ptrlock.h: New. * config/nvptx/sem.h: New. * config/nvptx/simple-bar.h: New. * config/nvptx/target.c: New. * config/nvptx/task.c: New. * config/nvptx/team.c: New. * config/nvptx/time.c: New. * config/posix/simple-bar.h: New. * libgomp.h: Guard pthread.h inclusion. Include simple-bar.h. (gomp_num_teams_var): Declare. (struct gomp_thread_pool): Change threads_dock member to gomp_simple_barrier_t. [__nvptx__] (gomp_thread): New implementation. (gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS. (gomp_thread_destructor): Ditto. (gomp_init_thread_affinity): Ditto. * team.c: Guard uses of Pthreads-specific interfaces by LIBGOMP_USE_PTHREADS. Adjust all uses of threads_dock. (gomp_free_thread) [__nvptx__]: Do not call 'free'. * config/nvptx/alloc.c: Delete. * config/nvptx/barrier.c: Ditto. * config/nvptx/fortran.c: Ditto. * config/nvptx/iter.c: Ditto. * config/nvptx/iter_ull.c: Ditto. * config/nvptx/loop.c: Ditto. * config/nvptx/loop_ull.c: Ditto. * config/nvptx/ordered.c: Ditto. * config/nvptx/parallel.c: Ditto. * config/nvptx/priority_queue.c: Ditto. * config/nvptx/sections.c: Ditto. * config/nvptx/single.c: Ditto. * config/nvptx/splay-tree.c: Ditto. * config/nvptx/work.c: Ditto. * testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass -foffload=-lgfortran in addition to -lgfortran. * testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto. * plugin/plugin-nvptx.c: Include <limits.h>. (struct targ_fn_descriptor): Add new fields. (struct ptx_device): Ditto. Set them... (nvptx_open_device): ...here. (nvptx_adjust_launch_bounds): New. (nvptx_host2dev): Allow NULL 'nvthd'. (nvptx_dev2host): Ditto. (GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400. (link_ptx): Adjust log sizes. (nvptx_host2dev): Allow NULL 'nvthd'. (nvptx_dev2host): Ditto. (nvptx_set_clocktick): New. Use it... (GOMP_OFFLOAD_load_image): ...here. Set new targ_fn_descriptor fields. (GOMP_OFFLOAD_dev2dev): New. (nvptx_adjust_launch_bounds): New. (nvptx_stacks_size): New. (nvptx_stacks_alloc): New. (nvptx_stacks_free): New. (GOMP_OFFLOAD_run): New. (GOMP_OFFLOAD_async_run): New (stub). Co-Authored-By: Dmitry Melnik <dm@ispras.ru> Co-Authored-By: Jakub Jelinek <jakub@redhat.com> From-SVN: r242789
Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r--libgomp/plugin/plugin-nvptx.c213
1 files changed, 203 insertions, 10 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5ee350d..ca33c51 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -41,6 +41,7 @@
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
+#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
@@ -274,6 +275,8 @@ struct targ_fn_descriptor
{
CUfunction fn;
const struct targ_fn_launch *launch;
+ int regs_per_thread;
+ int max_threads_per_block;
};
/* A loaded PTX image. */
@@ -307,8 +310,12 @@ struct ptx_device
bool overlap;
bool map;
bool concur;
- int mode;
bool mkern;
+ int mode;
+ int clock_khz;
+ int num_sms;
+ int regs_per_block;
+ int regs_per_sm;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@@ -658,6 +665,39 @@ nvptx_open_device (int n)
&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
ptx_dev->mkern = pi;
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
+ ptx_dev->clock_khz = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+ ptx_dev->num_sms = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
+ ptx_dev->regs_per_block = pi;
+
+ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
+ in CUDA 6.0 and newer. */
+ r = cuDeviceGetAttribute (&pi, 82, dev);
+ /* Fallback: use limit of registers per block, which is usually equal. */
+ if (r == CUDA_ERROR_INVALID_VALUE)
+ pi = ptx_dev->regs_per_block;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
+ return NULL;
+ }
+ ptx_dev->regs_per_sm = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+ if (pi != 32)
+ {
+ GOMP_PLUGIN_error ("Only warp size 32 is supported");
+ return NULL;
+ }
+
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
@@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
CUjit_option opts[6];
void *optvals[6];
float elapsed = 0.0;
-#define LOGSIZE 8192
- char elog[LOGSIZE];
- char ilog[LOGSIZE];
- unsigned long logsize = LOGSIZE;
+ char elog[1024];
+ char ilog[16384];
CUlinkState linkstate;
CUresult r;
void *linkout;
@@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
optvals[1] = &ilog[0];
opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
- optvals[2] = (void *) logsize;
+ optvals[2] = (void *) sizeof ilog;
opts[3] = CU_JIT_ERROR_LOG_BUFFER;
optvals[3] = &elog[0];
opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
- optvals[4] = (void *) logsize;
+ optvals[4] = (void *) sizeof elog;
opts[5] = CU_JIT_LOG_VERBOSE;
optvals[5] = (void *) 1;
@@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
}
#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
}
#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void)
unsigned int
GOMP_OFFLOAD_get_caps (void)
{
- return GOMP_OFFLOAD_CAP_OPENACC_200;
+ return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}
int
@@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void)
return GOMP_VERSION;
}
+/* Initialize __nvptx_clocktick, if present in MODULE. */
+
+static void
+nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
+{
+ CUdeviceptr dptr;
+ CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+ if (r == CUDA_ERROR_NOT_FOUND)
+ return;
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+ double __nvptx_clocktick = 1e-3 / dev->clock_khz;
+ r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. */
@@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
{
CUfunction function;
+ int nregs, mthrs;
CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
fn_descs[i].fn);
+ CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
+ CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+ CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
targ_fns->fn = function;
targ_fns->launch = &fn_descs[i];
+ targ_fns->regs_per_thread = nregs;
+ targ_fns->max_threads_per_block = mthrs;
targ_tbl->start = (uintptr_t) targ_fns;
targ_tbl->end = targ_tbl->start + 1;
@@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
targ_tbl->end = targ_tbl->start + bytes;
}
+ nvptx_set_clocktick (module, dev);
+
return fn_entries + var_entries;
}
@@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
&& nvptx_host2dev (dst, src, n));
}
+bool
+GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
+{
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+ ptx_dev->null_stream->stream);
+ return true;
+}
+
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
@@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
return nvptx_set_cuda_stream (async, stream);
}
+
+/* Adjust launch dimensions: pick good values for number of blocks and warps
+ and ensure that number of warps does not exceed CUDA limits as well as GCC's
+ own limits. */
+
+static void
+nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
+ struct ptx_device *ptx_dev,
+ int *teams_p, int *threads_p)
+{
+ int max_warps_block = fn->max_threads_per_block / 32;
+ /* Maximum 32 warps per block is an implementation limit in NVPTX backend
+ and libgcc, which matches documented limit of all GPUs as of 2015. */
+ if (max_warps_block > 32)
+ max_warps_block = 32;
+ if (*threads_p <= 0)
+ *threads_p = 8;
+ if (*threads_p > max_warps_block)
+ *threads_p = max_warps_block;
+
+ int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
+ /* This is an estimate of how many blocks the device can host simultaneously.
+ Actual limit, which may be lower, can be queried with "occupancy control"
+ driver interface (since CUDA 6.0). */
+ int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
+ if (*teams_p <= 0 || *teams_p > max_blocks)
+ *teams_p = max_blocks;
+}
+
+/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
+ target regions. */
+
+static size_t
+nvptx_stacks_size ()
+{
+ return 128 * 1024;
+}
+
+/* Return contiguous storage for NUM stacks, each SIZE bytes. */
+
+static void *
+nvptx_stacks_alloc (size_t size, int num)
+{
+ CUdeviceptr stacks;
+ CUresult r = cuMemAlloc (&stacks, size * num);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+ return (void *) stacks;
+}
+
+/* Release storage previously allocated by nvptx_stacks_alloc. */
+
+static void
+nvptx_stacks_free (void *p, int num)
+{
+ CUresult r = cuMemFree ((CUdeviceptr) p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
+void
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
+{
+ CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
+ CUresult r;
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ const char *maybe_abort_msg = "(perhaps abort was called)";
+ int teams = 0, threads = 0;
+
+ if (!args)
+ GOMP_PLUGIN_fatal ("No target arguments provided");
+ while (*args)
+ {
+ intptr_t id = (intptr_t) *args++, val;
+ if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+ val = (intptr_t) *args++;
+ else
+ val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+ if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
+ continue;
+ val = val > INT_MAX ? INT_MAX : val;
+ id &= GOMP_TARGET_ARG_ID_MASK;
+ if (id == GOMP_TARGET_ARG_NUM_TEAMS)
+ teams = val;
+ else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
+ threads = val;
+ }
+ nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
+
+ size_t stack_size = nvptx_stacks_size ();
+ void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+ void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
+ size_t fn_args_size = sizeof fn_args;
+ void *config[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
+ CU_LAUNCH_PARAM_END
+ };
+ r = cuLaunchKernel (function,
+ teams, 1, 1,
+ 32, threads, 1,
+ 0, ptx_dev->null_stream->stream, NULL, config);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+ r = cuCtxSynchronize ();
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+ nvptx_stacks_free (stacks, teams * threads);
+}
+
+void
+GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
+ void *async_data)
+{
+ GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
+}