diff options
author | Alexander Monakov <amonakov@ispras.ru> | 2016-11-23 21:36:41 +0300 |
---|---|---|
committer | Alexander Monakov <amonakov@gcc.gnu.org> | 2016-11-23 21:36:41 +0300 |
commit | 6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15 (patch) | |
tree | afbc9d83e9693712185a8afd9d6593f23eef2734 /libgomp/plugin/plugin-nvptx.c | |
parent | 6251fe936f3eab1542c1581646254e4bae0f6688 (diff) | |
download | gcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.zip gcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.tar.gz gcc-6103184e81c0b6a8b1f4e072e0c32d9bb86fcc15.tar.bz2 |
OpenMP offloading to NVPTX: libgomp changes
* Makefile.am (libgomp_la_SOURCES): Add atomic.c, icv.c, icv-device.c.
* Makefile.in: Regenerate.
* configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it...
(LIBGOMP_USE_PTHREADS): ...here; new define.
* configure: Regenerate.
* config.h.in: Likewise.
* config/posix/affinity.c: Move to...
* affinity.c: ...here (new file). Guard use of Pthreads-specific
interface by LIBGOMP_USE_PTHREADS.
* critical.c: Split out GOMP_atomic_{start,end} into...
* atomic.c: ...here (new file).
* env.c: Split out ICV definitions into...
* icv.c: ...here (new file) and...
* icv-device.c: ...here. New file.
* config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c.
(gomp_destroy_lock_30): Ditto.
(gomp_set_lock_30): Ditto.
(gomp_unset_lock_30): Ditto.
(gomp_test_lock_30): Ditto.
(gomp_init_nest_lock_30): Ditto.
(gomp_destroy_nest_lock_30): Ditto.
(gomp_set_nest_lock_30): Ditto.
(gomp_unset_nest_lock_30): Ditto.
(gomp_test_nest_lock_30): Ditto.
* lock.c: New.
* config/nvptx/lock.c: New.
* config/nvptx/bar.c: New.
* config/nvptx/bar.h: New.
* config/nvptx/doacross.h: New.
* config/nvptx/error.c: New.
* config/nvptx/icv-device.c: New.
* config/nvptx/mutex.h: New.
* config/nvptx/pool.h: New.
* config/nvptx/proc.c: New.
* config/nvptx/ptrlock.h: New.
* config/nvptx/sem.h: New.
* config/nvptx/simple-bar.h: New.
* config/nvptx/target.c: New.
* config/nvptx/task.c: New.
* config/nvptx/team.c: New.
* config/nvptx/time.c: New.
* config/posix/simple-bar.h: New.
* libgomp.h: Guard pthread.h inclusion. Include simple-bar.h.
(gomp_num_teams_var): Declare.
(struct gomp_thread_pool): Change threads_dock member to
gomp_simple_barrier_t.
[__nvptx__] (gomp_thread): New implementation.
(gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS.
(gomp_thread_destructor): Ditto.
(gomp_init_thread_affinity): Ditto.
* team.c: Guard uses of Pthreads-specific interfaces by
LIBGOMP_USE_PTHREADS. Adjust all uses of threads_dock.
(gomp_free_thread) [__nvptx__]: Do not call 'free'.
* config/nvptx/alloc.c: Delete.
* config/nvptx/barrier.c: Ditto.
* config/nvptx/fortran.c: Ditto.
* config/nvptx/iter.c: Ditto.
* config/nvptx/iter_ull.c: Ditto.
* config/nvptx/loop.c: Ditto.
* config/nvptx/loop_ull.c: Ditto.
* config/nvptx/ordered.c: Ditto.
* config/nvptx/parallel.c: Ditto.
* config/nvptx/priority_queue.c: Ditto.
* config/nvptx/sections.c: Ditto.
* config/nvptx/single.c: Ditto.
* config/nvptx/splay-tree.c: Ditto.
* config/nvptx/work.c: Ditto.
* testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass
-foffload=-lgfortran in addition to -lgfortran.
* testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto.
* plugin/plugin-nvptx.c: Include <limits.h>.
(struct targ_fn_descriptor): Add new fields.
(struct ptx_device): Ditto. Set them...
(nvptx_open_device): ...here.
(nvptx_adjust_launch_bounds): New.
(nvptx_host2dev): Allow NULL 'nvthd'.
(nvptx_dev2host): Ditto.
(GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400.
(link_ptx): Adjust log sizes.
(nvptx_host2dev): Allow NULL 'nvthd'.
(nvptx_dev2host): Ditto.
(nvptx_set_clocktick): New. Use it...
(GOMP_OFFLOAD_load_image): ...here. Set new targ_fn_descriptor
fields.
(GOMP_OFFLOAD_dev2dev): New.
(nvptx_adjust_launch_bounds): New.
(nvptx_stacks_size): New.
(nvptx_stacks_alloc): New.
(nvptx_stacks_free): New.
(GOMP_OFFLOAD_run): New.
(GOMP_OFFLOAD_async_run): New (stub).
Co-Authored-By: Dmitry Melnik <dm@ispras.ru>
Co-Authored-By: Jakub Jelinek <jakub@redhat.com>
From-SVN: r242789
Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 213 |
1 files changed, 203 insertions, 10 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 5ee350d..ca33c51 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -41,6 +41,7 @@ #include <cuda.h> #include <stdbool.h> #include <stdint.h> +#include <limits.h> #include <string.h> #include <stdio.h> #include <unistd.h> @@ -274,6 +275,8 @@ struct targ_fn_descriptor { CUfunction fn; const struct targ_fn_launch *launch; + int regs_per_thread; + int max_threads_per_block; }; /* A loaded PTX image. */ @@ -307,8 +310,12 @@ struct ptx_device bool overlap; bool map; bool concur; - int mode; bool mkern; + int mode; + int clock_khz; + int num_sms; + int regs_per_block; + int regs_per_sm; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -658,6 +665,39 @@ nvptx_open_device (int n) &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); ptx_dev->mkern = pi; + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); + ptx_dev->clock_khz = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + ptx_dev->num_sms = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + ptx_dev->regs_per_block = pi; + + /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only + in CUDA 6.0 and newer. */ + r = cuDeviceGetAttribute (&pi, 82, dev); + /* Fallback: use limit of registers per block, which is usually equal. 
*/ + if (r == CUDA_ERROR_INVALID_VALUE) + pi = ptx_dev->regs_per_block; + else if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r)); + return NULL; + } + ptx_dev->regs_per_sm = pi; + + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); + if (pi != 32) + { + GOMP_PLUGIN_error ("Only warp size 32 is supported"); + return NULL; + } + r = cuDeviceGetAttribute (&async_engines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); if (r != CUDA_SUCCESS) @@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, CUjit_option opts[6]; void *optvals[6]; float elapsed = 0.0; -#define LOGSIZE 8192 - char elog[LOGSIZE]; - char ilog[LOGSIZE]; - unsigned long logsize = LOGSIZE; + char elog[1024]; + char ilog[16384]; CUlinkState linkstate; CUresult r; void *linkout; @@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, optvals[1] = &ilog[0]; opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optvals[2] = (void *) logsize; + optvals[2] = (void *) sizeof ilog; opts[3] = CU_JIT_ERROR_LOG_BUFFER; optvals[3] = &elog[0]; opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optvals[4] = (void *) logsize; + optvals[4] = (void *) sizeof elog; opts[5] = CU_JIT_LOG_VERBOSE; optvals[5] = (void *) 1; @@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); @@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s) } #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, 
CU_EVENT_DISABLE_TIMING); @@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void) unsigned int GOMP_OFFLOAD_get_caps (void) { - return GOMP_OFFLOAD_CAP_OPENACC_200; + return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; } int @@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void) return GOMP_VERSION; } +/* Initialize __nvptx_clocktick, if present in MODULE. */ + +static void +nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) +{ + CUdeviceptr dptr; + CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick"); + if (r == CUDA_ERROR_NOT_FOUND) + return; + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + double __nvptx_clocktick = 1e-3 / dev->clock_khz; + r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. 
*/ @@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++) { CUfunction function; + int nregs, mthrs; CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_descs[i].fn); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs, + CU_FUNC_ATTRIBUTE_NUM_REGS, function); + CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function); targ_fns->fn = function; targ_fns->launch = &fn_descs[i]; + targ_fns->regs_per_thread = nregs; + targ_fns->max_threads_per_block = mthrs; targ_tbl->start = (uintptr_t) targ_fns; targ_tbl->end = targ_tbl->start + 1; @@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, targ_tbl->end = targ_tbl->start + bytes; } + nvptx_set_clocktick (module, dev); + return fn_entries + var_entries; } @@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) && nvptx_host2dev (dst, src, n)); } +bool +GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) +{ + struct ptx_device *ptx_dev = ptx_devices[ord]; + CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, + ptx_dev->null_stream->stream); + return true; +} + void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; void @@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) { return nvptx_set_cuda_stream (async, stream); } + +/* Adjust launch dimensions: pick good values for number of blocks and warps + and ensure that number of warps does not exceed CUDA limits as well as GCC's + own limits. 
*/ + +static void +nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn, + struct ptx_device *ptx_dev, + int *teams_p, int *threads_p) +{ + int max_warps_block = fn->max_threads_per_block / 32; + /* Maximum 32 warps per block is an implementation limit in NVPTX backend + and libgcc, which matches documented limit of all GPUs as of 2015. */ + if (max_warps_block > 32) + max_warps_block = 32; + if (*threads_p <= 0) + *threads_p = 8; + if (*threads_p > max_warps_block) + *threads_p = max_warps_block; + + int regs_per_block = fn->regs_per_thread * 32 * *threads_p; + /* This is an estimate of how many blocks the device can host simultaneously. + Actual limit, which may be lower, can be queried with "occupancy control" + driver interface (since CUDA 6.0). */ + int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms; + if (*teams_p <= 0 || *teams_p > max_blocks) + *teams_p = max_blocks; +} + +/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP + target regions. */ + +static size_t +nvptx_stacks_size () +{ + return 128 * 1024; +} + +/* Return contiguous storage for NUM stacks, each SIZE bytes. */ + +static void * +nvptx_stacks_alloc (size_t size, int num) +{ + CUdeviceptr stacks; + CUresult r = cuMemAlloc (&stacks, size * num); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r)); + return (void *) stacks; +} + +/* Release storage previously allocated by nvptx_stacks_alloc. 
*/ + +static void +nvptx_stacks_free (void *p, int num) +{ + CUresult r = cuMemFree ((CUdeviceptr) p); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r)); +} + +void +GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) +{ + CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn; + CUresult r; + struct ptx_device *ptx_dev = ptx_devices[ord]; + const char *maybe_abort_msg = "(perhaps abort was called)"; + int teams = 0, threads = 0; + + if (!args) + GOMP_PLUGIN_fatal ("No target arguments provided"); + while (*args) + { + intptr_t id = (intptr_t) *args++, val; + if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM) + val = (intptr_t) *args++; + else + val = id >> GOMP_TARGET_ARG_VALUE_SHIFT; + if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL) + continue; + val = val > INT_MAX ? INT_MAX : val; + id &= GOMP_TARGET_ARG_ID_MASK; + if (id == GOMP_TARGET_ARG_NUM_TEAMS) + teams = val; + else if (id == GOMP_TARGET_ARG_THREAD_LIMIT) + threads = val; + } + nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads); + + size_t stack_size = nvptx_stacks_size (); + void *stacks = nvptx_stacks_alloc (stack_size, teams * threads); + void *fn_args[] = {tgt_vars, stacks, (void *) stack_size}; + size_t fn_args_size = sizeof fn_args; + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args, + CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size, + CU_LAUNCH_PARAM_END + }; + r = cuLaunchKernel (function, + teams, 1, 1, + 32, threads, 1, + 0, ptx_dev->null_stream->stream, NULL, config); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + + r = cuCtxSynchronize (); + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); + nvptx_stacks_free (stacks, teams * threads); +} + +void +GOMP_OFFLOAD_async_run (int ord, 
void *tgt_fn, void *tgt_vars, void **args, + void *async_data) +{ + GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented"); +} |