[libgomp, nvptx] Handle per-function max-threads-per-block in default dims

Currently parallel-loop-1.c fails at -O0 on a Quadro M1200, because one of the
kernel launch configurations exceeds the resources available in the device, due
to the default dimensions chosen by the runtime.

This patch fixes that by taking the per-function max_threads_per_block into
account when using the default dimensions.

2018-07-30  Tom de Vries  <tdevries@suse.de>

	* plugin/plugin-nvptx.c (MIN, MAX): Redefine.
	(nvptx_exec): Ensure worker and vector default dims don't exceed
	targ_fn->max_threads_per_block.

From-SVN: r263062
author: Tom de Vries <tdevries@suse.de> 2018-07-30 08:17:26 +0000
committer: Tom de Vries <vries@gcc.gnu.org> 2018-07-30 08:17:26 +0000
commit: 4cdfee3f206d784f8a502af4f34180a0762df4fe (patch)
tree: d46c15b15b238c3095358b7e75a968184b1b05d1
parent: 0b210c43bbb6eddac8ba550d9c45bf679d4328c5 (diff)
download: gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.zip
gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.tar.gz
gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.tar.bz2
2 files changed, 31 insertions, 4 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 1d218f4..6cd30bb 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,5 +1,11 @@
 2018-07-30  Tom de Vries  <tdevries@suse.de>
 
+	* plugin/plugin-nvptx.c (MIN, MAX): Redefine.
+	(nvptx_exec): Ensure worker and vector default dims don't exceed
+	targ_fn->max_threads_per_block.
+
+2018-07-30  Tom de Vries  <tdevries@suse.de>
+
 	* plugin/plugin-nvptx.c (struct ptx_device): Add default_dims field.
 	(nvptx_open_device): Init default_dims for device.
 	(nvptx_exec): Use default_dims from device.
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5c522aa..b6ec5f8 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -141,6 +141,11 @@ init_cuda_lib (void)
 
 #include "secure_getenv.h"
 
+#undef MIN
+#undef MAX
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
 /* Convenience macros for the frequently used CUDA library call and
    error handling sequence as well as CUDA library calls that
    do the error checking themselves or don't do it at all.  */
@@ -1135,6 +1140,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *kargs[1];
   void *hp, *dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
+  int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";
 
   function = targ_fn->fn;
@@ -1175,7 +1181,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
 	  int gang, worker, vector;
 	  {
-	    int warp_size = nvthd->ptx_dev->warp_size;
 	    int block_size = nvthd->ptx_dev->max_threads_per_block;
 	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
 	    int dev_size = nvthd->ptx_dev->num_sms;
@@ -1213,9 +1218,25 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	}
       pthread_mutex_unlock (&ptx_dev_lock);
 
-      for (i = 0; i != GOMP_DIM_MAX; i++)
-	if (!dims[i])
-	  dims[i] = nvthd->ptx_dev->default_dims[i];
+      {
+	bool default_dim_p[GOMP_DIM_MAX];
+	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  {
+	    default_dim_p[i] = !dims[i];
+	    if (default_dim_p[i])
+	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	  }
+
+	if (default_dim_p[GOMP_DIM_VECTOR])
+	  dims[GOMP_DIM_VECTOR]
+	    = MIN (dims[GOMP_DIM_VECTOR],
+		   (targ_fn->max_threads_per_block / warp_size * warp_size));
+
+	if (default_dim_p[GOMP_DIM_WORKER])
+	  dims[GOMP_DIM_WORKER]
+	    = MIN (dims[GOMP_DIM_WORKER],
+		   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+      }
     }
 
   /* Check if the accelerator has sufficient hardware resources to
author	Tom de Vries <tdevries@suse.de>	2018-07-30 08:17:26 +0000
committer	Tom de Vries <vries@gcc.gnu.org>	2018-07-30 08:17:26 +0000
commit	4cdfee3f206d784f8a502af4f34180a0762df4fe (patch)
tree	d46c15b15b238c3095358b7e75a968184b1b05d1
parent	0b210c43bbb6eddac8ba550d9c45bf679d4328c5 (diff)
download	gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.zip gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.tar.gz gcc-4cdfee3f206d784f8a502af4f34180a0762df4fe.tar.bz2