Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
 libgomp/plugin/plugin-nvptx.c | 112 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 102 insertions(+), 10 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..a6c8198 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1799,8 +1799,6 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
-  nvptx_stacks_free (ptx_dev, false);
-
   while (blocks)
     {
       tmp = blocks->next;
@@ -1828,6 +1826,48 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
           && nvptx_free (ptr, ptx_devices[ord]));
 }
 
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+                     __FUNCTION__, ptr, (unsigned long long) size);
+
+  if (size == 0)
+    {
+      /* Special case to ensure omp_alloc specification compliance.  */
+      *ptr = NULL;
+      GOMP_PLUGIN_debug (0, "  -> *ptr=null\n");
+      return true;
+    }
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
+                     *ptr);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+                     __FUNCTION__, ptr);
+
+  CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
                            size_t mapnum __attribute__((unused)),
@@ -1939,9 +1979,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
 }
 
 struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
 {
-  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+  nvptx_attach_host_thread_to_device (device);
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
 }
 
 static bool
@@ -2815,17 +2856,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
         else if (r != CUDA_ERROR_NOT_READY)
           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-        if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+        struct rev_offload *rev_metadata = ptx_dev->rev_data;
+
+        /* Claim a portion of the ring buffer to process on this iteration.
+           Don't mark them as consumed until all the data has been read out.  */
+        unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
+                                                 __ATOMIC_ACQUIRE);
+        unsigned int from = __atomic_load_n (&rev_metadata->claimed,
+                                             __ATOMIC_RELAXED);
+        unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
+                                           __ATOMIC_RELAXED);
+
+        if (consumed > to)
+          {
+            /* Overflow happens when we exceed UINTMAX requests.  */
+            GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
+          }
+
+        to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
+        if (to <= from)
+          /* Nothing to do; poll again.  */
+          goto poll_again;
+
+        if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
+                                          false,
+                                          __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+          /* Collision with another thread ... go around again.  */
+          goto poll_again;
+
+        unsigned int index;
+        for (index = from; index < to; index++)
           {
-            struct rev_offload *rev_data = ptx_dev->rev_data;
+            int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+
+            /* Wait while the target finishes filling in the slot.  */
+            while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
+                                    __ATOMIC_ACQUIRE) == 0)
+              ;  /* spin  */
+
+            /* Pass the request to libgomp; this will queue the request and
+               return right away, without waiting for the kernel to run.  */
+            struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
             GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
                                     rev_data->addrs, rev_data->sizes,
                                     rev_data->kinds, rev_data->dev_num,
-                                    reverse_offload_aq);
-            if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
-              exit (EXIT_FAILURE);
-            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+                                    rev_data->signal, true);
+
+            /* Ensure that the slot doesn't trigger early, when reused.  */
+            __atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
           }
+
+        /* The data is now consumed so release the slots for reuse.  */
+        unsigned int consumed_so_far = from;
+        while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
+                                             &consumed_so_far, to, false,
+                                             __ATOMIC_RELEASE, __ATOMIC_RELAXED))
+          {
+            /* Another thread didn't consume all it claimed yet....  */
+            consumed_so_far = from;
+            usleep (1);
+          }
+
+poll_again:
         usleep (1);
       }
     else
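
Note on the new page-locked host allocation hooks: GOMP_OFFLOAD_page_locked_host_alloc and GOMP_OFFLOAD_page_locked_host_free are thin wrappers around the CUDA driver API's pinned-memory calls. The following standalone sketch is not part of the patch; the check() helper, device 0, and the 1 MiB size are illustrative assumptions. It only demonstrates the cuMemHostAlloc/cuMemFreeHost semantics the plugin relies on (page-locked host memory, released with cuMemFreeHost), with errors reduced to aborting rather than reporting through GOMP_PLUGIN_error.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort on any CUDA driver API failure, printing the error string.  */
static void
check (CUresult r, const char *what)
{
  if (r != CUDA_SUCCESS)
    {
      const char *msg = NULL;
      cuGetErrorString (r, &msg);
      fprintf (stderr, "%s failed: %s\n", what, msg ? msg : "unknown error");
      exit (EXIT_FAILURE);
    }
}

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  check (cuInit (0), "cuInit");
  check (cuDeviceGet (&dev, 0), "cuDeviceGet");
  check (cuCtxCreate (&ctx, 0, dev), "cuCtxCreate");

  void *buf;
  /* Same call the plugin makes; flags stay 0 because, with unified
     addressing, CU_MEMHOSTALLOC_PORTABLE is not needed.  */
  check (cuMemHostAlloc (&buf, 1 << 20, 0), "cuMemHostAlloc");
  printf ("pinned host buffer at %p\n", buf);
  check (cuMemFreeHost (buf), "cuMemFreeHost");

  check (cuCtxDestroy (ctx), "cuCtxDestroy");
  return 0;
}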
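
Note on the asyncqueue change: GOMP_OFFLOAD_openacc_async_construct now binds the calling host thread to the requested device before building the queue, and passes CU_STREAM_NON_BLOCKING instead of CU_STREAM_DEFAULT, so asyncqueue streams no longer implicitly synchronize with the legacy NULL stream used by the synchronous paths. The body of nvptx_goacc_asyncqueue_construct is not shown in this diff; the sketch below, with a hypothetical stand-in queue type, only illustrates what creating a stream with that flag looks like.

#include <cuda.h>
#include <stdlib.h>

/* Hypothetical stand-in for the plugin's real goacc_asyncqueue type.  */
struct goacc_asyncqueue_sketch
{
  CUstream cuda_stream;
};

/* Create a queue backed by a CUDA stream; 'flags' is CU_STREAM_DEFAULT or
   CU_STREAM_NON_BLOCKING, as in the patched caller.  */
static struct goacc_asyncqueue_sketch *
asyncqueue_construct_sketch (unsigned int flags)
{
  struct goacc_asyncqueue_sketch *aq = malloc (sizeof *aq);
  if (!aq)
    return NULL;
  if (cuStreamCreate (&aq->cuda_stream, flags) != CUDA_SUCCESS)
    {
      free (aq);
      return NULL;
    }
  return aq;
}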
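
Note on the reverse-offload ring buffer: the rewritten polling loop in GOMP_OFFLOAD_run is the consumer side of a multi-producer queue. Device threads append requests, a host thread claims a batch of slots with a compare-and-swap on 'claimed' (capped at half the ring per iteration), drains each slot once its 'signal' becomes nonzero, and only then advances 'consumed' so the slots can be reused. The real 'struct rev_offload' / 'struct rev_req' definitions and the device-side producer are not in this file; the sketch below is a hypothetical reconstruction that mirrors only the fields this hunk dereferences (next_slot, claimed, consumed, queue[].signal, fn, mapnum, addrs, sizes, kinds, dev_num), with assumed types and an assumed queue size, and shows how a producer would publish a slot.

#include <stdint.h>

#define REV_OFFLOAD_QUEUE_SIZE 1024  /* Assumed power-of-two capacity.  */

struct rev_req
{
  uint64_t signal;  /* Nonzero once the producer has filled the slot.  */
  uint64_t fn;
  uint64_t mapnum;
  uint64_t addrs, sizes, kinds;
  int dev_num;
};

struct rev_offload
{
  unsigned int next_slot;  /* Next index a producer will take.  */
  unsigned int claimed;    /* Below this, a host thread is already working.  */
  unsigned int consumed;   /* Below this, slots may be reused.  */
  struct rev_req queue[REV_OFFLOAD_QUEUE_SIZE];
};

/* Producer side: reserve an index, wait until its slot has been released
   by the consumer, fill it, then raise 'signal' so the host polling loop
   in GOMP_OFFLOAD_run picks it up.  */
static void
post_request (struct rev_offload *q, uint64_t fn, uint64_t mapnum,
              uint64_t addrs, uint64_t sizes, uint64_t kinds, int dev_num)
{
  unsigned int index
    = __atomic_fetch_add (&q->next_slot, 1, __ATOMIC_ACQ_REL);
  unsigned int slot = index % REV_OFFLOAD_QUEUE_SIZE;

  /* Don't lap the consumer: wait until this slot has been consumed.
     Unsigned subtraction handles counter wrap-around.  */
  while (index - __atomic_load_n (&q->consumed, __ATOMIC_ACQUIRE)
         >= REV_OFFLOAD_QUEUE_SIZE)
    ;  /* spin  */

  q->queue[slot].fn = fn;
  q->queue[slot].mapnum = mapnum;
  q->queue[slot].addrs = addrs;
  q->queue[slot].sizes = sizes;
  q->queue[slot].kinds = kinds;
  q->queue[slot].dev_num = dev_num;

  /* Publish the slot; the host spins on 'signal' before reading it and
     clears it again once the request has been handed to libgomp.  */
  __atomic_store_n (&q->queue[slot].signal, index + 1, __ATOMIC_RELEASE);
}

int
main (void)
{
  static struct rev_offload q;  /* Zero-initialized ring.  */
  post_request (&q, 0x1234, 2, 0, 0, 0, 0);
  return q.queue[0].signal != 0 ? 0 : 1;
}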