3 files changed, 145 insertions, 5 deletions
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index eb562ac..7f4ddcc 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
 CUDA_ONE_CALL (cuMemcpy2D)
 CUDA_ONE_CALL (cuMemcpy2DUnaligned)
 CUDA_ONE_CALL (cuMemcpy3D)
+CUDA_ONE_CALL (cuMemsetD8)
 CUDA_ONE_CALL (cuMemFree)
 CUDA_ONE_CALL (cuMemFreeHost)
 CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4b42a59..498b549 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
   hsa_status_t (*hsa_code_object_deserialize_fn)
     (void *serialized_code_object, size_t serialized_code_object_size,
      const char *options, hsa_code_object_t *code_object);
+  hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
+					 size_t count);
   hsa_status_t (*hsa_amd_memory_lock_fn)
     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
      void **agent_ptr);
@@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_signal_load_acquire)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
+  DLSYM_OPT_FN (hsa_amd_memory_fill)
   DLSYM_OPT_FN (hsa_amd_memory_lock)
   DLSYM_OPT_FN (hsa_amd_memory_unlock)
   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
@@ -4435,6 +4438,83 @@ init_hip_runtime_functions (void)
   return true;
 }
 
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  /* Set COUNT bytes starting at PTR to (uint8_t) VAL; ORD is not used here.
+     Return true on success; otherwise report an error and return false.
+
+     A memset feature is only provided via hsa_amd_memory_fill; while it
+     is fast, it is an HSA extension and it has two requirements: The memory
+     must be aligned to multiples of 4 bytes - and, by construction, only
+     multiples of 4 bytes can be filled (uint32_t value argument).
+
+     This means: Either not using that function or up to three function calls:
+     - copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned
+     - call hsa_amd_memory_fill
+     - copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment
+       count is not a multiple of 4 bytes.
+
+     Having more than one function call is only profitable if there is
+     enough data to process; see below for the used heuristic values.  */
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  uint8_t v8 = (uint8_t) val;
+  size_t before = (4 - (uintptr_t) ptr % 4) % 4;  /* 0 to 3 bytes.  */
+  size_t tail = (count - before) % 4;  /* 0 to 3 bytes.  */
+
+  /* Heuristic thresholds, in bytes.  */
+  enum {
+    alloca_size = 256,  /* <= this: alloca instead of malloc.  */
+    always_use_fill = 256*1024,  /* >= this: fill despite two copies.  */
+    use_fill_one_copy = (128+64)*1024  /* >= this: fill despite one copy.  */
+  };
+
+  /* Skip hsa_amd_memory_fill if any of these conditions holds; it is always
+     preferred if available and before == tail == 0.  */
+  if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0)
+      || (before && tail && count < always_use_fill)
+      || ((before || tail) && count < use_fill_one_copy))
+    before = count;
+
+  /* Copy call for alignment - or all data, if condition above is true.  */
+  if (before)
+    {
+      void *data;
+      if (before > alloca_size)
+	data = malloc (before);
+      else
+	data = alloca (before);
+      if (!data)
+	goto fail;  /* malloc failed; do not write through NULL.  */
+      memset (data, val, before);
+      status = hsa_fns.hsa_memory_copy_fn (ptr, data, before);
+      if (before > alloca_size)
+	free (data);
+      if (status != HSA_STATUS_SUCCESS)
+	goto fail;
+      count -= before;
+    }
+
+  if (count == 0)
+    return true;
+
+  ptr += before;
+  /* Unsigned multiply; v8 << 24 would overflow int for v8 >= 0x80.  */
+  uint32_t values = (uint32_t) v8 * 0x01010101u;
+  status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
+  if (tail && status == HSA_STATUS_SUCCESS)
+    {
+      ptr += count - tail;
+      status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail);
+    }
+  if (status == HSA_STATUS_SUCCESS)
+    return true;
+
+fail:
+  GOMP_PLUGIN_error ("memory set failed");
+  return false;
+}
 
 void
 GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
@@ -5079,7 +5159,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
   queue_push_callback (aq, fn, data);
 }
 
-/* Queue up an asynchronous data copy from host to DEVICE.  */
+/* Queue up an asynchronous data copy from host to DEVICE.
+   (Also handles dev2host and dev2dev.)  */
 
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@@ -5097,10 +5178,16 @@ bool
 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
 {
-  struct agent_info *agent = get_agent_info (device);
-  assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n);
-  return true;
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous DEVICE-to-DEVICE copy via the host2dev routine.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
 
 union goacc_property_value
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..0ba445e 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2019,6 +2019,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 }
 
 static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  /* True if S is 0 or both S-byte ranges fit in their allocations.  */
+  CUdeviceptr pb1 = 0, pb2 = 0;
+  size_t ps1 = 0, ps2 = 0;
+  if (!s)
+    return true;
+  if (d1 && d2)
+    {
+      CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+      CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+    }
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  /* Compare offsets, not end addresses: D + S could wrap for a huge S.  */
+  size_t o1 = (CUdeviceptr) d1 - pb1, o2 = (CUdeviceptr) d2 - pb2;
+  if (o1 > ps1 || s > ps1 - o1 || o2 > ps2 || s > ps2 - o2)
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
@@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;  /* Attach or range check failed; no copy was issued.  */
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2267,6 +2298,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
 }
 
 bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return false;  /* Attach failed; no fill was issued.  */
+  CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+  return true;
+}
+
+bool
 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
 {
@@ -2288,6 +2328,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;  /* Attach or range check failed; nothing was queued.  */
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+	     aq->cuda_stream);  /* Issued on AQ's CUDA stream.  */
+  return true;
+}
+
 union goacc_property_value
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {