aboutsummaryrefslogtreecommitdiff
path: root/libgomp/plugin
diff options
context:
space:
mode:
Diffstat (limited to 'libgomp/plugin')
-rw-r--r--libgomp/plugin/cuda-lib.def1
-rw-r--r--libgomp/plugin/plugin-gcn.c97
-rw-r--r--libgomp/plugin/plugin-nvptx.c52
3 files changed, 145 insertions, 5 deletions
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index eb562ac..7f4ddcc 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
CUDA_ONE_CALL (cuMemcpy2D)
CUDA_ONE_CALL (cuMemcpy2DUnaligned)
CUDA_ONE_CALL (cuMemcpy3D)
+CUDA_ONE_CALL (cuMemsetD8)
CUDA_ONE_CALL (cuMemFree)
CUDA_ONE_CALL (cuMemFreeHost)
CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4b42a59..498b549 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
hsa_status_t (*hsa_code_object_deserialize_fn)
(void *serialized_code_object, size_t serialized_code_object_size,
const char *options, hsa_code_object_t *code_object);
+ hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
+ size_t count);
hsa_status_t (*hsa_amd_memory_lock_fn)
(void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
void **agent_ptr);
@@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
DLSYM_FN (hsa_signal_load_acquire)
DLSYM_FN (hsa_queue_destroy)
DLSYM_FN (hsa_code_object_deserialize)
+ DLSYM_OPT_FN (hsa_amd_memory_fill)
DLSYM_OPT_FN (hsa_amd_memory_lock)
DLSYM_OPT_FN (hsa_amd_memory_unlock)
DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
@@ -4435,6 +4438,83 @@ init_hip_runtime_functions (void)
return true;
}
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+ hsa_status_t status = HSA_STATUS_SUCCESS;
+
+ /* A memset feature is only provided via hsa_amd_memory_fill; while it
+ is fast, it is an HSA extension and it has two requirements: The memory
+ must be aligned to multiples of 4 bytes - and, by construction, only
+ multiples of 4 bytes can be filled (uint32_t value argument).
+
+ This means: Either not using that function or up to three function calls:
+ - copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned
+ - call hsa_amd_memory_fill
+ - copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment
+ count is not a multiple of 4 bytes.
+
+ Having more than one function call is only profitable if there is
+ enough data to process; see below for the used heuristic values. */
+
+ uint8_t v8 = (uint8_t) val;
+ size_t before = (4 - (uintptr_t) ptr % 4) % 4; /* 0 to 3 bytes. */
+ size_t tail = (count - before) % 4; /* 0 to 3 bytes. */
+
+ /* Heuristic */
+ enum {
+ /* Prefer alloca to malloc up to ... */
+ alloca_size = 256, /* bytes */
+ /* Call hsa_amd_memory_fill also when two copy calls are required. */
+ always_use_fill = 256*1024, /* bytes */
+ /* Call hsa_amd_memory_fill also when on copy call is required. */
+ use_fill_one_copy = (128+64)*1024 /* bytes */
+ };
+
+ /* Do not call hsa_amd_memory_fill when any of the following conditions
+ is true. Note that it is always preferred if available and
+ before == tail == 0. */
+ if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0)
+ || (before && tail && count < always_use_fill)
+ || ((before || tail) && count < use_fill_one_copy))
+ before = count;
+
+ /* Copy call for alignment - or all data, if condition above is true. */
+ if (before)
+ {
+ void *data;
+ if (before > alloca_size)
+ data = malloc (before * sizeof (uint8_t));
+ else
+ data = alloca (before * sizeof (uint8_t));
+ memset (data, val, before);
+ status = hsa_fns.hsa_memory_copy_fn (ptr, data, before);
+ if (before > alloca_size)
+ free (data);
+ if (data == 0 || status != HSA_STATUS_SUCCESS)
+ goto fail;
+ count -= before;
+ }
+
+ if (count == 0)
+ return true;
+
+ ptr += before;
+
+ uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
+ status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
+ if (tail && status == HSA_STATUS_SUCCESS)
+ {
+ ptr += count - tail;
+ status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail);
+ }
+ if (status == HSA_STATUS_SUCCESS)
+ return true;
+
+fail:
+ GOMP_PLUGIN_error ("memory set failed");
+ return false;
+}
void
GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
@@ -5079,7 +5159,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
queue_push_callback (aq, fn, data);
}
-/* Queue up an asynchronous data copy from host to DEVICE. */
+/* Queue up an asynchronous data copy from host to DEVICE.
+ (Also handles dev2host and dev2dev.) */
bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@@ -5097,10 +5178,16 @@ bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
- struct agent_info *agent = get_agent_info (device);
- assert (agent == aq->agent);
- queue_push_copy (aq, dst, src, n);
- return true;
+ return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous data copy from DEVICE to DEVICE. */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
}
union goacc_property_value
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..0ba445e 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2019,6 +2019,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
}
static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+ CUdeviceptr pb1, pb2;
+ size_t ps1, ps2;
+ if (!s)
+ return true;
+ if (!d1 || !d2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+ CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+ if (!pb1 || !pb2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+ || (void *)(d2 + s) > (void *)(pb2 + ps2))
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
+ return true;
+}
+
+static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
CUdeviceptr pb;
@@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
return true;
}
@@ -2267,6 +2298,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
}
bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+ if (!nvptx_attach_host_thread_to_device (ord))
+ return false;
+ CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+ return true;
+}
+
+bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
@@ -2288,6 +2328,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+ aq->cuda_stream);
+ return true;
+}
+
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{