Diffstat (limited to 'libgomp/plugin')
-rw-r--r-- | libgomp/plugin/cuda-lib.def   |  1
-rw-r--r-- | libgomp/plugin/plugin-gcn.c   | 97
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 52
3 files changed, 145 insertions, 5 deletions
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index eb562ac..7f4ddcc 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
 CUDA_ONE_CALL (cuMemcpy2D)
 CUDA_ONE_CALL (cuMemcpy2DUnaligned)
 CUDA_ONE_CALL (cuMemcpy3D)
+CUDA_ONE_CALL (cuMemsetD8)
 CUDA_ONE_CALL (cuMemFree)
 CUDA_ONE_CALL (cuMemFreeHost)
 CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4b42a59..498b549 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
   hsa_status_t (*hsa_code_object_deserialize_fn)
     (void *serialized_code_object, size_t serialized_code_object_size,
      const char *options, hsa_code_object_t *code_object);
+  hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
+                                         size_t count);
   hsa_status_t (*hsa_amd_memory_lock_fn) (void *host_ptr, size_t size,
                                           hsa_agent_t *agents, int num_agent,
                                           void **agent_ptr);
@@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_signal_load_acquire)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
+  DLSYM_OPT_FN (hsa_amd_memory_fill)
   DLSYM_OPT_FN (hsa_amd_memory_lock)
   DLSYM_OPT_FN (hsa_amd_memory_unlock)
   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
@@ -4435,6 +4438,83 @@ init_hip_runtime_functions (void)
   return true;
 }
 
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  /* A memset feature is only provided via hsa_amd_memory_fill; while it
+     is fast, it is an HSA extension and it has two requirements: The memory
+     must be aligned to multiples of 4 bytes - and, by construction, only
+     multiples of 4 bytes can be filled (uint32_t value argument).
+
+     This means: Either not using that function or up to three function calls:
+     - copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned
+     - call hsa_amd_memory_fill
+     - copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment
+       count is not a multiple of 4 bytes.
+
+     Having more than one function call is only profitable if there is
+     enough data to process; see below for the used heuristic values.  */
+
+  uint8_t v8 = (uint8_t) val;
+  size_t before = (4 - (uintptr_t) ptr % 4) % 4;  /* 0 to 3 bytes.  */
+  size_t tail = (count - before) % 4;  /* 0 to 3 bytes.  */
+
+  /* Heuristic */
+  enum {
+    /* Prefer alloca to malloc up to ...  */
+    alloca_size = 256,  /* bytes */
+    /* Call hsa_amd_memory_fill also when two copy calls are required.  */
+    always_use_fill = 256*1024,  /* bytes */
+    /* Call hsa_amd_memory_fill also when one copy call is required.  */
+    use_fill_one_copy = (128+64)*1024  /* bytes */
+  };
+
+  /* Do not call hsa_amd_memory_fill when any of the following conditions
+     is true.  Note that it is always preferred if available and
+     before == tail == 0.  */
+  if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0)
+      || (before && tail && count < always_use_fill)
+      || ((before || tail) && count < use_fill_one_copy))
+    before = count;
+
+  /* Copy call for alignment - or all data, if condition above is true.  */
+  if (before)
+    {
+      void *data;
+      if (before > alloca_size)
+        data = malloc (before * sizeof (uint8_t));
+      else
+        data = alloca (before * sizeof (uint8_t));
+      memset (data, val, before);
+      status = hsa_fns.hsa_memory_copy_fn (ptr, data, before);
+      if (before > alloca_size)
+        free (data);
+      if (data == 0 || status != HSA_STATUS_SUCCESS)
+        goto fail;
+      count -= before;
+    }
+
+  if (count == 0)
+    return true;
+
+  ptr += before;
+
+  uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
+  status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
+  if (tail && status == HSA_STATUS_SUCCESS)
+    {
+      ptr += count - tail;
+      status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail);
+    }
+  if (status == HSA_STATUS_SUCCESS)
+    return true;
+
+fail:
+  GOMP_PLUGIN_error ("memory set failed");
+  return false;
+}
+
 void
 GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
@@ -5079,7 +5159,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
   queue_push_callback (aq, fn, data);
 }
 
-/* Queue up an asynchronous data copy from host to DEVICE.  */
+/* Queue up an asynchronous data copy from host to DEVICE.
+   (Also handles dev2host and dev2dev.)  */
 
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@@ -5097,10 +5178,16 @@ bool
 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                      size_t n, struct goacc_asyncqueue *aq)
 {
-  struct agent_info *agent = get_agent_info (device);
-  assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n);
-  return true;
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous data copy from DEVICE to DEVICE.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+                                    size_t n, struct goacc_asyncqueue *aq)
+{
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
 }
 
 union goacc_property_value
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..0ba445e 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2019,6 +2019,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 }
 
 static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  CUdeviceptr pb1, pb2;
+  size_t ps1, ps2;
+  if (!s)
+    return true;
+  if (!d1 || !d2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+  CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+      || (void *)(d2 + s) > (void *)(pb2 + ps2))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
@@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2267,6 +2298,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
 }
 
 bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return false;
+  CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+  return true;
+}
+
+bool
 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
                                      size_t n, struct goacc_asyncqueue *aq)
 {
@@ -2288,6 +2328,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+                                    size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+             aq->cuda_stream);
+  return true;
+}
+
 union goacc_property_value
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {
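Note on the GCN hunk: the split into an unaligned head, a 4-byte-aligned middle handed to hsa_amd_memory_fill, and a 0-to-3-byte tail can be checked with ordinary host arithmetic. The following is a minimal host-only sketch, not part of the patch: memset and memcpy stand in for hsa_memory_copy and hsa_amd_memory_fill, and the small-count guard is a simplification of the patch's threshold heuristic.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirror of the patch's segment arithmetic: head bytes to reach 4-byte
   alignment, a middle filled in 4-byte units, and a 0..3 byte tail.  */
static void
split_memset (unsigned char *ptr, int val, size_t count)
{
  uint8_t v8 = (uint8_t) val;
  size_t before = (4 - (uintptr_t) ptr % 4) % 4;   /* 0 to 3 bytes.  */
  if (before > count)
    before = count;                                /* tiny buffers */
  size_t tail = (count - before) % 4;              /* 0 to 3 bytes.  */
  size_t middle = count - before - tail;           /* multiple of 4 */

  memset (ptr, v8, before);                        /* hsa_memory_copy stand-in */
  uint32_t values = v8 | (v8 << 8) | (v8 << 16) | ((uint32_t) v8 << 24);
  for (size_t i = 0; i < middle / 4; i++)          /* hsa_amd_memory_fill stand-in */
    memcpy (ptr + before + 4 * i, &values, 4);
  memcpy (ptr + before + middle, &values, tail);   /* tail copy, as in the patch */
}

int
main (void)
{
  unsigned char buf[64];
  /* Deliberately misaligned start and a length that is not a multiple of 4.  */
  split_memset (buf + 1, 0xAB, 57);
  for (int i = 1; i < 58; i++)
    assert (buf[i] == 0xAB);
  puts ("ok");
  return 0;
}

The thresholds in the patch (256 KiB when both extra copies are needed, 192 KiB when only one is) merely decide whether the extra hsa_memory_copy calls are worth it; the segment arithmetic is the same either way.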
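On the nvptx side the new hook is a thin wrapper around cuMemsetD8, which fills N bytes with an unsigned char value and needs no alignment handling. The following standalone CUDA driver API sketch exercises the same call outside libgomp; it assumes a CUDA driver and one visible device, with error handling reduced to a macro.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                              \
  do {                                                           \
    CUresult r_ = (call);                                        \
    if (r_ != CUDA_SUCCESS)                                      \
      { fprintf (stderr, "%s failed: %d\n", #call, (int) r_);    \
        exit (1); }                                              \
  } while (0)

enum { N = 1000 };   /* any byte count; cuMemsetD8 has no alignment requirement */

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  CUdeviceptr dptr;
  unsigned char host[N];

  CHECK (cuInit (0));
  CHECK (cuDeviceGet (&dev, 0));
  CHECK (cuCtxCreate (&ctx, 0, dev));
  CHECK (cuMemAlloc (&dptr, N));

  /* The same driver call GOMP_OFFLOAD_memset issues in the nvptx plugin.  */
  CHECK (cuMemsetD8 (dptr, 0xAB, N));

  CHECK (cuMemcpyDtoH (host, dptr, N));
  for (size_t i = 0; i < N; i++)
    if (host[i] != 0xAB)
      { fprintf (stderr, "mismatch at %zu\n", i); return 1; }
  puts ("ok");

  CHECK (cuMemFree (dptr));
  CHECK (cuCtxDestroy (ctx));
  return 0;
}

Because the driver accepts arbitrary byte counts here, the nvptx hook stays a three-line wrapper, whereas the GCN hook has to do the head/middle/tail split shown above.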