diff options
author | Tobias Burnus <tobias@codesourcery.com> | 2023-07-26 16:22:35 +0200 |
---|---|---|
committer | Tobias Burnus <tobias@codesourcery.com> | 2023-07-26 16:22:35 +0200 |
commit | 25072a477a56a727b369bf9b20f4d18198ff5894 (patch) | |
tree | 8dc40c0f128509b0e5c78ff32d5102c321bbaa4d /libgomp/target.c | |
parent | c194a413369e9c9f92f1c9334556b359c7417742 (diff) | |
download | gcc-25072a477a56a727b369bf9b20f4d18198ff5894.zip gcc-25072a477a56a727b369bf9b20f4d18198ff5894.tar.gz gcc-25072a477a56a727b369bf9b20f4d18198ff5894.tar.bz2 |
OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect
When copying a 2D or 3D rectangular memory block, the performance is
better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the
data one by one. That's what this commit does.
Additionally, it permits device-to-device copies, if necessary, using a
temporary variable on the host.
include/ChangeLog:
* cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED,
CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE.
(CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D,
CUDA_MEMCPY3D_PEER): New typedefs.
(cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned,
cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer,
cuMemcpy3DPeerAsync): New prototypes.
libgomp/ChangeLog:
* libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d,
GOMP_OFFLOAD_memcpy3d): New prototypes.
* libgomp.h (struct gomp_device_descr): Add memcpy2d_func
and memcpy3d_func.
* libgomp.texi (nvptx): Document when cuMemcpy2D/cuMemcpy3D is used.
* oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL.
* plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned,
cuMemcpy3D): Invoke via CUDA_ONE_CALL.
* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d,
GOMP_OFFLOAD_memcpy3d): New.
* target.c (omp_target_memcpy_rect_worker):
(omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy):
Permit all device-to-device copies; invoke new plugins for
2D and 3D copying when available.
(gomp_load_plugin_for_device): DLSYM the new plugin functions.
* testsuite/libgomp.c/target-12.c: Fix dimension bug.
* testsuite/libgomp.fortran/target-12.f90: Likewise.
* testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test.
Diffstat (limited to 'libgomp/target.c')
-rw-r--r-- | libgomp/target.c | 152 |
1 files changed, 127 insertions, 25 deletions
diff --git a/libgomp/target.c b/libgomp/target.c index 80c25a1..5cf2e8d 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, const size_t *dst_dimensions, const size_t *src_dimensions, struct gomp_device_descr *dst_devicep, - struct gomp_device_descr *src_devicep) + struct gomp_device_descr *src_devicep, + size_t *tmp_size, void **tmp) { size_t dst_slice = element_size; size_t src_slice = element_size; @@ -4539,36 +4540,120 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; - if (dst_devicep == NULL && src_devicep == NULL) - { - memcpy ((char *) dst + dst_off, (const char *) src + src_off, - length); - ret = 1; - } - else if (src_devicep == NULL) - ret = dst_devicep->host2dev_func (dst_devicep->target_id, + if (src_devicep != NULL && src_devicep == dst_devicep) + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_off, + (const char *) src + src_off, + length); + else if (src_devicep != NULL + && (dst_devicep == NULL + || (dst_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))) + ret = src_devicep->dev2host_func (src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep == NULL) - ret = src_devicep->dev2host_func (src_devicep->target_id, + else if (dst_devicep != NULL + && (src_devicep == NULL + || (src_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); + else if (dst_devicep == NULL && src_devicep == NULL) + { + memcpy ((char *) dst + dst_off, (const char *) src + src_off, + length); + ret = 1; + } else if (src_devicep == dst_devicep) ret = src_devicep->dev2dev_func 
(src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); else - ret = 0; + { + if (*tmp_size == 0) + { + *tmp_size = length; + *tmp = malloc (length); + if (*tmp == NULL) + return ENOMEM; + } + else if (*tmp_size < length) + { + *tmp_size = length; + *tmp = realloc (*tmp, length); + if (*tmp == NULL) + return ENOMEM; + } + ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp, + (const char *) src + src_off, + length); + if (ret == 1) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, *tmp, + length); + } return ret ? 0 : EINVAL; } - /* FIXME: it would be nice to have some plugin function to handle - num_dims == 2 and num_dims == 3 more efficiently. Larger ones can - be handled in the generic recursion below, and for host-host it - should be used even for any num_dims >= 2. */ + /* host->device, device->host and same-device device->device. */ + if (num_dims == 2 + && ((src_devicep + && src_devicep == dst_devicep + && src_devicep->memcpy2d_func) + || (!src_devicep != !dst_devicep + && ((src_devicep && src_devicep->memcpy2d_func) + || (dst_devicep && dst_devicep->memcpy2d_func))))) + { + size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1; + int dst_id = dst_devicep ? dst_devicep->target_id : -1; + int src_id = src_devicep ? src_devicep->target_id : -1; + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep; + + if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1) + || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1) + || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1) + || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1) + || __builtin_mul_overflow (src_offsets[1], element_size, + &src_off_sz1)) + return EINVAL; + ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0], + dst, dst_off_sz1, dst_offsets[0], dst_sz1, + src, src_off_sz1, src_offsets[0], src_sz1); + if (ret != -1) + return ret ? 
0 : EINVAL; + } + else if (num_dims == 3 + && ((src_devicep + && src_devicep == dst_devicep + && src_devicep->memcpy3d_func) + || (!src_devicep != !dst_devicep + && ((src_devicep && src_devicep->memcpy3d_func) + || (dst_devicep && dst_devicep->memcpy3d_func))))) + { + size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2; + int dst_id = dst_devicep ? dst_devicep->target_id : -1; + int src_id = src_devicep ? src_devicep->target_id : -1; + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep; + + if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2) + || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2) + || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2) + || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2) + || __builtin_mul_overflow (src_offsets[2], element_size, + &src_off_sz2)) + return EINVAL; + ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0], + dst, dst_off_sz2, dst_offsets[1], + dst_offsets[0], dst_sz2, dst_dimensions[1], + src, src_off_sz2, src_offsets[1], + src_offsets[0], src_sz2, src_dimensions[1]); + if (ret != -1) + return ret ? 
0 : EINVAL; + } for (i = 1; i < num_dims; i++) if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice) @@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, volume + 1, dst_offsets + 1, src_offsets + 1, dst_dimensions + 1, src_dimensions + 1, dst_devicep, - src_devicep); + src_devicep, tmp_size, tmp); if (ret) return ret; dst_off += dst_slice; @@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num, if (ret) return ret; - if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep) - return EINVAL; - return 0; } @@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void *src, struct gomp_device_descr *dst_devicep, struct gomp_device_descr *src_devicep) { - if (src_devicep) + size_t tmp_size = 0; + void *tmp = NULL; + bool lock_src; + bool lock_dst; + + lock_src = (src_devicep + && (!dst_devicep + || src_devicep == dst_devicep + || !(src_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM))); + lock_dst = (dst_devicep + && (!lock_src + || (src_devicep != dst_devicep + && !(dst_devicep->capabilities + & GOMP_OFFLOAD_CAP_SHARED_MEM)))); + if (lock_src) gomp_mutex_lock (&src_devicep->lock); - else if (dst_devicep) + if (lock_dst) gomp_mutex_lock (&dst_devicep->lock); int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims, volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions, - dst_devicep, src_devicep); - if (src_devicep) + dst_devicep, src_devicep, + &tmp_size, &tmp); + if (lock_src) gomp_mutex_unlock (&src_devicep->lock); - else if (dst_devicep) + if (lock_dst) gomp_mutex_unlock (&dst_devicep->lock); + if (tmp) + free (tmp); return ret; } @@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); + DLSYM (memcpy2d); + DLSYM (memcpy3d); device->capabilities = device->get_caps_func (); if (device->capabilities & 
GOMP_OFFLOAD_CAP_OPENMP_400) { |