diff options
author | Tobias Burnus <tobias@codesourcery.com> | 2023-07-29 13:25:03 +0200 |
---|---|---|
committer | Tobias Burnus <tobias@codesourcery.com> | 2023-07-29 13:25:03 +0200 |
commit | 8b9e559fe7ca5715c74115322af99dbf9137a399 (patch) | |
tree | 1e9c2a5af7f731371133e14689a94c3af9812513 | |
parent | 5ffa9d0a5e22f6f763b7f04877a940689e7abcba (diff) | |
download | gcc-8b9e559fe7ca5715c74115322af99dbf9137a399.zip gcc-8b9e559fe7ca5715c74115322af99dbf9137a399.tar.gz gcc-8b9e559fe7ca5715c74115322af99dbf9137a399.tar.bz2 |
libgomp: cuda.h and omp_target_memcpy_rect cleanup
Fixes for commit r14-2792-g25072a477a56a727b369bf9b20f4d18198ff5894
"OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect",
namely:
In that commit, the code was changed to handle shared-memory devices;
however, as pointed out, omp_target_memcpy_check already set the pointer
to NULL in that case. Hence, this commit reverts to the prior version.
In cuda.h, it adds cuMemcpyPeer{,Async} for symmetry with cuMemcpy3DPeer
(all currently unused) and, in three structs, fixes reserved-member names
and removes a bogus 'const'.
It also changes DLSYM to DLSYM_OPT for memcpy2d/memcpy3d, as not all
plugins support the new functions yet.
include/ChangeLog:
* cuda/cuda.h (CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER):
Remove bogus 'const' from 'const void *dstHost' and fix reserved-member
names in those structs.
(cuMemcpyPeer, cuMemcpyPeerAsync): Add.
libgomp/ChangeLog:
* target.c (omp_target_memcpy_rect_worker): Undo dim=1 change for
GOMP_OFFLOAD_CAP_SHARED_MEM.
(omp_target_memcpy_rect_copy): Likewise for lock condition.
(gomp_load_plugin_for_device): Use DLSYM_OPT not DLSYM for
memcpy3d/memcpy2d.
* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d,
GOMP_OFFLOAD_memcpy3d): Use memset 0 to nullify reserved and
unused src/dst fields for that mem type; remove '{src,dst}LOD = 0'.
-rw-r--r-- | include/cuda/cuda.h | 12 | ||||
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 6 | ||||
-rw-r--r-- | libgomp/target.c | 52 |
3 files changed, 28 insertions, 42 deletions
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 09c3c2b..94fc64a 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -147,7 +147,7 @@ typedef struct { size_t dstXInBytes, dstY; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; @@ -162,16 +162,16 @@ typedef struct { const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; - void *dummy; + void *reserved0; size_t srcPitch, srcHeight; size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; - void *dummy2; + void *reserved1; size_t dstPitch, dstHeight; size_t WidthInBytes, Height, Depth; @@ -190,7 +190,7 @@ typedef struct { size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; CUcontext dstContext; @@ -246,6 +246,8 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t); CUresult cuMemAllocHost (void **, size_t); CUresult cuMemHostAlloc (void **, size_t, unsigned int); CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t); +CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); +CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream); #define cuMemcpyDtoH cuMemcpyDtoH_v2 diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 9cdc55c..00d4241 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1794,6 +1794,8 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. 
*/ CUDA_MEMCPY2D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim1_size; data.Height = dim0_len; @@ -1855,6 +1857,8 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ CUDA_MEMCPY3D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim2_size; data.Height = dim1_len; data.Depth = dim0_len; @@ -1874,7 +1878,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.dstXInBytes = dst_offset2_size; data.dstY = dst_offset1_len; data.dstZ = dst_offset0_len; - data.dstLOD = 0; if (src_ord == -1) { @@ -1891,7 +1894,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.srcXInBytes = src_offset2_size; data.srcY = src_offset1_len; data.srcZ = src_offset0_len; - data.srcLOD = 0; CUDA_CALL (cuMemcpy3D, &data); return true; diff --git a/libgomp/target.c b/libgomp/target.c index 5cf2e8d..cd4cc1b 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4540,33 +4540,22 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; - if (src_devicep != NULL && src_devicep == dst_devicep) - ret = src_devicep->dev2dev_func (src_devicep->target_id, - (char *) dst + dst_off, - (const char *) src + src_off, - length); - else if (src_devicep != NULL - && (dst_devicep == NULL - || (dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = src_devicep->dev2host_func (src_devicep->target_id, + if (dst_devicep == NULL && src_devicep == NULL) + { + memcpy ((char *) dst + dst_off, (const char *) src + src_off, + length); + ret = 1; + } + else if (src_devicep == NULL) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep != NULL - && (src_devicep == NULL - || 
(src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = dst_devicep->host2dev_func (dst_devicep->target_id, + else if (dst_devicep == NULL) + ret = src_devicep->dev2host_func (src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep == NULL && src_devicep == NULL) - { - memcpy ((char *) dst + dst_off, (const char *) src + src_off, - length); - ret = 1; - } else if (src_devicep == dst_devicep) ret = src_devicep->dev2dev_func (src_devicep->target_id, (char *) dst + dst_off, @@ -4584,7 +4573,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, else if (*tmp_size < length) { *tmp_size = length; - *tmp = realloc (*tmp, length); + free (*tmp); + *tmp = malloc (length); if (*tmp == NULL) return ENOMEM; } @@ -4599,7 +4589,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, return ret ? 0 : EINVAL; } - /* host->device, device->host and same-device device->device. */ + /* host->device, device->host and intra device. 
*/ if (num_dims == 2 && ((src_devicep && src_devicep == dst_devicep @@ -4711,16 +4701,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src, bool lock_src; bool lock_dst; - lock_src = (src_devicep - && (!dst_devicep - || src_devicep == dst_devicep - || !(src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))); - lock_dst = (dst_devicep - && (!lock_src - || (src_devicep != dst_devicep - && !(dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM)))); + lock_src = src_devicep != NULL; + lock_dst = dst_devicep != NULL && src_devicep != dst_devicep; if (lock_src) gomp_mutex_lock (&src_devicep->lock); if (lock_dst) @@ -5076,8 +5058,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); - DLSYM (memcpy2d); - DLSYM (memcpy3d); + DLSYM_OPT (memcpy2d, memcpy2d); + DLSYM_OPT (memcpy3d, memcpy3d); device->capabilities = device->get_caps_func (); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) { |