about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- include/cuda/cuda.h | 12
-rw-r--r-- libgomp/plugin/plugin-nvptx.c | 6
-rw-r--r-- libgomp/target.c | 52
3 files changed, 28 insertions, 42 deletions
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 09c3c2b..94fc64a 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -147,7 +147,7 @@ typedef struct {
size_t dstXInBytes, dstY;
CUmemorytype dstMemoryType;
- const void *dstHost;
+ void *dstHost;
CUdeviceptr dstDevice;
CUarray dstArray;
size_t dstPitch;
@@ -162,16 +162,16 @@ typedef struct {
const void *srcHost;
CUdeviceptr srcDevice;
CUarray srcArray;
- void *dummy;
+ void *reserved0;
size_t srcPitch, srcHeight;
size_t dstXInBytes, dstY, dstZ;
size_t dstLOD;
CUmemorytype dstMemoryType;
- const void *dstHost;
+ void *dstHost;
CUdeviceptr dstDevice;
CUarray dstArray;
- void *dummy2;
+ void *reserved1;
size_t dstPitch, dstHeight;
size_t WidthInBytes, Height, Depth;
@@ -190,7 +190,7 @@ typedef struct {
size_t dstXInBytes, dstY, dstZ;
size_t dstLOD;
CUmemorytype dstMemoryType;
- const void *dstHost;
+ void *dstHost;
CUdeviceptr dstDevice;
CUarray dstArray;
CUcontext dstContext;
@@ -246,6 +246,8 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
+CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t);
+CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream);
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
#define cuMemcpyDtoH cuMemcpyDtoH_v2
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 9cdc55c..00d4241 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1794,6 +1794,8 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
/* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
CUDA_MEMCPY2D data;
+
+ memset (&data, 0, sizeof (data));
data.WidthInBytes = dim1_size;
data.Height = dim0_len;
@@ -1855,6 +1857,8 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
/* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
CUDA_MEMCPY3D data;
+
+ memset (&data, 0, sizeof (data));
data.WidthInBytes = dim2_size;
data.Height = dim1_len;
data.Depth = dim0_len;
@@ -1874,7 +1878,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
data.dstXInBytes = dst_offset2_size;
data.dstY = dst_offset1_len;
data.dstZ = dst_offset0_len;
- data.dstLOD = 0;
if (src_ord == -1)
{
@@ -1891,7 +1894,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
data.srcXInBytes = src_offset2_size;
data.srcY = src_offset1_len;
data.srcZ = src_offset0_len;
- data.srcLOD = 0;
CUDA_CALL (cuMemcpy3D, &data);
return true;
diff --git a/libgomp/target.c b/libgomp/target.c
index 5cf2e8d..cd4cc1b 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4540,33 +4540,22 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
|| __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
|| __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
return EINVAL;
- if (src_devicep != NULL && src_devicep == dst_devicep)
- ret = src_devicep->dev2dev_func (src_devicep->target_id,
- (char *) dst + dst_off,
- (const char *) src + src_off,
- length);
- else if (src_devicep != NULL
- && (dst_devicep == NULL
- || (dst_devicep->capabilities
- & GOMP_OFFLOAD_CAP_SHARED_MEM)))
- ret = src_devicep->dev2host_func (src_devicep->target_id,
+ if (dst_devicep == NULL && src_devicep == NULL)
+ {
+ memcpy ((char *) dst + dst_off, (const char *) src + src_off,
+ length);
+ ret = 1;
+ }
+ else if (src_devicep == NULL)
+ ret = dst_devicep->host2dev_func (dst_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);
- else if (dst_devicep != NULL
- && (src_devicep == NULL
- || (src_devicep->capabilities
- & GOMP_OFFLOAD_CAP_SHARED_MEM)))
- ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+ else if (dst_devicep == NULL)
+ ret = src_devicep->dev2host_func (src_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);
- else if (dst_devicep == NULL && src_devicep == NULL)
- {
- memcpy ((char *) dst + dst_off, (const char *) src + src_off,
- length);
- ret = 1;
- }
else if (src_devicep == dst_devicep)
ret = src_devicep->dev2dev_func (src_devicep->target_id,
(char *) dst + dst_off,
@@ -4584,7 +4573,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
else if (*tmp_size < length)
{
*tmp_size = length;
- *tmp = realloc (*tmp, length);
+ free (*tmp);
+ *tmp = malloc (length);
if (*tmp == NULL)
return ENOMEM;
}
@@ -4599,7 +4589,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
return ret ? 0 : EINVAL;
}
- /* host->device, device->host and same-device device->device. */
+ /* host->device, device->host and intra device. */
if (num_dims == 2
&& ((src_devicep
&& src_devicep == dst_devicep
@@ -4711,16 +4701,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src,
bool lock_src;
bool lock_dst;
- lock_src = (src_devicep
- && (!dst_devicep
- || src_devicep == dst_devicep
- || !(src_devicep->capabilities
- & GOMP_OFFLOAD_CAP_SHARED_MEM)));
- lock_dst = (dst_devicep
- && (!lock_src
- || (src_devicep != dst_devicep
- && !(dst_devicep->capabilities
- & GOMP_OFFLOAD_CAP_SHARED_MEM))));
+ lock_src = src_devicep != NULL;
+ lock_dst = dst_devicep != NULL && src_devicep != dst_devicep;
if (lock_src)
gomp_mutex_lock (&src_devicep->lock);
if (lock_dst)
@@ -5076,8 +5058,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM (free);
DLSYM (dev2host);
DLSYM (host2dev);
- DLSYM (memcpy2d);
- DLSYM (memcpy3d);
+ DLSYM_OPT (memcpy2d, memcpy2d);
+ DLSYM_OPT (memcpy3d, memcpy3d);
device->capabilities = device->get_caps_func ();
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
{