aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorTobias Burnus <tobias@codesourcery.com>2023-07-26 16:22:35 +0200
committerTobias Burnus <tobias@codesourcery.com>2023-07-26 16:22:35 +0200
commit25072a477a56a727b369bf9b20f4d18198ff5894 (patch)
tree8dc40c0f128509b0e5c78ff32d5102c321bbaa4d /include
parentc194a413369e9c9f92f1c9334556b359c7417742 (diff)
downloadgcc-25072a477a56a727b369bf9b20f4d18198ff5894.zip
gcc-25072a477a56a727b369bf9b20f4d18198ff5894.tar.gz
gcc-25072a477a56a727b369bf9b20f4d18198ff5894.tar.bz2
OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect
When copying a 2D or 3D rectangular memory block, the performance is better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the data one by one. That's what this commit does. Additionally, it permits device-to-device copies, if necessary using a temporary variable on the host. include/ChangeLog: * cuda/cuda.h (CUlimit): Add CUDA_ERROR_NOT_INITIALIZED, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_HANDLE. (CUarray, CUmemorytype, CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER): New typedefs. (cuMemcpy2D, cuMemcpy2DAsync, cuMemcpy2DUnaligned, cuMemcpy3D, cuMemcpy3DAsync, cuMemcpy3DPeer, cuMemcpy3DPeerAsync): New prototypes. libgomp/ChangeLog: * libgomp-plugin.h (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New prototypes. * libgomp.h (struct gomp_device_descr): Add memcpy2d_func and memcpy3d_func. * libgomp.texi (nvptx): Document when cuMemcpy2D/cuMemcpy3D is used. * oacc-host.c (memcpy2d_func, .memcpy3d_func): Init with NULL. * plugin/cuda-lib.def (cuMemcpy2D, cuMemcpy2DUnaligned, cuMemcpy3D): Invoke via CUDA_ONE_CALL. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New. * target.c (omp_target_memcpy_rect_worker): (omp_target_memcpy_rect_check, omp_target_memcpy_rect_copy): Permit all device-to-device copies; invoke new plugins for 2D and 3D copying when available. (gomp_load_plugin_for_device): DLSYM the new plugin functions. * testsuite/libgomp.c/target-12.c: Fix dimension bug. * testsuite/libgomp.fortran/target-12.f90: Likewise. * testsuite/libgomp.fortran/target-memcpy-rect-1.f90: New test.
Diffstat (limited to 'include')
-rw-r--r--include/cuda/cuda.h85
1 files changed, 85 insertions, 0 deletions
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 338626f..09c3c2b 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -47,6 +47,7 @@ typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
+typedef void *CUarray;
typedef size_t (*CUoccupancyB2DSize)(int);
typedef void *CUstream;
@@ -54,7 +55,10 @@ typedef enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_OUT_OF_MEMORY = 2,
+ CUDA_ERROR_NOT_INITIALIZED = 3,
+ CUDA_ERROR_DEINITIALIZED = 4,
CUDA_ERROR_INVALID_CONTEXT = 201,
+ CUDA_ERROR_INVALID_HANDLE = 400,
CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600,
CUDA_ERROR_LAUNCH_FAILED = 719,
@@ -126,6 +130,75 @@ typedef enum {
CU_LIMIT_MALLOC_HEAP_SIZE = 0x02,
} CUlimit;
+typedef enum { /* Type of the srcMemoryType/dstMemoryType fields in the copy descriptors below.  */
+ CU_MEMORYTYPE_HOST = 0x01,
+ CU_MEMORYTYPE_DEVICE = 0x02,
+ CU_MEMORYTYPE_ARRAY = 0x03,
+ CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+
+typedef struct { /* Copy descriptor passed by pointer to cuMemcpy2D, cuMemcpy2DAsync and cuMemcpy2DUnaligned.  */
+ size_t srcXInBytes, srcY;
+ CUmemorytype srcMemoryType;
+ const void *srcHost; /* Source is only read, hence const.  */
+ CUdeviceptr srcDevice;
+ CUarray srcArray;
+ size_t srcPitch;
+
+ size_t dstXInBytes, dstY;
+ CUmemorytype dstMemoryType;
+ void *dstHost; /* Not const: the CUDA driver API declares the destination host pointer as 'void *'.  */
+ CUdeviceptr dstDevice;
+ CUarray dstArray;
+ size_t dstPitch;
+
+ size_t WidthInBytes, Height;
+} CUDA_MEMCPY2D;
+
+typedef struct { /* Copy descriptor passed by pointer to cuMemcpy3D and cuMemcpy3DAsync.  */
+ size_t srcXInBytes, srcY, srcZ;
+ size_t srcLOD;
+ CUmemorytype srcMemoryType;
+ const void *srcHost; /* Source is only read, hence const.  */
+ CUdeviceptr srcDevice;
+ CUarray srcArray;
+ void *dummy; /* Occupies the slot where CUDA_MEMCPY3D_PEER has srcContext.  */
+ size_t srcPitch, srcHeight;
+
+ size_t dstXInBytes, dstY, dstZ;
+ size_t dstLOD;
+ CUmemorytype dstMemoryType;
+ void *dstHost; /* Not const: the CUDA driver API declares the destination host pointer as 'void *'.  */
+ CUdeviceptr dstDevice;
+ CUarray dstArray;
+ void *dummy2; /* Occupies the slot where CUDA_MEMCPY3D_PEER has dstContext.  */
+ size_t dstPitch, dstHeight;
+
+ size_t WidthInBytes, Height, Depth;
+} CUDA_MEMCPY3D;
+
+typedef struct { /* Copy descriptor passed by pointer to cuMemcpy3DPeer and cuMemcpy3DPeerAsync.  */
+ size_t srcXInBytes, srcY, srcZ;
+ size_t srcLOD;
+ CUmemorytype srcMemoryType;
+ const void *srcHost; /* Source is only read, hence const.  */
+ CUdeviceptr srcDevice;
+ CUarray srcArray;
+ CUcontext srcContext;
+ size_t srcPitch, srcHeight;
+
+ size_t dstXInBytes, dstY, dstZ;
+ size_t dstLOD;
+ CUmemorytype dstMemoryType;
+ void *dstHost; /* Not const: the CUDA driver API declares the destination host pointer as 'void *'.  */
+ CUdeviceptr dstDevice;
+ CUarray dstArray;
+ CUcontext dstContext;
+ size_t dstPitch, dstHeight;
+
+ size_t WidthInBytes, Height, Depth;
+} CUDA_MEMCPY3D_PEER;
+
#define cuCtxCreate cuCtxCreate_v2
CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
#define cuCtxDestroy cuCtxDestroy_v2
@@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
+#define cuMemcpy2D cuMemcpy2D_v2
+CUresult cuMemcpy2D (const CUDA_MEMCPY2D *); /* Declared as cuMemcpy2D_v2 because of the macro above.  */
+#define cuMemcpy2DAsync cuMemcpy2DAsync_v2
+CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream); /* Same descriptor, plus a CUstream argument.  */
+#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
+CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *);
+#define cuMemcpy3D cuMemcpy3D_v2
+CUresult cuMemcpy3D (const CUDA_MEMCPY3D *); /* Declared as cuMemcpy3D_v2 because of the macro above.  */
+#define cuMemcpy3DAsync cuMemcpy3DAsync_v2
+CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream); /* Same descriptor, plus a CUstream argument.  */
+CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *); /* No _v2 alias is defined for the Peer variants.  */
+CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);
#define cuMemFree cuMemFree_v2
CUresult cuMemFree (CUdeviceptr);
CUresult cuMemFreeHost (void *);