diff options
Diffstat (limited to 'libgomp/testsuite/libgomp.c')
16 files changed, 905 insertions, 0 deletions
diff --git a/libgomp/testsuite/libgomp.c/interop-cublas-full.c b/libgomp/testsuite/libgomp.c/interop-cublas-full.c new file mode 100644 index 0000000..2df5277 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cublas-full.c @@ -0,0 +1,176 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-additional-options "-lcublas" } */ + +/* NOTE: This file is also included by libgomp.c-c++-common/interop-cudablas-libonly.c + to test the fallback version. */ + +/* Check whether cuBlas' daxpy works with an interop object. + daxpy(N, DA, DX, INCX, DY, INCY) + calculates (for DX = DY = 1): + DY(1:N) = DY(1:N) + DA * DX(1:N) + and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. + +Based on the interop example in OpenMP's example document */ + +/* Minimal check whether CUDA works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#include <assert.h> +#include <omp.h> +#include "../libgomp.c-c++-common/on_device_arch.h" + + +#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #include <cublas_v2.h> + +#else + /* Add a poor man's fallback declaration. */ + #if USE_CUDA_FALLBACK_HEADER + // Don't warn. + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #elif !__has_include(<cuda_runtime.h>) + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #else + #warning "Using GCC's cuda.h as fallback for cublas_v2.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + } cublasStatus_t; + + typedef CUstream cudaStream_t; + typedef struct cublasContext* cublasHandle_t; + + #define cublasCreate cublasCreate_v2 + cublasStatus_t cublasCreate_v2 (cublasHandle_t *); + + #define cublasSetStream cublasSetStream_v2 + cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); + + #define cublasDaxpy cublasDaxpy_v2 + cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); +#endif + +static int used_variant = 0; + +void +run_cuBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) +{ + used_variant = 1; + + omp_interop_rc_t res; + cublasStatus_t stat; + + omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success && fr == omp_ifr_cuda); + + cudaStream_t stream = (cudaStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + cublasHandle_t handle; + stat = cublasCreate (&handle); + assert (stat == CUBLAS_STATUS_SUCCESS); + + stat = cublasSetStream (handle, stream); + assert (stat == CUBLAS_STATUS_SUCCESS); + + /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ + stat = cublasDaxpy (handle, n, &da, dx, 1, dy, 1) ; + assert (stat == CUBLAS_STATUS_SUCCESS); +} + + +#pragma omp declare variant(run_cuBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("cuda"))) + +void +run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) +{ + used_variant = 2; + + if (incx == 1 && incy == 1) + #pragma omp simd + for (int i = 0; i < n; i++) + dy[i] += da * dx[i]; + else + { + int ix = 0; + int iy = 0; + for (int i = 0; i < n; i++) + { + dy[iy] += da * dx[ix]; + ix += incx; + iy += incy; + } + } +} + + +void +run_test (int dev) +{ + constexpr int N = 1024; + + // A = {1,2,...,N} + // B = {-1, -2, ..., N} + // B' = daxpy (N, 3, A, incx=1, B, incy=1) + // = B + 3*A + // -> B' = {0, 2, 4, 6, ... } + + double A[N], B[N]; + double factor = 3.0; + for (int i = 0; i < N; i++) + { + A[i] = i; + B[i] = -i; + } + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target enter data device(dev) map(A, B) + } + + used_variant = 99; + #pragma omp dispatch device(dev) + run_daxpy (N, factor, A, 1, B, 1); + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target exit data device(dev) map(release: A) map(from: B) + + int tmp = omp_get_default_device (); + omp_set_default_device (dev); + if (on_device_arch_nvptx ()) + assert (used_variant == 1); + else + assert (used_variant == 2); + omp_set_default_device (tmp); + } + else + assert (used_variant == 2); + + for (int i = 0; i < N; i++) + assert (B[i] == 2*i); +} + +int +main () +{ + int ndev = omp_get_num_devices (); + + for (int dev = 0; dev <= ndev; dev++) + run_test (dev); + run_test (omp_initial_device); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c b/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c new file mode 100644 index 0000000..89c0652 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target openacc_libcublas } */ +/* { dg-additional-options "-lcublas" } */ + +/* Same as interop-cudablas-full.c, but also works if the header is not available. */ + +#define USE_CUDA_FALLBACK_HEADER 1 +#include "interop-cublas-full.c" diff --git a/libgomp/testsuite/libgomp.c/interop-cuda-full.c b/libgomp/testsuite/libgomp.c/interop-cuda-full.c new file mode 100644 index 0000000..38aa6b1 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cuda-full.c @@ -0,0 +1,159 @@ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +/* NOTE: This file is also included by libgomp.c-c++-common/interop-cuda-libonly.c + to test the fallback version, which defines USE_CUDA_FALLBACK_HEADER. */ + +/* Minimal check whether CUDA works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#include <assert.h> +#include <omp.h> + +#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + +#else + /* Add a poor man's fallback declaration. */ + #if USE_CUDA_FALLBACK_HEADER + // Don't warn. + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #else + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef int cudaError_t; + typedef CUstream cudaStream_t; + enum { + cudaSuccess = 0 + }; + + enum cudaDeviceAttr { + cudaDevAttrClockRate = 13, + cudaDevAttrMaxGridDimX = 5 + }; + + cudaError_t cudaDeviceGetAttribute (int *, enum cudaDeviceAttr, int); + cudaError_t cudaStreamQuery(cudaStream_t); + CUresult cuCtxGetApiVersion(CUcontext, unsigned int *); + CUresult cuStreamGetCtx (CUstream, CUcontext *); +#endif + +int +main () +{ + int ivar; + unsigned uvar; + omp_interop_rc_t res; + omp_interop_t obj_cuda = omp_interop_none; + omp_interop_t obj_cuda_driver = omp_interop_none; + cudaError_t cuda_err; + CUresult cu_err; + + #pragma omp interop init(target, targetsync, prefer_type("cuda") : obj_cuda) \ + init(target, targetsync, prefer_type("cuda_driver") : obj_cuda_driver) \ + + omp_interop_fr_t fr = (omp_interop_fr_t) omp_get_interop_int (obj_cuda, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_cuda); + + fr = (omp_interop_fr_t) omp_get_interop_int (obj_cuda_driver, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_cuda_driver); + + ivar = (int) omp_get_interop_int (obj_cuda, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + assert (ivar == 11); + + ivar = (int) omp_get_interop_int (obj_cuda_driver, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + assert (ivar == 11); + + + /* Check whether the omp_ipr_device -> cudaDevice_t yields a valid device. */ + + CUdevice cu_dev = (int) omp_get_interop_int (obj_cuda_driver, omp_ipr_device, &res); + assert (res == omp_irc_success); + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + cu_err = cuDeviceGetAttribute (&ivar, cudaDevAttrClockRate, cu_dev); + assert (cu_err == CUDA_SUCCESS); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + cu_err = cuDeviceGetAttribute (&ivar, cudaDevAttrMaxGridDimX, cu_dev); + assert (cu_err == CUDA_SUCCESS); + assert (ivar > 1024); + + int cuda_dev = (int) omp_get_interop_int (obj_cuda, omp_ipr_device, &res); + assert (res == omp_irc_success); + assert (cuda_dev == (CUdevice) cu_dev); // Assume they are the same ... + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + cuda_err = cudaDeviceGetAttribute (&ivar, cudaDevAttrClockRate, cuda_dev); + assert (cuda_err == cudaSuccess); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + cuda_err = cudaDeviceGetAttribute (&ivar, cudaDevAttrMaxGridDimX, cuda_dev); + assert (cuda_err == cudaSuccess); + assert (ivar > 1024); + + + + + /* Check whether the omp_ipr_device_context -> CUcontext yields a context. */ + + CUcontext cu_ctx = (CUcontext) omp_get_interop_ptr (obj_cuda_driver, omp_ipr_device_context, &res); + assert (res == omp_irc_success); + + /* Assume API Version > 0 for Nvidia, cudaErrorNotSupported for AMD. */ + uvar = 99; + cu_err = cuCtxGetApiVersion (cu_ctx, &uvar); + assert (cu_err == CUDA_SUCCESS); + assert (uvar > 0); + + + /* Check whether the omp_ipr_targetsync -> cudaStream_t yields a stream. */ + + cudaStream_t cuda_sm = (cudaStream_t) omp_get_interop_ptr (obj_cuda, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + CUstream cu_sm = (cudaStream_t) omp_get_interop_ptr (obj_cuda_driver, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + assert ((void*) cu_sm != (void*) cuda_sm); // Type compatible but should have created two streams + + int dev_stream = 99; +#if CUDA_VERSION >= 12080 + cuda_err = cudaStreamGetDevice (cuda_sm, &dev_stream); + assert (cuda_err == cudaSuccess); +#else + cu_err = cuStreamGetCtx (cu_sm, &cu_ctx) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cuda_err = cuCtxPushCurrent (cu_ctx) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cuda_err = cuCtxGetDevice (&dev_stream) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cu_err = cuCtxPopCurrent (&cu_ctx) != CUDA_SUCCESS; + assert (cu_err == CUDA_SUCCESS); +#endif + assert (dev_stream == cuda_dev); + + /* All jobs should have been completed (as there were none none) */ + cuda_err = cudaStreamQuery (cuda_sm); + assert (cuda_err == cudaSuccess); + + cu_err = cuStreamQuery (cu_sm); + assert (cu_err == CUDA_SUCCESS); + + #pragma omp interop destroy(obj_cuda, obj_cuda_driver) +} diff --git a/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c b/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c new file mode 100644 index 0000000..17cbb15 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_libcudart } */ +/* { dg-require-effective-target openacc_libcuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +/* Same as interop-cuda-full.c, but also works if the header is not available. */ + +#define USE_CUDA_FALLBACK_HEADER 1 +#include "interop-cuda-full.c" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c b/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c new file mode 100644 index 0000000..d7725fc --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target gomp_hip_header_amd } */ +/* { dg-require-effective-target gomp_libamdhip64 } */ +/* { dg-additional-options "-lamdhip64" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c new file mode 100644 index 0000000..2584537 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target gomp_libamdhip64 } */ +/* { dg-additional-options "-lamdhip64" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c new file mode 100644 index 0000000..79af47d --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-require-effective-target gomp_hip_header_nvidia } */ +/* { dg-additional-options "-lcuda -lcudart -Wno-deprecated-declarations" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c new file mode 100644 index 0000000..4586398 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c @@ -0,0 +1,10 @@ +/* { dg-require-effective-target openacc_libcudart } */ +/* { dg-require-effective-target openacc_libcuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 +#define USE_CUDA_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c new file mode 100644 index 0000000..4186984 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip.h b/libgomp/testsuite/libgomp.c/interop-hip.h new file mode 100644 index 0000000..20a1ccb --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip.h @@ -0,0 +1,234 @@ +/* Minimal check whether HIP works - by checking whether the API routines + seem to work. This includes various fallbacks if the header is not + available. */ + +#include <assert.h> +#include <omp.h> + +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if __has_include(<hip/hip_runtime_api.h>) && !defined(USE_HIP_FALLBACK_HEADER) + #include <hip/hip_runtime_api.h> + +#elif defined(__HIP_PLATFORM_AMD__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hip/hip_runtime_api.h> for __HIP_PLATFORM_AMD__" + #endif + + typedef struct ihipStream_t* hipStream_t; + typedef struct ihipCtx_t* hipCtx_t; + typedef int hipError_t; + typedef int hipDevice_t; + enum { + hipSuccess = 0, + hipErrorNotSupported = 801 + }; + + typedef enum hipDeviceAttribute_t { + hipDeviceAttributeClockRate = 5, + hipDeviceAttributeMaxGridDimX = 29 + } hipDeviceAttribute_t; + + hipError_t hipDeviceGetAttribute (int *, hipDeviceAttribute_t, hipDevice_t); + hipError_t hipCtxGetApiVersion (hipCtx_t, int *); + hipError_t hipStreamGetDevice (hipStream_t, hipDevice_t *); + hipError_t hipStreamQuery (hipStream_t); + +#elif defined(__HIP_PLATFORM_NVIDIA__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hip/hip_runtime_api.h> for __HIP_PLATFORM_NVIDIA__" + #endif + + #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #else + #if defined(USE_CUDA_FALLBACK_HEADER) + // no warning + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #else + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #endif + + #include "../../../include/cuda/cuda.h" + + typedef int cudaError_t; + enum { + cudaSuccess = 0 + }; + + enum cudaDeviceAttr { + cudaDevAttrClockRate = 13, + cudaDevAttrMaxGridDimX = 5 + }; + + cudaError_t cudaDeviceGetAttribute (int *, enum cudaDeviceAttr, int); + CUresult cuCtxGetApiVersion(CUcontext, unsigned int *); + CUresult cuStreamGetCtx (CUstream, CUcontext *); + #endif + + typedef CUstream hipStream_t; + typedef CUcontext hipCtx_t; + typedef CUdevice hipDevice_t; + + typedef int hipError_t; + typedef int hipDevice_t; + enum { + hipSuccess = 0, + hipErrorNotSupported = 801 + }; + + + typedef enum hipDeviceAttribute_t { + hipDeviceAttributeClockRate = 5, + hipDeviceAttributeMaxGridDimX = 29 + } hipDeviceAttribute_t; + + inline static hipError_t + hipDeviceGetAttribute (int *ival, hipDeviceAttribute_t attr, hipDevice_t dev) + { + enum cudaDeviceAttr cuattr; + switch (attr) + { + case hipDeviceAttributeClockRate: + cuattr = cudaDevAttrClockRate; + break; + case hipDeviceAttributeMaxGridDimX: + cuattr = cudaDevAttrMaxGridDimX; + break; + default: + assert (0); + } + return cudaDeviceGetAttribute (ival, cuattr, dev) != cudaSuccess; + } + + inline static hipError_t + hipCtxGetApiVersion (hipCtx_t ctx, int *ver) + { + unsigned uver; + hipError_t err; + err = cuCtxGetApiVersion (ctx, &uver) != CUDA_SUCCESS; + *ver = (int) uver; + return err; + } + + inline static hipError_t + hipStreamGetDevice (hipStream_t stream, hipDevice_t *dev) + { +#if CUDA_VERSION >= 12080 + return cudaStreamGetDevice (stream, dev); +#else + hipError_t err; + CUcontext ctx; + err = cuStreamGetCtx (stream, &ctx) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxPushCurrent (ctx) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxGetDevice (dev) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxPopCurrent (&ctx) != CUDA_SUCCESS; + return err; +#endif + } + + inline static hipError_t + hipStreamQuery (hipStream_t stream) + { + return cuStreamQuery (stream) != CUDA_SUCCESS; + } + +#else + #error "should be unreachable" +#endif + +int +main () +{ + int ivar; + omp_interop_rc_t res; + omp_interop_t obj = omp_interop_none; + hipError_t hip_err; + + #pragma omp interop init(target, targetsync, prefer_type("hip") : obj) + + omp_interop_fr_t fr = (omp_interop_fr_t) omp_get_interop_int (obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_hip); + + ivar = (int) omp_get_interop_int (obj, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + int vendor_is_amd = ivar == 1; + #if defined(__HIP_PLATFORM_AMD__) + assert (ivar == 1); + #elif defined(__HIP_PLATFORM_NVIDIA__) + assert (ivar == 11); + #else + assert (0); + #endif + + + /* Check whether the omp_ipr_device -> hipDevice_t yields a valid device. */ + + hipDevice_t hip_dev = (int) omp_get_interop_int (obj, omp_ipr_device, &res); + assert (res == omp_irc_success); + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + hip_err = hipDeviceGetAttribute (&ivar, hipDeviceAttributeClockRate, hip_dev); + assert (hip_err == hipSuccess); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + hip_err = hipDeviceGetAttribute (&ivar, hipDeviceAttributeMaxGridDimX, hip_dev); + assert (hip_err == hipSuccess); + assert (ivar > 1024); + + + /* Check whether the omp_ipr_device_context -> hipCtx_t yields a context. */ + + hipCtx_t hip_ctx = (hipCtx_t) omp_get_interop_ptr (obj, omp_ipr_device_context, &res); + assert (res == omp_irc_success); + + /* Assume API Version > 0 for Nvidia, hipErrorNotSupported for AMD. */ + ivar = -99; + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" + hip_err = hipCtxGetApiVersion (hip_ctx, &ivar); + #pragma GCC diagnostic pop + + if (vendor_is_amd) + assert (hip_err == hipErrorNotSupported && ivar == -99); + else + { + assert (hip_err == hipSuccess); + assert (ivar > 0); + } + + + /* Check whether the omp_ipr_targetsync -> hipStream_t yields a stream. */ + + hipStream_t hip_sm = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + hipDevice_t dev_stream = 99; + hip_err = hipStreamGetDevice (hip_sm, &dev_stream); + assert (hip_err == hipSuccess); + assert (dev_stream == hip_dev); + + /* All jobs should have been completed (as there were none none) */ + hip_err = hipStreamQuery (hip_sm); + assert (hip_err == hipSuccess); + + #pragma omp interop destroy(obj) +} diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c new file mode 100644 index 0000000..53c05bd --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target gomp_hip_header_amd } */ +/* { dg-require-effective-target gomp_libhipblas } */ +/* { dg-additional-options "-lhipblas" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c new file mode 100644 index 0000000..0ea3133 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target gomp_libhipblas } */ +/* { dg-additional-options "-lhipblas" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c new file mode 100644 index 0000000..ed428c6 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-require-effective-target gomp_hip_header_nvidia } */ +/* { dg-additional-options "-lcublas -Wno-deprecated-declarations" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c new file mode 100644 index 0000000..1a31b30 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target openacc_libcublas } */ +/* { dg-additional-options "-lcublas" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 +#define USE_CUDA_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c new file mode 100644 index 0000000..f85c13b --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-additional-options "-lcublas" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas.h b/libgomp/testsuite/libgomp.c/interop-hipblas.h new file mode 100644 index 0000000..d7cb174 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas.h @@ -0,0 +1,240 @@ +/* Check whether hipBlas' daxpy works with an interop object. + daxpy(N, DA, DX, INCX, DY, INCY) + calculates (for DX = DY = 1): + DY(1:N) = DY(1:N) + DA * DX(1:N) + and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. + +Based on the interop example in OpenMP's example document */ + +/* Minimal check whether HIP works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + + +#include <assert.h> +#include <omp.h> +#include "../libgomp.c-c++-common/on_device_arch.h" + + +#if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER) + #ifdef __HIP_PLATFORM_NVIDIA__ + /* There seems to be an issue with hip/library_types.h including + CUDA's "library_types.h". Include CUDA's one explicitly here. + Could possibly worked around by using -isystem vs. -I. */ + #include <library_types.h> + + /* For some reasons, the following symbols do not seem to get + mapped from HIP to CUDA, causing link errors. */ + #define hipblasSetStream cublasSetStream_v2 + #define hipblasDaxpy cublasDaxpy_v2 + #define hipblasCreate cublasCreate_v2 + #endif + #include <hipblas/hipblas.h> + +#elif defined(__HIP_PLATFORM_AMD__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__" + #endif + + typedef enum + { + HIPBLAS_STATUS_SUCCESS = 0 + + } hipblasStatus_t; + + typedef struct ihipStream_t* hipStream_t; + typedef void* hipblasHandle_t; + + hipblasStatus_t hipblasCreate (hipblasHandle_t*); + hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t); + hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int); + +#else + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__" + #endif + + #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #include <cublas_v2.h> + + #else + /* Add a poor man's fallback declaration. */ + #if defined(USE_CUDA_FALLBACK_HEADER) + // no warning + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #elif !__has_include(<cuda_runtime.h>) + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #else + #warning "Using GCC's cuda.h as fallback for cublas_v2.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + } cublasStatus_t; + + typedef CUstream cudaStream_t; + typedef struct cublasContext* cublasHandle_t; + + #define cublasCreate cublasCreate_v2 + cublasStatus_t cublasCreate_v2 (cublasHandle_t *); + + #define cublasSetStream cublasSetStream_v2 + cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); + + #define cublasDaxpy cublasDaxpy_v2 + cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); + #endif + + #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS + #define hipblasStatus_t cublasStatus_t + #define hipStream_t cudaStream_t + #define hipblasHandle_t cublasHandle_t + #define hipblasCreate cublasCreate + #define hipblasSetStream cublasSetStream + #define hipblasDaxpy cublasDaxpy +#endif + +static int used_variant = 0; + +void +run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) +{ + used_variant = 1; + + omp_interop_rc_t res; + hipblasStatus_t stat; + + omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success && fr == omp_ifr_hip); + + hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + hipblasHandle_t handle; + stat = hipblasCreate (&handle); + assert (stat == HIPBLAS_STATUS_SUCCESS); + + stat = hipblasSetStream (handle, stream); + assert (stat == HIPBLAS_STATUS_SUCCESS); + + /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ + stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ; + assert (stat == HIPBLAS_STATUS_SUCCESS); +} + +#if defined(__HIP_PLATFORM_AMD__) +#pragma omp declare variant(run_hipBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("hip"))) +#elif defined(__HIP_PLATFORM_NVIDIA__) +#pragma omp declare variant(run_hipBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("hip"))) +#else + #error "wrong platform" +#endif + +void +run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) +{ + used_variant = 2; + + if (incx == 1 && incy == 1) + #pragma omp simd + for (int i = 0; i < n; i++) + dy[i] += da * dx[i]; + else + { + int ix = 0; + int iy = 0; + for (int i = 0; i < n; i++) + { + dy[iy] += da * dx[ix]; + ix += incx; + iy += incy; + } + } +} + + +void +run_test (int dev) +{ + constexpr int N = 1024; + + // A = {1,2,...,N} + // B = {-1, -2, ..., N} + // B' = daxpy (N, 3, A, incx=1, B, incy=1) + // = B + 3*A + // -> B' = {0, 2, 4, 6, ... } + + double A[N], B[N]; + double factor = 3.0; + for (int i = 0; i < N; i++) + { + A[i] = i; + B[i] = -i; + } + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target enter data device(dev) map(A, B) + } + + used_variant = 99; + #pragma omp dispatch device(dev) + run_daxpy (N, factor, A, 1, B, 1); + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target exit data device(dev) map(release: A) map(from: B) + + int tmp = omp_get_default_device (); + omp_set_default_device (dev); +#if defined(__HIP_PLATFORM_AMD__) + if (on_device_arch_gcn ()) +#else + if (on_device_arch_nvptx ()) +#endif + assert (used_variant == 1); + else + assert (used_variant == 2); + omp_set_default_device (tmp); + } + else + assert (used_variant == 2); + + for (int i = 0; i < N; i++) + assert (B[i] == 2*i); +} + +int +main () +{ + int ndev = omp_get_num_devices (); + + for (int dev = 0; dev <= ndev; dev++) + run_test (dev); + run_test (omp_initial_device); + + return 0; +} |