Diffstat (limited to 'libgomp/testsuite/libgomp.c/interop-cublas-full.c')
-rw-r--r-- | libgomp/testsuite/libgomp.c/interop-cublas-full.c | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/libgomp/testsuite/libgomp.c/interop-cublas-full.c b/libgomp/testsuite/libgomp.c/interop-cublas-full.c
new file mode 100644
index 0000000..2df5277
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/interop-cublas-full.c
@@ -0,0 +1,176 @@
+/* { dg-require-effective-target openacc_cublas } */
+/* { dg-additional-options "-lcublas" } */
+
+/* NOTE: This file is also included by libgomp.c-c++-common/interop-cudablas-libonly.c
+   to test the fallback version.  */
+
+/* Check whether cuBLAS's daxpy works with an interop object.
+     daxpy (N, DA, DX, INCX, DY, INCY)
+   calculates (for INCX = INCY = 1):
+     DY(1:N) = DY(1:N) + DA * DX(1:N)
+   and otherwise updates N array elements, taking every INCX-th or INCY-th one, respectively.
+
+   Based on the interop example in OpenMP's example document.  */
+
+/* Minimal check whether CUDA works, by checking whether the API routines
+   seem to work.  This includes a fallback if the header is not
+   available.  */
+
+#include <assert.h>
+#include <omp.h>
+#include "../libgomp.c-c++-common/on_device_arch.h"
+
+
+#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER)
+  #include <cuda.h>
+  #include <cudaTypedefs.h>
+  #include <cuda_runtime.h>
+  #include <cublas_v2.h>
+
+#else
+  /* Add a poor man's fallback declaration.  */
+  #if USE_CUDA_FALLBACK_HEADER
+    // Don't warn.
+  #elif !__has_include(<cuda.h>)
+    #warning "Using GCC's cuda.h as fallback for cuda.h"
+  #elif !__has_include(<cudaTypedefs.h>)
+    #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h"
+  #elif !__has_include(<cuda_runtime.h>)
+    #warning "Using GCC's cuda.h as fallback for cuda_runtime.h"
+  #else
+    #warning "Using GCC's cuda.h as fallback for cublas_v2.h"
+  #endif
+  #include "../../../include/cuda/cuda.h"
+
+  typedef enum {
+    CUBLAS_STATUS_SUCCESS = 0,
+  } cublasStatus_t;
+
+  typedef CUstream cudaStream_t;
+  typedef struct cublasContext *cublasHandle_t;
+
+  #define cublasCreate cublasCreate_v2
+  cublasStatus_t cublasCreate_v2 (cublasHandle_t *);
+
+  #define cublasSetStream cublasSetStream_v2
+  cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t);
+
+  #define cublasDaxpy cublasDaxpy_v2
+  cublasStatus_t cublasDaxpy_v2 (cublasHandle_t, int, const double *, const double *, int, double *, int);
+#endif
+
+static int used_variant = 0;
+
+void
+run_cuBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj)
+{
+  used_variant = 1;
+
+  omp_interop_rc_t res;
+  cublasStatus_t stat;
+
+  omp_intptr_t fr = omp_get_interop_int (obj, omp_ipr_fr_id, &res);
+  assert (res == omp_irc_success && fr == omp_ifr_cuda);
+
+  cudaStream_t stream = (cudaStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res);
+  assert (res == omp_irc_success);
+
+  cublasHandle_t handle;
+  stat = cublasCreate (&handle);
+  assert (stat == CUBLAS_STATUS_SUCCESS);
+
+  stat = cublasSetStream (handle, stream);
+  assert (stat == CUBLAS_STATUS_SUCCESS);
+
+  /* 'da' can be in host or device space; 'dx' and 'dy' must be in device space.  */
+  stat = cublasDaxpy (handle, n, &da, dx, incx, dy, incy);
+  assert (stat == CUBLAS_STATUS_SUCCESS);
+}
+
+
+#pragma omp declare variant(run_cuBlasdaxpy) \
+                    match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \
+                    adjust_args(need_device_ptr : dx, dy) \
+                    append_args(interop(targetsync, prefer_type("cuda")))
+
+void
+run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy)
+{
+  used_variant = 2;
+
+  if (incx == 1 && incy == 1)
+    #pragma omp simd
+    for (int i = 0; i < n; i++)
+      dy[i] += da * dx[i];
+  else
+    {
+      int ix = 0;
+      int iy = 0;
+      for (int i = 0; i < n; i++)
+	{
+	  dy[iy] += da * dx[ix];
+	  ix += incx;
+	  iy += incy;
+	}
+    }
+}
+
+
+void
+run_test (int dev)
+{
+  constexpr int N = 1024;
+
+  // A  = {0, 1, 2, ..., N-1}
+  // B  = {0, -1, -2, ..., -(N-1)}
+  // B' = daxpy (N, 3, A, incx=1, B, incy=1)
+  //    = B + 3*A
+  // -> B' = {0, 2, 4, 6, ...}
+
+  double A[N], B[N];
+  double factor = 3.0;
+  for (int i = 0; i < N; i++)
+    {
+      A[i] = i;
+      B[i] = -i;
+    }
+
+  if (dev != omp_initial_device && dev != omp_get_num_devices ())
+    {
+      #pragma omp target enter data device(dev) map(A, B)
+    }
+
+  used_variant = 99;
+  #pragma omp dispatch device(dev)
+    run_daxpy (N, factor, A, 1, B, 1);
+
+  if (dev != omp_initial_device && dev != omp_get_num_devices ())
+    {
+      #pragma omp target exit data device(dev) map(release: A) map(from: B)
+
+      int tmp = omp_get_default_device ();
+      omp_set_default_device (dev);
+      if (on_device_arch_nvptx ())
+	assert (used_variant == 1);
+      else
+	assert (used_variant == 2);
+      omp_set_default_device (tmp);
+    }
+  else
+    assert (used_variant == 2);
+
+  for (int i = 0; i < N; i++)
+    assert (B[i] == 2*i);
+}
+
+int
+main ()
+{
+  int ndev = omp_get_num_devices ();
+
+  for (int dev = 0; dev <= ndev; dev++)
+    run_test (dev);
+  run_test (omp_initial_device);
+
+  return 0;
+}
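
For context: in the test above, the "omp dispatch" call lets the runtime create the interop object implicitly and append it to run_cuBlasdaxpy's argument list via append_args(interop(...)). The same foreign-runtime queries can also be written with an explicit OpenMP interop construct. The following is a minimal, illustrative sketch only (not part of this commit or of the testsuite), assuming a compiler and offloading runtime with OpenMP 5.1+ interop support and a CUDA device; on other configurations the object simply stays omp_interop_none.

#include <assert.h>
#include <omp.h>

int
main (void)
{
  omp_interop_t obj = omp_interop_none;

  /* Ask the runtime for an interop object backed by a CUDA stream on the
     default device; obj remains omp_interop_none if this is not possible.  */
  #pragma omp interop init(prefer_type("cuda"), targetsync : obj)

  if (obj != omp_interop_none)
    {
      omp_interop_rc_t res;

      /* Query the foreign-runtime id; on an Nvidia device this should be
	 omp_ifr_cuda, as checked in run_cuBlasdaxpy above.  */
      omp_intptr_t fr = omp_get_interop_int (obj, omp_ipr_fr_id, &res);
      assert (res == omp_irc_success && fr == omp_ifr_cuda);

      /* The targetsync property is the CUDA stream; a cuBLAS handle would be
	 bound to it via cublasSetStream, as in the test above.  */
      void *stream = omp_get_interop_ptr (obj, omp_ipr_targetsync, &res);
      assert (res == omp_irc_success);
      (void) stream;

      #pragma omp interop destroy(obj)
    }
  return 0;
}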