/* Check whether hipBlas' daxpy works with an interop object. daxpy(N, DA, DX, INCX, DY, INCY) calculates (for DX = DY = 1): DY(1:N) = DY(1:N) + DA * DX(1:N) and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. Based on the interop example in OpenMP's example document */ /* Minimal check whether HIP works - by checking whether the API routines seem to work. This includes a fallback if the header is not available. */ #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" #endif #if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" #endif #include #include #include "../libgomp.c-c++-common/on_device_arch.h" #if __has_include() && (__has_include() || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER) #ifdef __HIP_PLATFORM_NVIDIA__ /* There seems to be an issue with hip/library_types.h including CUDA's "library_types.h". Include CUDA's one explicitly here. Could possibly worked around by using -isystem vs. -I. */ #include /* For some reasons, the following symbols do not seem to get mapped from HIP to CUDA, causing link errors. */ #define hipblasSetStream cublasSetStream_v2 #define hipblasDaxpy cublasDaxpy_v2 #define hipblasCreate cublasCreate_v2 #endif #include #elif defined(__HIP_PLATFORM_AMD__) /* Add a poor man's fallback declaration. */ #if !defined(USE_HIP_FALLBACK_HEADER) #warning "Using fallback declaration for for __HIP_PLATFORM_AMD__" #endif typedef enum { HIPBLAS_STATUS_SUCCESS = 0 } hipblasStatus_t; typedef struct ihipStream_t* hipStream_t; typedef void* hipblasHandle_t; hipblasStatus_t hipblasCreate (hipblasHandle_t*); hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t); hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int); #else /* Add a poor man's fallback declaration. */ #if !defined(USE_HIP_FALLBACK_HEADER) #warning "Using fallback declaration for for __HIP_PLATFORM_NVIDA__" #endif #if __has_include() && __has_include() && __has_include() && __has_include() && !defined(USE_CUDA_FALLBACK_HEADER) #include #include #include #include #else /* Add a poor man's fallback declaration. */ #if defined(USE_CUDA_FALLBACK_HEADER) // no warning #elif !__has_include() #warning "Using GCC's cuda.h as fallback for cuda.h" #elif !__has_include() #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" #elif !__has_include() #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" #else #warning "Using GCC's cuda.h as fallback for cublas_v2.h" #endif #include "../../../include/cuda/cuda.h" typedef enum { CUBLAS_STATUS_SUCCESS = 0, } cublasStatus_t; typedef CUstream cudaStream_t; typedef struct cublasContext* cublasHandle_t; #define cublasCreate cublasCreate_v2 cublasStatus_t cublasCreate_v2 (cublasHandle_t *); #define cublasSetStream cublasSetStream_v2 cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); #define cublasDaxpy cublasDaxpy_v2 cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); #endif #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS #define hipblasStatus_t cublasStatus_t #define hipStream_t cudaStream_t #define hipblasHandle_t cublasHandle_t #define hipblasCreate cublasCreate #define hipblasSetStream cublasSetStream #define hipblasDaxpy cublasDaxpy #endif static int used_variant = 0; void run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) { used_variant = 1; omp_interop_rc_t res; hipblasStatus_t stat; omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); assert (res == omp_irc_success && fr == omp_ifr_hip); hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); assert (res == omp_irc_success); hipblasHandle_t handle; stat = hipblasCreate (&handle); assert (stat == HIPBLAS_STATUS_SUCCESS); stat = hipblasSetStream (handle, stream); assert (stat == HIPBLAS_STATUS_SUCCESS); /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ; assert (stat == HIPBLAS_STATUS_SUCCESS); } #if defined(__HIP_PLATFORM_AMD__) #pragma omp declare variant(run_hipBlasdaxpy) \ match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \ adjust_args(need_device_ptr : dx, dy) \ append_args(interop(targetsync, prefer_type("hip"))) #elif defined(__HIP_PLATFORM_NVIDIA__) #pragma omp declare variant(run_hipBlasdaxpy) \ match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ adjust_args(need_device_ptr : dx, dy) \ append_args(interop(targetsync, prefer_type("hip"))) #else #error "wrong platform" #endif void run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) { used_variant = 2; if (incx == 1 && incy == 1) #pragma omp simd for (int i = 0; i < n; i++) dy[i] += da * dx[i]; else { int ix = 0; int iy = 0; for (int i = 0; i < n; i++) { dy[iy] += da * dx[ix]; ix += incx; iy += incy; } } } void run_test (int dev) { constexpr int N = 1024; // A = {1,2,...,N} // B = {-1, -2, ..., N} // B' = daxpy (N, 3, A, incx=1, B, incy=1) // = B + 3*A // -> B' = {0, 2, 4, 6, ... } double A[N], B[N]; double factor = 3.0; for (int i = 0; i < N; i++) { A[i] = i; B[i] = -i; } if (dev != omp_initial_device && dev != omp_get_num_devices ()) { #pragma omp target enter data device(dev) map(A, B) } used_variant = 99; #pragma omp dispatch device(dev) run_daxpy (N, factor, A, 1, B, 1); if (dev != omp_initial_device && dev != omp_get_num_devices ()) { #pragma omp target exit data device(dev) map(release: A) map(from: B) int tmp = omp_get_default_device (); omp_set_default_device (dev); #if defined(__HIP_PLATFORM_AMD__) if (on_device_arch_gcn ()) #else if (on_device_arch_nvptx ()) #endif assert (used_variant == 1); else assert (used_variant == 2); omp_set_default_device (tmp); } else assert (used_variant == 2); for (int i = 0; i < N; i++) assert (B[i] == 2*i); } int main () { int ndev = omp_get_num_devices (); for (int dev = 0; dev <= ndev; dev++) run_test (dev); run_test (omp_initial_device); return 0; }