aboutsummaryrefslogtreecommitdiff
path: root/libgomp/testsuite/libgomp.c/interop-hipblas.h
blob: d7cb174b9e15a43676efddee59a7727db1627514 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/* Check whether hipBlas' daxpy works with an interop object.
     daxpy(N, DA, DX, INCX, DY, INCY)
   calculates (for DX = DY = 1):
     DY(1:N) =  DY(1:N) + DA * DX(1:N)
   and otherwise N array elements, taking every INCX-th or INCY-th one, repectively.

Based on the interop example in OpenMP's example document  */

/* Minimal check whether HIP works - by checking whether the API routines
   seem to work.  This includes a fallback if the header is not
   available.  */

#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
  #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
#endif

#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
  #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
#endif


#include <assert.h>
#include <omp.h>
#include "../libgomp.c-c++-common/on_device_arch.h"


#if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER)
  #ifdef __HIP_PLATFORM_NVIDIA__
    /* There seems to be an issue with hip/library_types.h including
       CUDA's "library_types.h". Include CUDA's one explicitly here.
       Could possibly worked around by using -isystem vs. -I.  */
    #include <library_types.h>

    /* For some reasons, the following symbols do not seem to get
       mapped from HIP to CUDA, causing link errors.  */
    #define hipblasSetStream cublasSetStream_v2
    #define hipblasDaxpy cublasDaxpy_v2
    #define hipblasCreate cublasCreate_v2
  #endif
  #include <hipblas/hipblas.h>

#elif defined(__HIP_PLATFORM_AMD__)
  /* Add a poor man's fallback declaration.  */
  #if !defined(USE_HIP_FALLBACK_HEADER)
    #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__"
  #endif

  typedef enum
  {
    HIPBLAS_STATUS_SUCCESS = 0

  } hipblasStatus_t;

  typedef struct ihipStream_t* hipStream_t;
  typedef void* hipblasHandle_t;

  hipblasStatus_t hipblasCreate (hipblasHandle_t*);
  hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t);
  hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int);

#else
  /* Add a poor man's fallback declaration.  */
  #if !defined(USE_HIP_FALLBACK_HEADER)
    #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__"
  #endif

  #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER)
    #include <cuda.h>
    #include <cudaTypedefs.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

  #else
    /* Add a poor man's fallback declaration.  */
    #if defined(USE_CUDA_FALLBACK_HEADER)
      // no warning
    #elif !__has_include(<cuda.h>)
      #warning "Using GCC's cuda.h as fallback for cuda.h"
    #elif !__has_include(<cudaTypedefs.h>)
      #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h"
    #elif !__has_include(<cuda_runtime.h>)
      #warning "Using GCC's cuda.h as fallback for cuda_runtime.h"
    #else
      #warning "Using GCC's cuda.h as fallback for cublas_v2.h"
    #endif
    #include "../../../include/cuda/cuda.h"

    typedef enum {
      CUBLAS_STATUS_SUCCESS = 0,
    } cublasStatus_t;

    typedef CUstream cudaStream_t;
    typedef struct cublasContext* cublasHandle_t;

    #define cublasCreate cublasCreate_v2
    cublasStatus_t cublasCreate_v2 (cublasHandle_t *);

    #define cublasSetStream cublasSetStream_v2
    cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t);

    #define cublasDaxpy cublasDaxpy_v2
    cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int);
  #endif

  #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
  #define hipblasStatus_t cublasStatus_t
  #define hipStream_t cudaStream_t
  #define hipblasHandle_t cublasHandle_t
  #define hipblasCreate cublasCreate
  #define hipblasSetStream cublasSetStream
  #define hipblasDaxpy cublasDaxpy
#endif

static int used_variant = 0;

void
run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj)
{
  used_variant = 1;

  omp_interop_rc_t res;
  hipblasStatus_t stat;

  omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res);
  assert (res == omp_irc_success && fr == omp_ifr_hip);

  hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res);
  assert (res == omp_irc_success);

  hipblasHandle_t handle;
  stat = hipblasCreate (&handle);
  assert (stat == HIPBLAS_STATUS_SUCCESS);

  stat = hipblasSetStream (handle, stream);
  assert (stat == HIPBLAS_STATUS_SUCCESS);

  /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space.  */
  stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ;
  assert (stat == HIPBLAS_STATUS_SUCCESS);
}

#if defined(__HIP_PLATFORM_AMD__)
#pragma omp declare variant(run_hipBlasdaxpy) \
                       match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \
                       adjust_args(need_device_ptr : dx, dy) \
                       append_args(interop(targetsync, prefer_type("hip")))
#elif defined(__HIP_PLATFORM_NVIDIA__) 
#pragma omp declare variant(run_hipBlasdaxpy) \
                       match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \
                       adjust_args(need_device_ptr : dx, dy) \
                       append_args(interop(targetsync, prefer_type("hip")))
#else
 #error "wrong platform"
#endif

void
run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy)
{
  used_variant = 2;

  if (incx == 1 && incy == 1)
    #pragma omp simd
    for (int i = 0; i < n; i++)
      dy[i] += da * dx[i];
  else
    {
      int ix = 0;
      int iy = 0;
      for (int i = 0; i < n; i++)
	{
	  dy[iy] += da * dx[ix];
	  ix += incx;
	  iy += incy;
	}
    }
}


void
run_test (int dev)
{
  constexpr int N = 1024;

  // A = {1,2,...,N}
  // B = {-1, -2, ..., N}
  // B' = daxpy (N, 3, A, incx=1, B, incy=1)
  //    = B + 3*A
  // -> B' = {0, 2, 4, 6, ... }

  double A[N], B[N];
  double factor = 3.0;
  for (int i = 0; i < N; i++)
    {
      A[i] = i;
      B[i] = -i;
    }

  if (dev != omp_initial_device && dev != omp_get_num_devices ())
    {
      #pragma omp target enter data device(dev) map(A, B)
    }

  used_variant = 99;
  #pragma omp dispatch device(dev)
    run_daxpy (N, factor, A, 1, B, 1);  

  if (dev != omp_initial_device && dev != omp_get_num_devices ())
    {
      #pragma omp target exit data device(dev) map(release: A) map(from: B)

      int tmp = omp_get_default_device ();
      omp_set_default_device (dev);
#if defined(__HIP_PLATFORM_AMD__)
      if (on_device_arch_gcn ())
#else
      if (on_device_arch_nvptx ())
#endif
	assert (used_variant == 1);
      else
	assert (used_variant == 2);
      omp_set_default_device (tmp);
    }
  else
    assert (used_variant == 2);

  for (int i = 0; i < N; i++)
    assert (B[i] == 2*i);
}

int   
main () 
{   
  int ndev = omp_get_num_devices ();

  for (int dev = 0; dev <= ndev; dev++)
    run_test (dev);
  run_test (omp_initial_device);  

  return 0;
}