aboutsummaryrefslogtreecommitdiff
path: root/libgomp/testsuite/libgomp.c/interop-hsa.c
blob: cf8bc90bb9c07df7f4d249e5850780c14f7ab4f7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/* { dg-additional-options "-ldl" } */
/* { dg-require-effective-target offload_device_gcn } */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <assert.h>
#include <dlfcn.h>
#include "../../../include/hsa.h"
#include "../../config/gcn/libgomp-gcn.h"

#define STACKSIZE (100 * 1024)
#define HEAPSIZE (10 * 1024 * 1024)
#define ARENASIZE HEAPSIZE

/* This code fragment must be optimized or else the host-fallback kernel has
 * invalid ASM inserts.  The rest of the file can be compiled safely at -O0.  */
#pragma omp declare target
uintptr_t __attribute__((optimize("O1")))
get_kernel_ptr ()
{
  uintptr_t val;
  if (!omp_is_initial_device ())
    /* "main._omp_fn.0" is the name GCC gives the first OpenMP target
     * region in the "main" function.
     * The ".kd" suffix is added by the LLVM assembler when it creates the
     * kernel meta-data, and this is what we need to launch a kernel.  */
    asm ("s_getpc_b64 %0\n\t"
	 "s_add_u32 %L0, %L0, main._omp_fn.0.kd@rel32@lo+4\n\t"
	 "s_addc_u32 %H0, %H0, main._omp_fn.0.kd@rel32@hi+4"
	 : "=Sg"(val));
  return val;
}
#pragma omp end declare target

int
main(int argc, char** argv)
{

  /* Load the HSA runtime DLL.  */
  void *hsalib = dlopen ("libhsa-runtime64.so.1", RTLD_LAZY);
  assert (hsalib);

  hsa_status_t (*hsa_signal_create) (hsa_signal_value_t initial_value,
				     uint32_t num_consumers,
				     const hsa_agent_t *consumers,
				     hsa_signal_t *signal)
    = dlsym (hsalib, "hsa_signal_create");
  assert (hsa_signal_create);

  uint64_t (*hsa_queue_load_write_index_relaxed) (const hsa_queue_t *queue)
    = dlsym (hsalib, "hsa_queue_load_write_index_relaxed");
  assert (hsa_queue_load_write_index_relaxed);

  void (*hsa_signal_store_relaxed) (hsa_signal_t signal,
				    hsa_signal_value_t value)
    = dlsym (hsalib, "hsa_signal_store_relaxed");
  assert (hsa_signal_store_relaxed);

  hsa_signal_value_t (*hsa_signal_wait_relaxed) (hsa_signal_t signal,
						 hsa_signal_condition_t condition,
						 hsa_signal_value_t compare_value,
						 uint64_t timeout_hint,
						 hsa_wait_state_t wait_state_hint)
    = dlsym (hsalib, "hsa_signal_wait_relaxed");
  assert (hsa_signal_wait_relaxed);

  void (*hsa_queue_store_write_index_relaxed) (const hsa_queue_t *queue,
					       uint64_t value)
    = dlsym (hsalib, "hsa_queue_store_write_index_relaxed");
  assert (hsa_queue_store_write_index_relaxed);

  hsa_status_t (*hsa_signal_destroy) (hsa_signal_t signal)
    = dlsym (hsalib, "hsa_signal_destroy");
  assert (hsa_signal_destroy);

  /* Set up the device data environment.  */
  int test_data_value = 0;
#pragma omp target enter data map(test_data_value)

  /* Get the interop details.  */
  int device_num = omp_get_default_device();
  hsa_agent_t *gpu_agent;
  hsa_queue_t *hsa_queue = NULL;

  omp_interop_t interop = omp_interop_none;
#pragma omp interop init(target, targetsync, prefer_type("hsa"): interop) device(device_num)
  assert (interop != omp_interop_none);

  omp_interop_rc_t retcode;
  omp_interop_fr_t fr = omp_get_interop_int (interop, omp_ipr_fr_id, &retcode);
  assert (retcode == omp_irc_success);
  assert (fr == omp_ifr_hsa);

  gpu_agent = omp_get_interop_ptr(interop, omp_ipr_device, &retcode);
  assert (retcode == omp_irc_success);

  hsa_queue = omp_get_interop_ptr(interop, omp_ipr_targetsync, &retcode);
  assert (retcode == omp_irc_success);
  assert (hsa_queue);

  /* Call an offload kernel via OpenMP/libgomp.
   *
   * This kernel serves two purposes:
   *   1) Lookup the device-side load-address of itself (thus avoiding the
   *   need to access the libgomp internals).
   *   2) Count how many times it is called.
   * We then call it once using OpenMP, and once manually, and check
   * the counter reads "2".  */
  uint64_t kernel_object = 0;
#pragma omp target map(from:kernel_object) map(present,alloc:test_data_value)
  {
    kernel_object = get_kernel_ptr ();
    ++test_data_value;
  }

  assert (kernel_object != 0);

  /* Configure the same kernel to run again, using HSA manually this time.  */
  hsa_status_t status;
  hsa_signal_t signal;
  status = hsa_signal_create(1, 0, NULL, &signal);
  assert (status == HSA_STATUS_SUCCESS);

  /* The kernel is built by GCC for OpenMP, so we need to pass the same
   * data pointers that libgomp would pass in.  */
  struct {
    uintptr_t test_data_value;
    uintptr_t kernel_object;
  } tgtaddrs;

#pragma omp target data use_device_addr(test_data_value)
  {
    tgtaddrs.test_data_value = (uintptr_t)&test_data_value;
    tgtaddrs.kernel_object = (uintptr_t)omp_target_alloc (8, device_num);
  }

  /* We also need to duplicate the launch ABI used by plugin-gcn.c.  */
  struct kernargs_abi args;    /* From libgomp-gcn.h.  */
  args.dummy1 = (int64_t)&tgtaddrs;
  args.out_ptr = (int64_t)malloc (sizeof (struct output)); /* Host side.  */
  args.heap_ptr = (int64_t)omp_target_alloc (HEAPSIZE, device_num);
  args.arena_ptr = (int64_t)omp_target_alloc (ARENASIZE, device_num);
  args.stack_ptr = (int64_t)omp_target_alloc (STACKSIZE, device_num);
  args.arena_size_per_team = ARENASIZE;
  args.stack_size_per_thread = STACKSIZE;

  /* Build the HSA dispatch packet, and insert it into the queue.  */
  uint64_t packet_id = hsa_queue_load_write_index_relaxed (hsa_queue);
  const uint32_t queueMask = hsa_queue->size - 1;
  hsa_kernel_dispatch_packet_t *dispatch_packet =
    &(((hsa_kernel_dispatch_packet_t *)
	  (hsa_queue->base_address))[packet_id & queueMask]);

  dispatch_packet->setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  dispatch_packet->workgroup_size_x = 1;
  dispatch_packet->workgroup_size_y = 64;
  dispatch_packet->workgroup_size_z = 1;
  dispatch_packet->grid_size_x = 1;
  dispatch_packet->grid_size_y = 64;
  dispatch_packet->grid_size_z = 1;
  dispatch_packet->completion_signal = signal;
  dispatch_packet->kernel_object = kernel_object;
  dispatch_packet->kernarg_address = &args;
  dispatch_packet->private_segment_size = 0;
  dispatch_packet->group_segment_size = 1536;

  uint16_t header = 0;
  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  /* Finish writing the packet header with an atomic release.  */
  __atomic_store_n((uint16_t*)dispatch_packet, header, __ATOMIC_RELEASE);

  hsa_queue_store_write_index_relaxed (hsa_queue, packet_id + 1);
  
  ;/* Run the kernel and wait for it to complete.  */
  hsa_signal_store_relaxed(hsa_queue->doorbell_signal, packet_id);
  while (hsa_signal_wait_relaxed(signal, HSA_SIGNAL_CONDITION_LT, 1,
	UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  /* Clean up HSA.  */
  hsa_signal_destroy(signal);
  free ((void*)args.out_ptr);
  omp_target_free ((void*)args.heap_ptr, device_num);
  omp_target_free ((void*)args.arena_ptr, device_num);
  omp_target_free ((void*)args.stack_ptr, device_num);
  omp_target_free ((void*)tgtaddrs.kernel_object, device_num);

  /* Clean up OpenMP.  */
  #pragma omp interop destroy(interop)

  /* Bring the data back from the device.  */
#pragma omp target exit data map(test_data_value)

  /* Ensure the kernel was called twice.  Once by OpenMP, once by HSA.  */
  assert (test_data_value == 2);

  return 0;
}