aboutsummaryrefslogtreecommitdiff
path: root/openmp/libomptarget/include/omptarget.h
blob: 9a2bd1340e3b4d42fad6372ef6df1a000f48d686 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_H_
#define _OMPTARGET_H_

#include "Shared/APITypes.h"
#include "Shared/Environment.h"
#include "Shared/SourceInfo.h"

#include "OpenMP/InternalTypes.h"

#include <cassert>
#include <cstdint>
#include <deque>
#include <functional>
#include <stddef.h>
#include <stdint.h>
#include <type_traits>
#include <utility>

#include "llvm/ADT/SmallVector.h"

/// Generic success/failure status codes (used, e.g., as the documented return
/// values of AsyncInfoTy::synchronize()). OFFLOAD_FAIL is the bitwise
/// complement of zero, i.e. -1 as an int.
#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

/// Sentinel device id with value -1.
/// NOTE(review): presumably means "use the default device" - confirm against
/// the callers. Parenthesized like the status codes above so the expansion
/// always stays a single operand wherever it is pasted.
#define OFFLOAD_DEVICE_DEFAULT (-1)

// Don't format out enums and structs.
// clang-format off

/// Return codes of the __tgt_target_XXX public APIs.
enum __tgt_target_return_t : int {
  /// The offload executed successfully on a target device.
  OMP_TGT_SUCCESS = 0,
  /// The offload may not have executed on the requested target device. This
  /// scenario can be caused by the device being unavailable or unsupported,
  /// as described in the Execution Model in the specification. This status
  /// may not be used for a target device execution failure, which should be
  /// handled internally in libomptarget.
  /// The value is ~0, i.e. all bits set (-1 for the int underlying type).
  OMP_TGT_FAIL = ~0
};

/// Data attributes for each data reference used in an OpenMP target region.
///
/// Apart from OMP_TGT_MAPTYPE_NONE and OMP_TGT_MAPTYPE_MEMBER_OF, every
/// enumerator is a single distinct bit, so attributes can be OR-ed together.
/// The enum has no fixed underlying type; the largest enumerators need 64
/// bits.
/// NOTE(review): these are presumably the values carried in the int64_t
/// ArgTypes arrays of the __tgt_target_* entry points - confirm.
enum tgt_map_type {
  // No flags
  OMP_TGT_MAPTYPE_NONE            = 0x000,
  // copy data from host to device
  OMP_TGT_MAPTYPE_TO              = 0x001,
  // copy data from device to host
  OMP_TGT_MAPTYPE_FROM            = 0x002,
  // copy regardless of the reference count
  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
  // force unmapping of data
  OMP_TGT_MAPTYPE_DELETE          = 0x008,
  // map the pointer as well as the pointee
  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
  // pass device base address to kernel
  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
  // return base device address of mapped data
  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
  // private variable - not mapped
  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
  // copy by value - not mapped
  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
  // mapping is implicit
  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
  // hint to allocate the data close to the target device (the OpenMP `close`
  // map-type-modifier); this is not a copy request
  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
  // NOTE(review): bit 0x800 is not assigned in this enum; presumably it is
  // reserved - confirm before reusing it.
  // runtime error if not already allocated
  OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
  // use a separate reference counter so that the data cannot be unmapped within
  // the structured region
  // This is an OpenMP extension for the sake of OpenACC support.
  OMP_TGT_MAPTYPE_OMPX_HOLD       = 0x2000,
  // descriptor for non-contiguous target-update
  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
  // member of struct, member given by [16 MSBs] - 1
  // (this is a 16-bit mask covering the most significant bits, not a single
  // flag bit)
  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
};

/// Flags for offload entries.
/// Each enumerator occupies its own bit (0x01, 0x08, 0x10); the bits in
/// between are not assigned by this enum.
enum OpenMPOffloadingDeclareTargetFlags {
  /// Mark the entry global as having a 'link' attribute.
  OMP_DECLARE_TARGET_LINK = 0x01,
  /// Mark the entry global as being an indirectly callable function.
  OMP_DECLARE_TARGET_INDIRECT = 0x08,
  /// This is an entry corresponding to a requirement to be registered.
  OMP_REGISTER_REQUIRES = 0x10,
};

/// Kinds of target memory allocation. The enumerators are numbered
/// consecutively starting at 0 and are stored as int32_t.
/// NOTE(review): DEVICE/HOST/SHARED presumably correspond to the
/// llvm_omp_target_alloc_{device,host,shared} entry points declared in this
/// header - confirm against the plugin interface.
enum TargetAllocTy : int32_t {
  TARGET_ALLOC_DEVICE = 0,
  TARGET_ALLOC_HOST,
  TARGET_ALLOC_SHARED,
  TARGET_ALLOC_DEFAULT,
  /// The allocation will not block on other streams.
  TARGET_ALLOC_DEVICE_NON_BLOCKING,
};

/// Pre-built kernel arguments, shared through this header as an inline
/// variable. The initializers are positional, so their order must follow the
/// member order of KernelArgsTy, which is declared outside this file.
/// NOTE(review): judging by the name this is used to launch device
/// constructor/destructor kernels; confirm the meaning of each value against
/// the KernelArgsTy definition before changing any of them.
inline KernelArgsTy CTorDTorKernelArgs = {
    1,
    0,
    nullptr,
    nullptr,
    nullptr,
    nullptr,
    nullptr,
    nullptr,
    0,
    {0, 0},
    {1, 0, 0},
    {1, 0, 0},
    0};

// Forward declaration: this header only refers to DeviceTy by reference, so
// the full definition is not needed here.
struct DeviceTy;

/// The libomptarget wrapper around a __tgt_async_info object directly
/// associated with a libomptarget layer device. RAII semantics to avoid
/// mistakes: the destructor synchronizes any pending actions.
class AsyncInfoTy {
public:
  /// How synchronize() is expected to wait for pending operations.
  enum class SyncTy { BLOCKING, NON_BLOCKING };

private:
  /// Locations we used in (potentially) asynchronous calls which should live
  /// as long as this AsyncInfoTy object.
  /// NOTE(review): presumably a deque so that the references handed out by
  /// getVoidPtrLocation() stay valid while more locations are appended -
  /// confirm against the implementation.
  std::deque<void *> BufferLocations;

  /// Post-processing operations executed after a successful synchronization.
  /// \note the post-processing function should return OFFLOAD_SUCCESS or
  /// OFFLOAD_FAIL appropriately.
  using PostProcFuncTy = std::function<int()>;
  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;

  /// The wrapped async info object handed to the plugin interface.
  __tgt_async_info AsyncInfo;
  /// The device this object was created for.
  DeviceTy &Device;

public:
  /// Synchronization method to be used.
  SyncTy SyncType;

  /// Create an async info object for \p Device. Synchronization is blocking
  /// unless \p SyncType requests otherwise.
  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
      : Device(Device), SyncType(SyncType) {}

  /// Synchronize on destruction. The status returned by synchronize() is
  /// discarded here.
  ~AsyncInfoTy() { synchronize(); }

  /// Implicit conversion to the __tgt_async_info which is used in the
  /// plugin interface.
  operator __tgt_async_info *() { return &AsyncInfo; }

  /// Synchronize all pending actions.
  ///
  /// \note synchronization will be performed in a blocking or non-blocking
  /// manner, depending on the SyncType.
  ///
  /// \note if the operations are completed, the registered post-processing
  /// functions will be executed once and unregistered afterwards.
  ///
  /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
  int synchronize();

  /// Return a void* reference with a lifetime that is at least as long as this
  /// AsyncInfoTy object. The location can be used as intermediate buffer.
  void *&getVoidPtrLocation();

  /// Check if all asynchronous operations are completed.
  ///
  /// \note only a lightweight check. If needed, use synchronize() to query the
  /// status of AsyncInfo before checking.
  ///
  /// \returns true if there are no pending asynchronous operations, false
  /// otherwise.
  bool isDone() const;

  /// Add a new post-processing function to be executed after synchronization.
  ///
  /// \param[in] Function is a templated function (e.g., function pointers,
  /// lambdas, std::function) that is convertible to a PostProcFuncTy (i.e.,
  /// it must have int() as its function signature).
  template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
                  "Invalid post-processing function type. Please check "
                  "function signature!");
    // Function is a forwarding reference: forward it so that callables passed
    // as rvalues are moved into the stored std::function instead of copied.
    // Lvalue arguments are still copied, exactly as before.
    PostProcessingFunctions.emplace_back(std::forward<FuncTy>(Function));
  }

private:
  /// Run all the post-processing functions sequentially.
  ///
  /// \note after a successful execution, all previously registered functions
  /// are unregistered.
  ///
  /// \returns OFFLOAD_FAIL if any post-processing function failed,
  /// OFFLOAD_SUCCESS otherwise.
  int32_t runPostProcessing();

  /// Check if the internal asynchronous info queue is empty or not.
  ///
  /// \returns true if empty, false otherwise.
  bool isQueueEmpty() const;
};

// Wrapper for task stored async info objects.
//
// On construction the wrapper selects the AsyncInfoTy to use: either the
// blocking LocalAsyncInfo it owns itself, or a heap-allocated non-blocking
// AsyncInfoTy whose address is stored in the current task's async handle.
// On destruction a heap-allocated AsyncInfoTy is released only if all of its
// operations are done; otherwise it is left in the task untouched.
class TaskAsyncInfoWrapperTy {
  // Invalid GTID as defined by libomp; keep in sync
  static constexpr int KMP_GTID_DNE = -2;

  const int ExecThreadID = KMP_GTID_DNE;
  AsyncInfoTy LocalAsyncInfo;
  // Points at LocalAsyncInfo unless the constructor installs a task-stored
  // AsyncInfoTy.
  AsyncInfoTy *AsyncInfo = &LocalAsyncInfo;
  // Address of the task's async handle; stays null when the local blocking
  // async info is used.
  void **TaskAsyncInfoPtr = nullptr;

public:
  TaskAsyncInfoWrapperTy(DeviceTy &Device)
      : ExecThreadID(__kmpc_global_thread_num(nullptr)),
        LocalAsyncInfo(Device) {
    // If we failed to acquire the current global thread id, we cannot
    // re-enqueue the current task. Thus we should use the local blocking async
    // info.
    if (ExecThreadID == KMP_GTID_DNE)
      return;

    // Only tasks with an assigned task team can be re-enqueued and thus can
    // use the non-blocking synchronization scheme. Thus we should use the local
    // blocking async info, if we don't have one.
    if (!__kmpc_omp_has_task_team(ExecThreadID))
      return;

    // Acquire a pointer to the AsyncInfo stored inside the current task being
    // executed.
    TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID);

    // If we cannot acquire such pointer, fallback to using the local blocking
    // async info.
    if (!TaskAsyncInfoPtr)
      return;

    // When creating a new task async info, the task handle must always be
    // invalid. We must never overwrite any task async handle and there should
    // never be any valid handle stored inside the task at this point.
    assert((*TaskAsyncInfoPtr) == nullptr &&
           "Task async handle is not empty when dispatching new device "
           "operations. The handle was not cleared properly or "
           "__tgt_target_nowait_query should have been called!");

    // If no valid async handle is present, a new AsyncInfo will be allocated
    // and stored in the current task.
    AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
    *TaskAsyncInfoPtr = static_cast<void *>(AsyncInfo);
  }

  // Not copyable: a copy would duplicate AsyncInfo, which points either into
  // the source object's LocalAsyncInfo or at the heap object the source may
  // delete, so the copy's destructor would operate on memory it does not own.
  TaskAsyncInfoWrapperTy(const TaskAsyncInfoWrapperTy &) = delete;
  TaskAsyncInfoWrapperTy &operator=(const TaskAsyncInfoWrapperTy &) = delete;

  ~TaskAsyncInfoWrapperTy() {
    // Local async info destruction is automatically handled by ~AsyncInfoTy.
    if (AsyncInfo == &LocalAsyncInfo)
      return;

    // If there are device operations still pending, return immediately without
    // deallocating the handle.
    if (!AsyncInfo->isDone())
      return;

    // Delete the handle and unset it from the OpenMP task data.
    delete AsyncInfo;
    *TaskAsyncInfoPtr = nullptr;
  }

  // Implicit conversion to the selected AsyncInfoTy (local or task-stored).
  operator AsyncInfoTy &() { return *AsyncInfo; }
};

/// This struct is a record of non-contiguous information.
/// It consists of three consecutive 64-bit unsigned fields.
/// NOTE(review): presumably one record describes one dimension of a
/// non-contiguous transfer (see OMP_TGT_MAPTYPE_NON_CONTIG), and the layout
/// is shared with compiler-generated code - confirm before changing it.
struct __tgt_target_non_contig {
  uint64_t Offset;
  uint64_t Count;
  uint64_t Stride;
};

#ifdef __cplusplus
extern "C" {
#endif

/// OpenMP device-query routines; none of them takes an argument.
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
/// OpenMP device-memory routines. Every routine below takes the device
/// number(s) it operates on as its trailing int argument(s).
/// NOTE(review): the contracts are presumably those of the routines with the
/// same names in the OpenMP API specification - confirm for each routine.
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           int NumDims, const size_t *Volume,
                           const size_t *DstOffsets, const size_t *SrcOffsets,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);

/// Explicit target memory allocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
/// Each allocator has a deallocator with the matching suffix below.
void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);

/// Explicit target memory deallocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);

/// Dummy target so we have a symbol for generating host fallback.
/// Declared with an empty parameter list, i.e. it takes no arguments in C++.
void *llvm_omp_target_dynamic_shared_alloc();

/// Adds the clauses of the requires directives in a given file.
void __tgt_register_requires(int64_t Flags);

/// Initializes the runtime library.
void __tgt_rtl_init();

/// Deinitializes the runtime library.
void __tgt_rtl_deinit();

/// Adds a target shared library to the target execution image.
void __tgt_register_lib(__tgt_bin_desc *Desc);

/// Initializes all RTLs at once.
void __tgt_init_all_rtls();

/// Removes a target shared library from the target execution image.
void __tgt_unregister_lib(__tgt_bin_desc *Desc);

// Creates the host to target data mapping, stores it in the
// libomptarget.so internal structure (an entry in a stack of data maps) and
// passes the data to the device.
//
// All four variants share DeviceId, ArgNum, ArgsBase, Args, ArgSizes and
// ArgTypes. The *_nowait variants additionally take DepNum/DepList and
// NoAliasDepNum/NoAliasDepList; the *_mapper variants additionally take Loc,
// ArgNames and ArgMappers.
// NOTE(review): the Dep* parameters are presumably task dependence lists and
// the four arrays presumably hold ArgNum entries each - confirm.
void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                             void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
                                    void **ArgsBase, void **Args,
                                    int64_t *ArgSizes, int64_t *ArgTypes,
                                    int32_t DepNum, void *DepList,
                                    int32_t NoAliasDepNum,
                                    void *NoAliasDepList);
void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                    int32_t ArgNum, void **ArgsBase,
                                    void **Args, int64_t *ArgSizes,
                                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                                    void **ArgMappers);
void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// passes data from the target, release target memory and destroys the
// host-target mapping (top entry from the stack of data maps) created by
// the last __tgt_target_data_begin
void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                           void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
                                  void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  int32_t DepNum, void *DepList,
                                  int32_t NoAliasDepNum, void *NoAliasDepList);
void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                  int32_t ArgNum, void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  map_var_info_t *ArgNames, void **ArgMappers);
void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t depNum, void *depList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

/// Passes data to/from the target.
///
/// The parameter lists mirror the __tgt_target_data_begin variants: the
/// *_nowait variants additionally take DepNum/DepList and
/// NoAliasDepNum/NoAliasDepList; the *_mapper variants additionally take Loc,
/// ArgNames and ArgMappers.
void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                              void **Args, int64_t *ArgSizes,
                              int64_t *ArgTypes);
void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
                                     void **ArgsBase, void **Args,
                                     int64_t *ArgSizes, int64_t *ArgTypes,
                                     int32_t DepNum, void *DepList,
                                     int32_t NoAliasDepNum,
                                     void *NoAliasDepList);
void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                     int32_t ArgNum, void **ArgsBase,
                                     void **Args, int64_t *ArgSizes,
                                     int64_t *ArgTypes,
                                     map_var_info_t *ArgNames,
                                     void **ArgMappers);
void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// Performs the same actions as data_begin in case ArgNum is non-zero
// and initiates run of offloaded region on target platform; if ArgNum
// is non-zero, after the region execution is done it also performs the
// same action as data_end above. This function returns 0 if it was able to
// transfer the execution to a target and an int different from zero
// otherwise.
// NOTE(review): this function has no ArgNum parameter; the argument count
// referred to above is presumably carried inside Args - confirm against the
// KernelArgsTy definition.
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                        int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);

// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from task data of the current task being
// executed and tries to query for the completion of its operations. If the
// operations are still pending, the function returns immediately. If the
// operations are completed, all the post-processing procedures stored in the
// asynchronous context are executed and the context is removed from the task
// data.
// NOTE(review): AsyncHandle presumably points at the task's async handle,
// i.e. the same location TaskAsyncInfoWrapperTy stores its heap-allocated
// AsyncInfoTy into - confirm against the implementation.
void __tgt_target_nowait_query(void **AsyncHandle);

/// Executes a target kernel by replaying recorded kernel arguments and
/// device memory.
/// NOTE(review): TgtArgs and TgtOffsets presumably hold NumArgs entries each,
/// and DeviceMemorySize presumably gives the size in bytes of DeviceMemory -
/// confirm against the implementation.
int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                               void *DeviceMemory, int64_t DeviceMemorySize,
                               void **TgtArgs, ptrdiff_t *TgtOffsets,
                               int32_t NumArgs, int32_t NumTeams,
                               int32_t ThreadLimit, uint64_t LoopTripCount);

/// Sets the runtime's info flag to the given value.
/// NOTE(review): the exact meaning of the value (bit mask vs. level) is not
/// visible in this header - confirm against the implementation.
void __tgt_set_info_flag(uint32_t NewInfoLevel);

/// Prints information about the device identified by \p DeviceId.
/// NOTE(review): the meaning of the int return value is not visible in this
/// header - confirm against the implementation.
int __tgt_print_device_info(int64_t DeviceId);

/// Activates record/replay for the device identified by \p DeviceId.
/// \p ReqPtrArgOffset is passed by C++ reference, so although this function
/// has C linkage its declaration is only usable from C++.
/// NOTE(review): \p IsRecord presumably selects recording versus replaying
/// and \p ReqPtrArgOffset is presumably an output - confirm against the
/// implementation.
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                 void *VAddr, bool IsRecord, bool SaveOutput,
                                 uint64_t &ReqPtrArgOffset);

#ifdef __cplusplus
}
#endif

// EXTERN expands to `extern "C"` when compiled as C++ and to plain `extern`
// otherwise, so entities marked with it get C linkage in either language.
#ifdef __cplusplus
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif

#endif // _OMPTARGET_H_