//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory-related support for the SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
#include <array>
#include <cassert>
#include <level_zero/ze_api.h>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "L0Defs.h"
#include "L0Trace.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
namespace llvm::omp::target::plugin {
// Forward declarations.
struct L0OptionsTy;
class L0DeviceTy;
class L0ContextTy;
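/// Upper bound on TARGET_ALLOC_* kinds, used to size the per-kind statistics
/// and pool arrays.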
constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;
struct DynamicMemHeapTy {
/// Base address memory is allocated from.
uintptr_t AllocBase = 0;
/// Minimal size served by the current heap.
size_t BlockSize = 0;
/// Max size served by the current heap.
size_t MaxSize = 0;
/// Number of available memory blocks.
uint32_t NumBlocks = 0;
/// Number of block descriptors.
uint32_t NumBlockDesc = 0;
/// Number of block counters.
uint32_t NumBlockCounter = 0;
/// List of memory block descriptors.
uint64_t *BlockDesc = nullptr;
/// List of memory block counters.
uint32_t *BlockCounter = nullptr;
};
struct DynamicMemPoolTy {
/// Location of device memory blocks.
void *PoolBase = nullptr;
/// Heap size common to all heaps.
size_t HeapSize = 0;
/// Number of heaps available.
uint32_t NumHeaps = 0;
/// Heap descriptors (using fixed-size array to simplify memory allocation).
DynamicMemHeapTy HeapDesc[8];
};
/// Memory allocation information used in memory allocation/deallocation.
struct MemAllocInfoTy {
/// Base address allocated from compute runtime.
void *Base = nullptr;
/// Allocation size known to users/libomptarget.
size_t ReqSize = 0;
/// Allocation size known to the plugin (can be larger than ReqSize).
size_t AllocSize = 0;
/// TARGET_ALLOC kind.
int32_t Kind = TARGET_ALLOC_DEFAULT;
/// Is the allocation from a pool?
bool InPool = false;
/// Is an implicit argument?
bool ImplicitArg = false;
MemAllocInfoTy() = default;
MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
bool InPool, bool ImplicitArg)
: Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
InPool(InPool), ImplicitArg(ImplicitArg) {}
};
/// Responsible for all activities involving memory allocation and
/// deallocation, including memory pool management and allocation bookkeeping.
class MemAllocatorTy {
/// Simple memory allocation statistics. Each counter tracks native GPU RT
/// allocations and pool allocations separately.
struct MemStatTy {
size_t Requested[2] = {0, 0}; // Requested bytes.
size_t Allocated[2] = {0, 0}; // Allocated bytes.
size_t Freed[2] = {0, 0}; // Freed bytes.
size_t InUse[2] = {0, 0}; // Current memory in use.
size_t PeakUse[2] = {0, 0}; // Peak bytes used.
size_t NumAllocs[2] = {0, 0}; // Number of allocations.
};
/// Memory pool which enables reuse of already allocated blocks:
/// -- Pool maintains a list of buckets each of which can allocate fixed-size
/// memory.
/// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
/// -- Each memory block can serve multiple fixed-size chunks requested by the
/// offload RT or the user.
/// -- Memory allocation falls back to GPU RT allocation when the pool size
/// (total memory used by pool) reaches a threshold.
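/// For example (assuming the default AllocMin of 64 bytes), a 100-byte
/// request is rounded up to the 128-byte bucket and served from a block
/// owned by that bucket.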
class MemPoolTy {
/// Memory block maintained in each bucket.
struct BlockTy {
/// Base address of this block.
uintptr_t Base = 0;
/// Size of the block.
size_t Size = 0;
/// Supported allocation size by this block.
size_t ChunkSize = 0;
/// Total number of slots.
uint32_t NumSlots = 0;
/// Maximum slot value, used to mark that no cached free slot is available.
static constexpr uint32_t MaxSlots =
std::numeric_limits<decltype(NumSlots)>::max();
/// Number of slots in use.
uint32_t NumUsedSlots = 0;
/// Cached available slot returned by the last dealloc() call.
uint32_t FreeSlot = MaxSlots;
/// Marker for the currently used slots.
std::vector<bool> UsedSlots;
BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
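// Split the block into NumSlots fixed-size chunks; each chunk occupies one
// slot tracked in UsedSlots.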
Base = reinterpret_cast<uintptr_t>(_Base);
Size = _Size;
ChunkSize = _ChunkSize;
NumSlots = Size / ChunkSize;
NumUsedSlots = 0;
UsedSlots.resize(NumSlots, /*InitValue=*/false);
}
/// Check if the current block is fully used.
bool isFull() const { return NumUsedSlots == NumSlots; }
/// Check if the given address belongs to the current block.
bool contains(void *Mem) const {
auto M = reinterpret_cast<uintptr_t>(Mem);
return M >= Base && M < Base + Size;
}
/// Allocate a single chunk from the block.
void *alloc();
/// Deallocate the given memory.
void dealloc(void *Mem);
}; // BlockTy
/// Allocation kind for the current pool.
int32_t AllocKind = TARGET_ALLOC_DEFAULT;
/// Access to the allocator.
MemAllocatorTy *Allocator = nullptr;
/// Minimum supported memory allocation size from pool.
size_t AllocMin = 1 << 6; // 64B
/// Maximum supported memory allocation size from pool.
size_t AllocMax = 0;
/// Allocation size when the pool needs to allocate a block.
size_t AllocUnit = 1 << 16; // 64KB
/// Capacity of each block in the buckets, which decides the number of
/// allocatable chunks per block. Each block in a bucket can serve at least
/// BlockCapacity chunks:
///   If ChunkSize * BlockCapacity <= AllocUnit, BlockSize = AllocUnit;
///   otherwise, BlockSize = ChunkSize * BlockCapacity.
/// In effect, this controls how much memory is over-allocated per block.
uint32_t BlockCapacity = 0;
/// Total memory allocated from GPU RT for this pool.
size_t PoolSize = 0;
/// Maximum allowed pool size. Allocation falls back to GPU RT allocation
/// when PoolSize reaches PoolSizeMax.
size_t PoolSizeMax = 0;
/// Small allocation size allowed in the pool even if pool size is over the
/// pool size limit.
size_t SmallAllocMax = 1024;
/// Small allocation pool size.
size_t SmallPoolSize = 0;
/// Small allocation pool size max (4MB).
size_t SmallPoolSizeMax = (4 << 20);
/// List of buckets.
std::vector<std::vector<BlockTy *>> Buckets;
/// List of bucket parameters.
std::vector<std::pair<size_t, size_t>> BucketParams;
/// Map from allocated pointer to corresponding block.
llvm::DenseMap<void *, BlockTy *> PtrToBlock;
/// Simple stats counting miss/hit in each bucket.
std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
/// Whether memory needs to be zero-initialized after L0 allocation.
bool ZeroInit = false;
/// Get bucket ID from the specified allocation size.
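/// Buckets serve power-of-two chunk sizes starting at AllocMin, so bucket I
/// covers requests up to (AllocMin << I).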
uint32_t getBucketId(size_t Size) {
uint32_t Count = 0;
for (size_t SZ = AllocMin; SZ < Size; Count++)
SZ <<= 1;
return Count;
}
public:
MemPoolTy() = default;
MemPoolTy(const MemPoolTy &) = delete;
MemPoolTy(MemPoolTy &&) = delete;
MemPoolTy &operator=(const MemPoolTy &) = delete;
MemPoolTy &operator=(const MemPoolTy &&) = delete;
~MemPoolTy() = default;
void printUsage();
/// Initialize pool with allocation kind, allocator, and user options.
Error init(int32_t Kind, MemAllocatorTy *Allocator,
const L0OptionsTy &Option);
/// Initialize the pool used for reduction scratch space.
Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
/// Initialize the small memory pool with fixed parameters.
Error init(MemAllocatorTy *Allocator);
/// Release resources used in the pool.
Error deinit();
/// Allocate the requested size of memory from this pool.
/// AllocSize is the chunk size internally used for the returned memory.
Expected<void *> alloc(size_t Size, size_t &AllocSize);
/// Deallocate the specified memory and return the size of the deallocated
/// chunk.
size_t dealloc(void *Ptr);
}; // MemPoolTy
/// Allocation information maintained in the plugin.
class MemAllocInfoMapTy {
/// Map from allocated pointer to allocation information.
std::map<void *, MemAllocInfoTy> Map;
/// Map from target alloc kind to number of implicit arguments.
std::array<uint32_t, MaxMemKind> NumImplicitArgs;
public:
/// Add allocation information to the map.
void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
int32_t Kind, bool InPool = false, bool ImplicitArg = false);
/// Remove allocation information for the given memory location.
bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
/// Find allocation information for the given memory location.
const MemAllocInfoTy *find(void *Ptr) const {
auto AllocInfo = Map.find(Ptr);
if (AllocInfo == Map.end())
return nullptr;
else
return &AllocInfo->second;
}
/// Check if [Ptr, Ptr+Size) is fully contained in an allocation recorded in
/// the map.
bool contains(const void *Ptr, size_t Size) const {
if (Map.size() == 0)
return false;
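// Find the entry with the largest base address that is still <= Ptr.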
auto I = Map.upper_bound(const_cast<void *>(Ptr));
if (I == Map.begin())
return false;
--I;
uintptr_t PtrAsInt = reinterpret_cast<uintptr_t>(Ptr);
uintptr_t MapBase = reinterpret_cast<uintptr_t>(I->first);
uintptr_t MapSize = static_cast<uintptr_t>(I->second.ReqSize);
bool Ret = MapBase <= PtrAsInt && PtrAsInt + Size <= MapBase + MapSize;
return Ret;
}
/// Returns the number of implicit arguments for the specified allocation
/// kind.
size_t getNumImplicitArgs(int32_t Kind) {
assert(Kind >= 0 && Kind < MaxMemKind &&
"Invalid target allocation kind");
return NumImplicitArgs[Kind];
}
}; // MemAllocInfoMapTy
/// L0 context to use.
const L0ContextTy *L0Context = nullptr;
/// L0 device to use.
L0DeviceTy *Device = nullptr;
/// Whether the device supports large memory allocation.
bool SupportsLargeMem = false;
/// Cached max alloc size supported by device.
uint64_t MaxAllocSize;
/// Map from allocation kind to memory statistics.
std::array<MemStatTy, MaxMemKind> Stats;
/// Map from allocation kind to memory pool.
std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
/// Memory pool dedicated to reduction scratch space.
std::unique_ptr<MemPoolTy> ReductionPool;
/// Memory pool dedicated to reduction counters.
std::unique_ptr<MemPoolTy> CounterPool;
/// Allocation information map.
MemAllocInfoMapTy AllocInfo;
/// RTL-owned memory that needs to be freed automatically.
std::vector<void *> MemOwned;
/// Lock protection.
std::mutex Mtx;
/// Allocator only supports host memory.
bool IsHostMem = false;
/// Internal deallocation function to be called while already holding the
/// Mtx lock.
Error deallocLocked(void *Ptr);
/// Allocate memory from L0 GPU RT.
Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
/// Deallocate memory from L0 GPU RT.
Error deallocFromL0(void *Ptr);
/// We use an over-allocation workaround to support target pointers with
/// offsets; a positive "ActiveSize" is passed in such cases so that debug
/// logging reports the correct size.
Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
size_t ActiveSize = 0) {
auto MemOrErr = allocFromL0(Size, Align, Kind);
if (!MemOrErr)
return MemOrErr;
size_t LoggedSize = ActiveSize ? ActiveSize : Size;
log(LoggedSize, Size, Kind);
return MemOrErr;
}
/// Log memory allocation/deallocation.
void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
if (Kind < 0 || Kind >= MaxMemKind)
return; // Stat is disabled.
auto &ST = Stats[Kind];
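// Index 0 tracks native GPU RT allocations, index 1 tracks pool allocations.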
int32_t I = Pool ? 1 : 0;
if (ReqSize > 0) {
ST.Requested[I] += ReqSize;
ST.Allocated[I] += Size;
ST.InUse[I] += Size;
ST.NumAllocs[I]++;
} else {
ST.Freed[I] += Size;
ST.InUse[I] -= Size;
}
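// (std::max) is parenthesized to avoid clashing with a function-style max()
// macro.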
ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
}
/// Perform copy operation.
Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
/// Perform memory fill operation.
Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
/// Allocate memory with the specified information from a memory pool.
Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
intptr_t Offset, bool UserAlloc,
bool DevMalloc, uint32_t MemAdvice,
AllocOptionTy AllocOpt);
/// Deallocate memory from memory pool.
Error deallocFromPool(void *Ptr) {
std::lock_guard<std::mutex> Lock(Mtx);
return deallocLocked(Ptr);
}
public:
MemAllocatorTy()
: MaxAllocSize(std::numeric_limits<decltype(MaxAllocSize)>::max()) {}
MemAllocatorTy(const MemAllocatorTy &) = delete;
MemAllocatorTy(MemAllocatorTy &&) = delete;
MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
~MemAllocatorTy() = default;
Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
void updateMaxAllocSize(L0DeviceTy &L0Device);
/// Release resources and report statistics if requested.
Error deinit();
/// Allocate memory with the specified information from a memory pool.
Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
intptr_t Offset, bool UserAlloc, bool DevMalloc,
uint32_t MemAdvice, AllocOptionTy AllocOpt) {
return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
MemAdvice, AllocOpt);
}
/// Deallocate memory.
Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }
/// Check if the given memory location and size belong to any allocated
/// memory.
bool contains(const void *Ptr, size_t Size) {
std::lock_guard<std::mutex> Lock(Mtx);
return AllocInfo.contains(Ptr, Size);
}
/// Get allocation information for the specified memory location.
const MemAllocInfoTy *getAllocInfo(void *Ptr) {
std::lock_guard<std::mutex> Lock(Mtx);
return AllocInfo.find(Ptr);
}
/// Get kernel indirect access flags using implicit argument info.
ze_kernel_indirect_access_flags_t getIndirectFlags() {
std::lock_guard<std::mutex> Lock(Mtx);
ze_kernel_indirect_access_flags_t Ret = 0;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
return Ret;
}
}; // MemAllocatorTy
/// Simple generic wrapper to reuse objects.
/// Objects must have an accessible zero-argument constructor.
template <class ObjTy> class ObjPool {
// Protection.
std::unique_ptr<std::mutex> Mtx;
// List of Objects.
std::list<ObjTy *> Objects;
public:
ObjPool() { Mtx.reset(new std::mutex); }
ObjPool(const ObjPool &) = delete;
ObjPool(ObjPool &&) = delete;
ObjPool &operator=(const ObjPool &) = delete;
ObjPool &operator=(const ObjPool &&) = delete;
ObjTy *get() {
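// Peek without the lock first, then re-check under the lock before taking
// a cached object; fall through to allocating a new one otherwise.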
if (!Objects.empty()) {
std::lock_guard<std::mutex> Lock(*Mtx);
if (!Objects.empty()) {
const auto Ret = Objects.back();
Objects.pop_back();
return Ret;
}
}
return new ObjTy();
}
void release(ObjTy *obj) {
std::lock_guard<std::mutex> Lock(*Mtx);
Objects.push_back(obj);
}
~ObjPool() {
for (auto Object : Objects)
delete Object;
}
};
/// Common event pool used in the plugin. This event pool assumes all events
/// from the pool are host-visible and use the same event pool flag.
class EventPoolTy {
/// Size of L0 event pool created on demand.
size_t PoolSize = 64;
/// Context of the events.
ze_context_handle_t Context = nullptr;
/// Additional event pool flags common to this pool.
uint32_t Flags = 0;
/// Protection.
std::unique_ptr<std::mutex> Mtx;
/// List of created L0 event pools.
std::list<ze_event_pool_handle_t> Pools;
/// List of free L0 events.
std::list<ze_event_handle_t> Events;
#ifdef OMPT_SUPPORT
/// Event to OMPT record map. The timestamp information is recorded to the
/// OMPT record before the event is recycled.
std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
#endif // OMPT_SUPPORT
public:
/// Initialize context, flags, and mutex.
Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
Context = ContextIn;
Flags = FlagsIn;
Mtx.reset(new std::mutex);
return Plugin::success();
}
/// Destroys L0 resources.
Error deinit() {
for (auto E : Events)
CALL_ZE_RET_ERROR(zeEventDestroy, E);
for (auto P : Pools)
CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
return Plugin::success();
}
/// Get a free event from the pool.
Expected<ze_event_handle_t> getEvent();
/// Return an event to the pool.
Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
};
/// Staging buffer.
/// A single staging buffer is not enough when batching is enabled since there
/// can be multiple pending copy operations.
class StagingBufferTy {
/// Context for L0 calls.
ze_context_handle_t Context = nullptr;
/// Max allowed size for staging buffer.
size_t Size = L0StagingBufferSize;
/// Number of buffers allocated together.
size_t Count = L0StagingBufferCount;
/// Buffer list that grows by Count buffers whenever more space is required.
llvm::SmallVector<void *> Buffers;
/// Offset of the next available location in the buffers.
size_t Offset = 0;
Expected<void *> addBuffers() {
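// Allocate Count staging buffers of Size bytes each as a single host
// allocation and return its base address.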
ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
nullptr, 0};
void *Ret = nullptr;
size_t AllocSize = Size * Count;
CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
L0DefaultAlignment, &Ret);
Buffers.push_back(Ret);
return Ret;
}
public:
StagingBufferTy() = default;
StagingBufferTy(const StagingBufferTy &) = delete;
StagingBufferTy(StagingBufferTy &&) = delete;
StagingBufferTy &operator=(const StagingBufferTy &) = delete;
StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
~StagingBufferTy() = default;
Error clear() {
for (auto Ptr : Buffers)
CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
Context = nullptr;
return Plugin::success();
}
bool initialized() const { return Context != nullptr; }
void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
Context = ContextIn;
Size = SizeIn;
Count = CountIn;
}
void reset() { Offset = 0; }
/// Always return the first buffer.
Expected<void *> get() {
if (Size == 0 || Count == 0)
return nullptr;
return Buffers.empty() ? addBuffers() : Buffers.front();
}
/// Return the next available buffer.
Expected<void *> getNext() {
void *Ret = nullptr;
if (Size == 0 || Count == 0)
return Ret;
size_t AllocSize = Size * Count;
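// All buffers allocated so far are exhausted once Offset reaches
// Buffers.size() * AllocSize; grow by another group of Count buffers.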
bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
if (NeedToGrow) {
auto PtrOrErr = addBuffers();
if (!PtrOrErr)
return PtrOrErr.takeError();
Ret = *PtrOrErr;
} else
Ret = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
if (!Ret)
return nullptr;
Offset += Size;
return Ret;
}
/// Return either a fixed buffer or next buffer.
Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
};
} // namespace llvm::omp::target::plugin
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H