//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory-related support for the SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//

#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

#include "L0Defs.h"
#include "L0Trace.h"

namespace llvm::omp::target::plugin {

// Forward declarations.
struct L0OptionsTy;
class L0DeviceTy;
class L0ContextTy;

constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;

struct DynamicMemHeapTy {
  /// Base address memory is allocated from.
  uintptr_t AllocBase = 0;
  /// Minimal size served by the current heap.
  size_t BlockSize = 0;
  /// Max size served by the current heap.
  size_t MaxSize = 0;
  /// Available memory blocks.
  uint32_t NumBlocks = 0;
  /// Number of block descriptors.
  uint32_t NumBlockDesc = 0;
  /// Number of block counters.
  uint32_t NumBlockCounter = 0;
  /// List of memory block descriptors.
  uint64_t *BlockDesc = nullptr;
  /// List of memory block counters.
  uint32_t *BlockCounter = nullptr;
};

struct DynamicMemPoolTy {
  /// Location of device memory blocks.
  void *PoolBase = nullptr;
  /// Heap size common to all heaps.
  size_t HeapSize = 0;
  /// Number of heaps available.
  uint32_t NumHeaps = 0;
  /// Heap descriptors (using a fixed-size array to simplify memory
  /// allocation).
  DynamicMemHeapTy HeapDesc[8];
};

/// Memory allocation information used in memory allocation/deallocation.
struct MemAllocInfoTy {
  /// Base address allocated from the compute runtime.
  void *Base = nullptr;
  /// Allocation size known to users/libomptarget.
  size_t ReqSize = 0;
  /// Allocation size known to the plugin (can be larger than ReqSize).
  size_t AllocSize = 0;
  /// TARGET_ALLOC kind.
  int32_t Kind = TARGET_ALLOC_DEFAULT;
  /// Is the allocation from a pool?
  bool InPool = false;
  /// Is this an implicit argument?
  bool ImplicitArg = false;

  MemAllocInfoTy() = default;
  MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
                 bool InPool, bool ImplicitArg)
      : Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
        InPool(InPool), ImplicitArg(ImplicitArg) {}
};
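
// Illustration only (hypothetical values): for a 100-byte device allocation
// served from a pool with a 32-byte over-allocation used to honor a pointer
// offset, the plugin would record something like
//
//   MemAllocInfoTy Info(/*Base=*/BasePtr, /*ReqSize=*/100, /*AllocSize=*/132,
//                       /*Kind=*/TARGET_ALLOC_DEVICE, /*InPool=*/true,
//                       /*ImplicitArg=*/false);
//
// where ReqSize is what libomptarget asked for and AllocSize is what the
// plugin actually reserved.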
/// Responsible for all activities involving memory allocation/deallocation.
/// It handles memory pool management and memory allocation bookkeeping.
class MemAllocatorTy {
  /// Simple memory allocation statistics. Maintains numbers for pool
  /// allocation and GPU RT allocation.
  struct MemStatTy {
    size_t Requested[2] = {0, 0}; // Requested bytes.
    size_t Allocated[2] = {0, 0}; // Allocated bytes.
    size_t Freed[2] = {0, 0};     // Freed bytes.
    size_t InUse[2] = {0, 0};     // Current memory in use.
    size_t PeakUse[2] = {0, 0};   // Peak bytes used.
    size_t NumAllocs[2] = {0, 0}; // Number of allocations.
  };

  /// Memory pool which enables reuse of already allocated blocks:
  /// -- The pool maintains a list of buckets each of which can allocate
  ///    fixed-size memory.
  /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
  /// -- Each memory block can serve multiple fixed-size chunks requested by
  ///    the offload RT or the user.
  /// -- Memory allocation falls back to GPU RT allocation when the pool size
  ///    (total memory used by the pool) reaches a threshold.
  class MemPoolTy {
    /// Memory block maintained in each bucket.
    struct BlockTy {
      /// Base address of this block.
      uintptr_t Base = 0;
      /// Size of the block.
      size_t Size = 0;
      /// Allocation size supported by this block.
      size_t ChunkSize = 0;
      /// Total number of slots.
      uint32_t NumSlots = 0;
      /// Maximum slot value.
      static constexpr uint32_t MaxSlots =
          std::numeric_limits<uint32_t>::max();
      /// Number of slots in use.
      uint32_t NumUsedSlots = 0;
      /// Cached available slot returned by the last dealloc() call.
      uint32_t FreeSlot = MaxSlots;
      /// Markers for the currently used slots.
      std::vector<bool> UsedSlots;

      BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
        Base = reinterpret_cast<uintptr_t>(_Base);
        Size = _Size;
        ChunkSize = _ChunkSize;
        NumSlots = Size / ChunkSize;
        NumUsedSlots = 0;
        UsedSlots.resize(NumSlots, /*InitValue=*/false);
      }

      /// Check if the current block is fully used.
      bool isFull() const { return NumUsedSlots == NumSlots; }
      /// Check if the given address belongs to the current block.
      bool contains(void *Mem) const {
        auto M = reinterpret_cast<uintptr_t>(Mem);
        return M >= Base && M < Base + Size;
      }
      /// Allocate a single chunk from the block.
      void *alloc();
      /// Deallocate the given memory.
      void dealloc(void *Mem);
    }; // BlockTy

    /// Allocation kind for the current pool.
    int32_t AllocKind = TARGET_ALLOC_DEFAULT;
    /// Access to the allocator.
    MemAllocatorTy *Allocator = nullptr;
    /// Minimum memory allocation size supported by the pool.
    size_t AllocMin = 1 << 6; // 64B
    /// Maximum memory allocation size supported by the pool.
    size_t AllocMax = 0;
    /// Allocation size when the pool needs to allocate a block.
    size_t AllocUnit = 1 << 16; // 64KB
    /// Capacity of each block in the buckets, which decides the number of
    /// allocatable chunks from the block. Each block in a bucket can serve
    /// at least BlockCapacity chunks.
    /// If ChunkSize * BlockCapacity <= AllocUnit
    ///   BlockSize = AllocUnit
    /// Otherwise,
    ///   BlockSize = ChunkSize * BlockCapacity
    /// This simply means how much memory is over-allocated.
    uint32_t BlockCapacity = 0;
    /// Total memory allocated from GPU RT for this pool.
    size_t PoolSize = 0;
    /// Maximum allowed pool size. Allocation falls back to GPU RT allocation
    /// when PoolSize reaches PoolSizeMax.
    size_t PoolSizeMax = 0;
    /// Small allocation size allowed in the pool even if the pool size is
    /// over the pool size limit.
    size_t SmallAllocMax = 1024;
    /// Small allocation pool size.
    size_t SmallPoolSize = 0;
    /// Small allocation pool size max (4MB).
    size_t SmallPoolSizeMax = (4 << 20);
    /// List of buckets.
    std::vector<std::vector<BlockTy *>> Buckets;
    /// List of bucket parameters.
    std::vector<std::pair<size_t, size_t>> BucketParams;
    /// Map from allocated pointer to corresponding block.
    llvm::DenseMap<void *, BlockTy *> PtrToBlock;
    /// Simple stats counting miss/hit in each bucket.
    std::vector<std::pair<size_t, size_t>> BucketStats;
    /// Whether memory needs to be zero-initialized after L0 allocation.
    bool ZeroInit = false;

    /// Get bucket ID from the specified allocation size.
    uint32_t getBucketId(size_t Size) {
      uint32_t Count = 0;
      for (size_t SZ = AllocMin; SZ < Size; Count++)
        SZ <<= 1;
      return Count;
    }
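
    // Worked example (illustrative only, using the default AllocMin and
    // AllocUnit values above): a request of 200 bytes maps to
    // getBucketId(200) == 2 (64 -> 128 -> 256), so it is served with 256-byte
    // chunks. If BlockCapacity were 16, then
    //   ChunkSize * BlockCapacity = 4KB <= AllocUnit, so BlockSize = 64KB,
    // i.e. one block can serve 256 such chunks before the pool has to
    // allocate another block from the GPU RT.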
  public:
    MemPoolTy() = default;
    MemPoolTy(const MemPoolTy &) = delete;
    MemPoolTy(MemPoolTy &&) = delete;
    MemPoolTy &operator=(const MemPoolTy &) = delete;
    MemPoolTy &operator=(const MemPoolTy &&) = delete;
    ~MemPoolTy() = default;

    void printUsage();

    /// Initialize the pool with allocation kind, allocator, and user options.
    Error init(int32_t Kind, MemAllocatorTy *Allocator,
               const L0OptionsTy &Option);
    // Initialize the pool used as the reduction pool.
    Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
    // Initialize the pool used as the small memory pool with fixed parameters.
    Error init(MemAllocatorTy *Allocator);
    /// Release resources used in the pool.
    Error deinit();

    /// Allocate the requested size of memory from this pool.
    /// AllocSize is the chunk size internally used for the returned memory.
    Expected<void *> alloc(size_t Size, size_t &AllocSize);
    /// Deallocate the specified memory and return the deallocated block size.
    size_t dealloc(void *Ptr);
  }; // MemPoolTy

  /// Allocation information maintained in the plugin.
  class MemAllocInfoMapTy {
    /// Map from allocated pointer to allocation information.
    std::map<void *, MemAllocInfoTy> Map;
    /// Map from target alloc kind to number of implicit arguments.
    std::array<size_t, MaxMemKind> NumImplicitArgs;

  public:
    /// Add allocation information to the map.
    void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
             int32_t Kind, bool InPool = false, bool ImplicitArg = false);

    /// Remove allocation information for the given memory location.
    bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);

    /// Find allocation information for the given memory location.
    const MemAllocInfoTy *find(void *Ptr) const {
      auto AllocInfo = Map.find(Ptr);
      if (AllocInfo == Map.end())
        return nullptr;
      return &AllocInfo->second;
    }

    /// Check if the map contains the given pointer and size.
    bool contains(const void *Ptr, size_t Size) const {
      if (Map.size() == 0)
        return false;
      auto I = Map.upper_bound(const_cast<void *>(Ptr));
      if (I == Map.begin())
        return false;
      --I;
      uintptr_t PtrAsInt = reinterpret_cast<uintptr_t>(Ptr);
      uintptr_t MapBase = reinterpret_cast<uintptr_t>(I->first);
      uintptr_t MapSize = static_cast<uintptr_t>(I->second.ReqSize);
      bool Ret = MapBase <= PtrAsInt && PtrAsInt + Size <= MapBase + MapSize;
      return Ret;
    }

    /// Return the number of implicit arguments for the specified allocation
    /// kind.
    size_t getNumImplicitArgs(int32_t Kind) {
      assert(Kind >= 0 && Kind < MaxMemKind &&
             "Invalid target allocation kind");
      return NumImplicitArgs[Kind];
    }
  }; // MemAllocInfoMapTy
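
  // How the containment check above resolves a pointer (illustrative values):
  // contains() uses upper_bound() to find the closest recorded allocation at
  // or below Ptr and then checks the byte range against ReqSize. For an entry
  // keyed at 0x1000 with ReqSize == 256:
  //   contains((void *)0x1040, 64) -> true  (0x1040 + 64 <= 0x1100)
  //   contains((void *)0x10F0, 64) -> false (0x10F0 + 64 >  0x1100)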
  /// L0 context to use.
  const L0ContextTy *L0Context = nullptr;
  /// L0 device to use.
  L0DeviceTy *Device = nullptr;
  /// Whether the device supports large memory allocation.
  bool SupportsLargeMem = false;
  /// Cached max allocation size supported by the device.
  uint64_t MaxAllocSize;
  /// Map from allocation kind to memory statistics.
  std::array<MemStatTy, MaxMemKind> Stats;
  /// Map from allocation kind to memory pool.
  std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
  /// Memory pool dedicated to reduction scratch space.
  std::unique_ptr<MemPoolTy> ReductionPool;
  /// Memory pool dedicated to reduction counters.
  std::unique_ptr<MemPoolTy> CounterPool;
  /// Allocation information map.
  MemAllocInfoMapTy AllocInfo;
  /// RTL-owned memory that needs to be freed automatically.
  std::vector<void *> MemOwned;
  /// Lock protection.
  std::mutex Mtx;
  /// Allocator only supports host memory.
  bool IsHostMem = false;

  // Internal deallocation function to be called when already holding the Mtx
  // lock.
  Error deallocLocked(void *Ptr);

  /// Allocate memory from L0 GPU RT.
  Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
  /// Deallocate memory from L0 GPU RT.
  Error deallocFromL0(void *Ptr);

  /// We use an over-allocation workaround to support target pointers with
  /// offsets, and a positive "ActiveSize" is specified in such cases to
  /// correct debug logging.
  Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
                                     size_t ActiveSize = 0) {
    auto MemOrErr = allocFromL0(Size, Align, Kind);
    if (!MemOrErr)
      return MemOrErr;
    size_t LoggedSize = ActiveSize ? ActiveSize : Size;
    log(LoggedSize, Size, Kind);
    return MemOrErr;
  }

  /// Log memory allocation/deallocation.
  void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
    if (Kind < 0 || Kind >= MaxMemKind)
      return; // Stat is disabled.
    auto &ST = Stats[Kind];
    int32_t I = Pool ? 1 : 0;
    if (ReqSize > 0) {
      ST.Requested[I] += ReqSize;
      ST.Allocated[I] += Size;
      ST.InUse[I] += Size;
      ST.NumAllocs[I]++;
    } else {
      ST.Freed[I] += Size;
      ST.InUse[I] -= Size;
    }
    ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
  }

  /// Perform a copy operation.
  Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
  /// Perform a memory fill operation.
  Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);

  /// Allocate memory with the specified information from a memory pool.
  Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
                                 intptr_t Offset, bool UserAlloc,
                                 bool DevMalloc, uint32_t MemAdvice,
                                 AllocOptionTy AllocOpt);

  /// Deallocate memory from a memory pool.
  Error deallocFromPool(void *Ptr) {
    std::lock_guard<std::mutex> Lock(Mtx);
    return deallocLocked(Ptr);
  }

public:
  MemAllocatorTy() : MaxAllocSize(std::numeric_limits<uint64_t>::max()) {}
  MemAllocatorTy(const MemAllocatorTy &) = delete;
  MemAllocatorTy(MemAllocatorTy &&) = delete;
  MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
  MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
  ~MemAllocatorTy() = default;

  Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
  Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);

  void updateMaxAllocSize(L0DeviceTy &L0Device);

  /// Release resources and report statistics if requested.
  Error deinit();

  /// Allocate memory with the specified information from a memory pool.
  Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
                         intptr_t Offset, bool UserAlloc, bool DevMalloc,
                         uint32_t MemAdvice, AllocOptionTy AllocOpt) {
    return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
                         MemAdvice, AllocOpt);
  }

  /// Deallocate memory.
  Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }

  /// Check if the given memory location and size belong to any allocated
  /// memory.
  bool contains(const void *Ptr, size_t Size) {
    std::lock_guard<std::mutex> Lock(Mtx);
    return AllocInfo.contains(Ptr, Size);
  }

  /// Get allocation information for the specified memory location.
  const MemAllocInfoTy *getAllocInfo(void *Ptr) {
    std::lock_guard<std::mutex> Lock(Mtx);
    return AllocInfo.find(Ptr);
  }

  /// Get kernel indirect access flags using implicit argument info.
  ze_kernel_indirect_access_flags_t getIndirectFlags() {
    std::lock_guard<std::mutex> Lock(Mtx);
    ze_kernel_indirect_access_flags_t Ret = 0;
    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
    return Ret;
  }
}; // MemAllocatorTy
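
// Minimal usage sketch of MemAllocatorTy (hypothetical call site; argument
// values are illustrative and AllocOpt stands for an AllocOptionTy prepared
// by the caller):
//
//   Expected<void *> PtrOrErr =
//       Allocator.alloc(Size, /*Align=*/0, TARGET_ALLOC_DEVICE, /*Offset=*/0,
//                       /*UserAlloc=*/false, /*DevMalloc=*/false,
//                       /*MemAdvice=*/0, AllocOpt);
//   if (!PtrOrErr)
//     return PtrOrErr.takeError();
//   ...
//   if (Error Err = Allocator.dealloc(*PtrOrErr))
//     return Err;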
// Simple generic wrapper to reuse objects. Objects must have an accessible
// zero-argument constructor.
template <typename ObjTy> class ObjPool {
  // Protection.
  std::unique_ptr<std::mutex> Mtx;
  // List of objects.
  std::list<ObjTy *> Objects;

public:
  ObjPool() { Mtx.reset(new std::mutex); }
  ObjPool(const ObjPool &) = delete;
  ObjPool(ObjPool &&) = delete;
  ObjPool &operator=(const ObjPool &) = delete;
  ObjPool &operator=(const ObjPool &&) = delete;

  ObjTy *get() {
    if (!Objects.empty()) {
      std::lock_guard<std::mutex> Lock(*Mtx);
      if (!Objects.empty()) {
        const auto Ret = Objects.back();
        Objects.pop_back();
        return Ret;
      }
    }
    return new ObjTy();
  }

  void release(ObjTy *Obj) {
    std::lock_guard<std::mutex> Lock(*Mtx);
    Objects.push_back(Obj);
  }

  ~ObjPool() {
    for (auto *Object : Objects)
      delete Object;
  }
};

/// Common event pool used in the plugin. This event pool assumes all events
/// from the pool are host-visible and use the same event pool flags.
class EventPoolTy {
  /// Size of the L0 event pools created on demand.
  size_t PoolSize = 64;
  /// Context of the events.
  ze_context_handle_t Context = nullptr;
  /// Additional event pool flags common to this pool.
  uint32_t Flags = 0;
  /// Protection.
  std::unique_ptr<std::mutex> Mtx;
  /// List of created L0 event pools.
  std::list<ze_event_pool_handle_t> Pools;
  /// List of free L0 events.
  std::list<ze_event_handle_t> Events;

#ifdef OMPT_SUPPORT
  /// Event to OMPT record map. The timestamp information is recorded to the
  /// OMPT record before the event is recycled.
  std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
#endif // OMPT_SUPPORT

public:
  /// Initialize context, flags, and mutex.
  Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
    Context = ContextIn;
    Flags = FlagsIn;
    Mtx.reset(new std::mutex);
    return Plugin::success();
  }

  /// Destroy L0 resources.
  Error deinit() {
    for (auto E : Events)
      CALL_ZE_RET_ERROR(zeEventDestroy, E);
    for (auto P : Pools)
      CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
    return Plugin::success();
  }

  /// Get a free event from the pool.
  Expected<ze_event_handle_t> getEvent();
  /// Return an event to the pool.
  Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
};

/// Staging buffer.
/// A single staging buffer is not enough when batching is enabled since there
/// can be multiple pending copy operations.
class StagingBufferTy {
  /// Context for L0 calls.
  ze_context_handle_t Context = nullptr;
  /// Max allowed size for the staging buffer.
  size_t Size = L0StagingBufferSize;
  /// Number of buffers allocated together.
  size_t Count = L0StagingBufferCount;
  /// Buffers, grown by Count when a new buffer is required.
  llvm::SmallVector<void *> Buffers;
  /// Next buffer location in the buffers.
  size_t Offset = 0;

  Expected<void *> addBuffers() {
    ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
                                       nullptr, 0};
    void *Ret = nullptr;
    size_t AllocSize = Size * Count;
    CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
                      L0DefaultAlignment, &Ret);
    Buffers.push_back(Ret);
    return Ret;
  }

public:
  StagingBufferTy() = default;
  StagingBufferTy(const StagingBufferTy &) = delete;
  StagingBufferTy(StagingBufferTy &&) = delete;
  StagingBufferTy &operator=(const StagingBufferTy &) = delete;
  StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
  ~StagingBufferTy() = default;

  Error clear() {
    for (auto Ptr : Buffers)
      CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
    Context = nullptr;
    return Plugin::success();
  }

  bool initialized() const { return Context != nullptr; }

  void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
    Context = ContextIn;
    Size = SizeIn;
    Count = CountIn;
  }

  void reset() { Offset = 0; }

  /// Always return the first buffer.
  Expected<void *> get() {
    if (Size == 0 || Count == 0)
      return nullptr;
    return Buffers.empty() ? addBuffers() : Buffers.front();
  }
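
  // Illustration of the slicing done below (hypothetical sizes): with
  // Size = 16KB and Count = 4, each addBuffers() call makes one 64KB host
  // allocation, and getNext() hands out 16KB slices at offsets 0, 16KB, 32KB,
  // and 48KB of the most recently added allocation before growing again.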
  /// Return the next available buffer.
  Expected<void *> getNext() {
    void *Ret = nullptr;
    if (Size == 0 || Count == 0)
      return Ret;
    size_t AllocSize = Size * Count;
    bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
    if (NeedToGrow) {
      auto PtrOrErr = addBuffers();
      if (!PtrOrErr)
        return PtrOrErr.takeError();
      Ret = *PtrOrErr;
    } else {
      Ret = reinterpret_cast<void *>(
          reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
    }
    if (!Ret)
      return nullptr;
    Offset += Size;
    return Ret;
  }

  /// Return either the fixed first buffer or the next buffer.
  Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
};

} // namespace llvm::omp::target::plugin

#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H