//===--- Level Zero Target RTL Implementation -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // GenericDevice instatiation for SPIR-V/Xe machine. // //===----------------------------------------------------------------------===// #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H #include "llvm/ADT/SmallVector.h" #include "PerThreadTable.h" #include "AsyncQueue.h" #include "L0Context.h" #include "L0Program.h" #include "PluginInterface.h" #include "TLS.h" namespace llvm::omp::target::plugin { using OmpInteropTy = omp_interop_val_t *; class LevelZeroPluginTy; // clang-format off enum class PCIIdTy : int32_t { None = 0x0000, SKL = 0x1900, KBL = 0x5900, CFL = 0x3E00, CFL_2 = 0x9B00, ICX = 0x8A00, TGL = 0xFF20, TGL_2 = 0x9A00, DG1 = 0x4900, RKL = 0x4C00, ADLS = 0x4600, RTL = 0xA700, MTL = 0x7D00, PVC = 0x0B00, DG2_ATS_M = 0x4F00, DG2_ATS_M_2 = 0x5600, LNL = 0x6400, BMG = 0xE200, }; /// Device type enumeration common to compiler and runtime. enum class DeviceArchTy : uint64_t { DeviceArch_None = 0, DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe DeviceArch_XeLPG = 0x0002, DeviceArch_XeHPC = 0x0004, DeviceArch_XeHPG = 0x0008, DeviceArch_Xe2LP = 0x0010, DeviceArch_Xe2HP = 0x0020, DeviceArch_x86_64 = 0x0100 }; // clang-format on struct L0DeviceIdTy { ze_device_handle_t zeId; int32_t RootId; int32_t SubId; int32_t CCSId; L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1, int32_t CCSId = -1) : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {} }; class L0DeviceTLSTy { /// Command list for each device. ze_command_list_handle_t CmdList = nullptr; /// Main copy command list for each device. ze_command_list_handle_t CopyCmdList = nullptr; /// Command queue for each device. ze_command_queue_handle_t CmdQueue = nullptr; /// Main copy command queue for each device. ze_command_queue_handle_t CopyCmdQueue = nullptr; /// Immediate command list for each device. ze_command_list_handle_t ImmCmdList = nullptr; /// Immediate copy command list for each device. ze_command_list_handle_t ImmCopyCmdList = nullptr; public: L0DeviceTLSTy() = default; ~L0DeviceTLSTy() { // assert all fields are nullptr on destruction. assert(!CmdList && !CopyCmdList && !CmdQueue && !CopyCmdQueue && !ImmCmdList && !ImmCopyCmdList && "L0DeviceTLSTy destroyed without clearing resources"); } L0DeviceTLSTy(const L0DeviceTLSTy &) = delete; L0DeviceTLSTy(L0DeviceTLSTy &&Other) { CmdList = std::exchange(Other.CmdList, nullptr); CopyCmdList = std::exchange(Other.CopyCmdList, nullptr); CmdQueue = std::exchange(Other.CmdQueue, nullptr); CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr); ImmCmdList = std::exchange(Other.ImmCmdList, nullptr); ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr); } Error deinit() { // destroy all lists and queues. if (CmdList) CALL_ZE_RET_ERROR(zeCommandListDestroy, CmdList); if (CopyCmdList) CALL_ZE_RET_ERROR(zeCommandListDestroy, CopyCmdList); if (ImmCmdList) CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList); if (ImmCopyCmdList) CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCopyCmdList); if (CmdQueue) CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue); if (CopyCmdQueue) CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CopyCmdQueue); CmdList = nullptr; CopyCmdList = nullptr; CmdQueue = nullptr; CopyCmdQueue = nullptr; ImmCmdList = nullptr; ImmCopyCmdList = nullptr; return Plugin::success(); } L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete; L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete; ze_command_list_handle_t getCmdList() const { return CmdList; } void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; } ze_command_list_handle_t getCopyCmdList() const { return CopyCmdList; } void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) { CopyCmdList = _CopyCmdList; } ze_command_list_handle_t getImmCmdList() const { return ImmCmdList; } void setImmCmdList(ze_command_list_handle_t ImmCmdListIn) { ImmCmdList = ImmCmdListIn; } ze_command_list_handle_t getImmCopyCmdList() const { return ImmCopyCmdList; } void setImmCopyCmdList(ze_command_list_handle_t ImmCopyCmdListIn) { ImmCopyCmdList = ImmCopyCmdListIn; } ze_command_queue_handle_t getCmdQueue() const { return CmdQueue; } void setCmdQueue(ze_command_queue_handle_t CmdQueueIn) { CmdQueue = CmdQueueIn; } ze_command_queue_handle_t getCopyCmdQueue() const { return CopyCmdQueue; } void setCopyCmdQueue(ze_command_queue_handle_t CopyCmdQueueIn) { CopyCmdQueue = CopyCmdQueueIn; } }; struct L0DeviceTLSTableTy : public PerThreadContainer, 8> { Error deinit() { return PerThreadTable::deinit( [](L0DeviceTLSTy &Entry) { return Entry.deinit(); }); } }; class L0DeviceTy final : public GenericDeviceTy { // Level Zero Context for this Device. L0ContextTy &l0Context; // Level Zero handle for this Device. ze_device_handle_t zeDevice; // Device Properties. ze_device_properties_t DeviceProperties{}; ze_device_compute_properties_t ComputeProperties{}; ze_device_memory_properties_t MemoryProperties{}; ze_device_cache_properties_t CacheProperties{}; /// Devices' default target allocation kind for internal allocation. int32_t AllocKind = TARGET_ALLOC_DEVICE; DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None; std::string DeviceName; /// Common indirect access flags for this device. ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0; /// Device UUID for toplevel devices only. std::string DeviceUuid; /// L0 Device ID as string. std::string zeId; /// Command queue group ordinals for each device. static constexpr uint32_t MaxOrdinal = std::numeric_limits::max(); std::pair ComputeOrdinal{MaxOrdinal, 0}; /// Command queue group ordinals for copying. std::pair CopyOrdinal{MaxOrdinal, 0}; /// Command queue index for each device. uint32_t ComputeIndex = 0; bool IsAsyncEnabled = false; /// Lock for this device. std::mutex Mutex; /// Contains all modules (possibly from multiple device images) to handle /// dynamic link across multiple images llvm::SmallVector GlobalModules; /// L0 programs created for this device std::list Programs; /// MemAllocator for this device. MemAllocatorTy MemAllocator; DeviceArchTy computeArch() const; /// Get default compute group ordinal. Returns Ordinal-NumQueues pair. std::pair findComputeOrdinal(); /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair. std::pair findCopyOrdinal(bool LinkCopy = false); public: L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, ze_device_handle_t zeDevice, L0ContextTy &DriverInfo, const std::string_view zeId, int32_t ComputeIndex) : GenericDeviceTy(Plugin, DeviceId, NumDevices, SPIRVGridValues), l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId), ComputeIndex(ComputeIndex) { DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; DeviceProperties.pNext = nullptr; ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES; ComputeProperties.pNext = nullptr; MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; MemoryProperties.pNext = nullptr; CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES; CacheProperties.pNext = nullptr; } static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) { return static_cast(Device); } LevelZeroPluginTy &getPlugin() { return reinterpret_cast(Plugin); } L0DeviceTLSTy &getTLS(); Error setContext() override { return Plugin::success(); } Error initImpl(GenericPluginTy &Plugin) override; Error deinitImpl() override; ze_device_handle_t getZeDevice() const { return zeDevice; } const L0ContextTy &getL0Context() const { return l0Context; } L0ContextTy &getL0Context() { return l0Context; } const std::string_view getName() const { return DeviceName; } const char *getNameCStr() const { return DeviceName.c_str(); } const char *getArchCStr() const; const std::string_view getZeId() const { return zeId; } const char *getZeIdCStr() const { return zeId.c_str(); } std::mutex &getMutex() { return Mutex; } uint32_t getComputeIndex() const { return ComputeIndex; } ze_kernel_indirect_access_flags_t getIndirectFlags() const { return IndirectAccessFlags; } size_t getNumGlobalModules() const { return GlobalModules.size(); } void addGlobalModule(ze_module_handle_t Module) { GlobalModules.push_back(Module); } ze_module_handle_t *getGlobalModulesArray() { return GlobalModules.data(); } L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) { for (auto &PGM : Programs) if (PGM.getMemoryBuffer() == Image) return &PGM; return nullptr; } Error buildAllKernels() { for (auto &PGM : Programs) { if (auto Err = PGM.loadModuleKernels()) return Err; } return Plugin::success(); } // add a new program to the device. Return a reference to the new program. Expected addProgram(int32_t ImageId, L0ProgramBuilderTy &Builder) { auto ImageOrErr = Builder.getELF(); if (!ImageOrErr) return ImageOrErr.takeError(); Programs.emplace_back(ImageId, *this, std::move(*ImageOrErr), Builder.getGlobalModule(), std::move(Builder.getModules())); return Programs.back(); } const L0ProgramTy &getLastProgram() const { return Programs.back(); } L0ProgramTy &getLastProgram() { return Programs.back(); } // Device properties getters. uint32_t getVendorId() const { return DeviceProperties.vendorId; } bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; } uint32_t getPCIId() const { return DeviceProperties.deviceId; } uint32_t getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; } uint32_t getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; } uint32_t getNumEUsPerSubslice() const { return DeviceProperties.numEUsPerSubslice; } uint32_t getNumSubslicesPerSlice() const { return DeviceProperties.numSubslicesPerSlice; } uint32_t getNumSlices() const { return DeviceProperties.numSlices; } uint32_t getNumSubslices() const { return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices; } uint32_t getNumEUs() const { return DeviceProperties.numEUsPerSubslice * getNumSubslices(); } uint32_t getTotalThreads() const { return DeviceProperties.numThreadsPerEU * getNumEUs(); } uint32_t getNumThreadsPerSubslice() const { return getNumEUsPerSubslice() * getNumThreadsPerEU(); } uint32_t getClockRate() const { return DeviceProperties.coreClockRate; } uint32_t getMaxSharedLocalMemory() const { return ComputeProperties.maxSharedLocalMemory; } uint32_t getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; } uint32_t getMaxGroupCount() const { return getMaxGroupCountX() * getMaxGroupCountY() * getMaxGroupCountZ(); } uint32_t getMaxGroupSizeX() const { return ComputeProperties.maxGroupSizeX; } uint32_t getMaxGroupSizeY() const { return ComputeProperties.maxGroupSizeY; } uint32_t getMaxGroupSizeZ() const { return ComputeProperties.maxGroupSizeZ; } uint32_t getMaxGroupCountX() const { return ComputeProperties.maxGroupCountX; } uint32_t getMaxGroupCountY() const { return ComputeProperties.maxGroupCountY; } uint32_t getMaxGroupCountZ() const { return ComputeProperties.maxGroupCountZ; } uint32_t getMemoryClockRate() const { return MemoryProperties.maxClockRate; } uint64_t getGlobalMemorySize() const { return MemoryProperties.totalSize; } size_t getCacheSize() const { return CacheProperties.cacheSize; } uint64_t getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; } int32_t getAllocKind() const { return AllocKind; } DeviceArchTy getDeviceArch() const { return DeviceArch; } bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; } static bool isDiscrete(uint32_t PCIId) { switch (static_cast(PCIId & 0xFF00)) { case PCIIdTy::DG1: case PCIIdTy::PVC: case PCIIdTy::DG2_ATS_M: case PCIIdTy::DG2_ATS_M_2: case PCIIdTy::BMG: return true; default: return false; } } static bool isDiscrete(ze_device_handle_t Device) { ze_device_properties_t PR{}; PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; PR.pNext = nullptr; CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR); return isDiscrete(PR.deviceId); } bool isDiscreteDevice() { return isDiscrete(getPCIId()); } bool isDeviceIPorNewer(uint32_t Version) const; const std::string_view getUuid() const { return DeviceUuid; } uint32_t getComputeEngine() const { return ComputeOrdinal.first; } uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; } bool hasMainCopyEngine() const { return CopyOrdinal.first != MaxOrdinal; } uint32_t getMainCopyEngine() const { return CopyOrdinal.first; } bool deviceRequiresImmCmdList() const { constexpr uint32_t BMGIP = 0x05004000; return isDeviceIPorNewer(BMGIP); } bool asyncEnabled() const { return IsAsyncEnabled; } bool useImmForCompute() const { return true; } bool useImmForCopy() const { return true; } bool useImmForInterop() const { return true; } void reportDeviceInfo() const; // Command queues related functions. /// Create a command list with given ordinal and flags. Expected createCmdList(ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, ze_command_list_flags_t Flags, const std::string_view DeviceIdStr); /// Create a command list with default flags. Expected createCmdList(ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, const std::string_view DeviceIdStr); Expected getCmdList(); /// Create a command queue with given ordinal and flags. Expected createCmdQueue(ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, uint32_t Index, ze_command_queue_flags_t Flags, const std::string_view DeviceIdStr); /// Create a command queue with default flags. Expected createCmdQueue(ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, uint32_t Index, const std::string_view DeviceIdStr, bool InOrder = false); /// Create a new command queue for the given OpenMP device ID. Expected createCommandQueue(bool InOrder = false); /// Create an immediate command list. Expected createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder = false); /// Create an immediate command list for computing. Expected createImmCmdList(bool InOrder = false) { return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder); } /// Create an immediate command list for copying. Expected createImmCopyCmdList(); Expected getCmdQueue(); Expected getCopyCmdList(); Expected getCopyCmdQueue(); Expected getImmCmdList(); Expected getImmCopyCmdList(); /// Enqueue copy command. Error enqueueMemCopy(void *Dst, const void *Src, size_t Size, __tgt_async_info *AsyncInfo = nullptr, bool UseCopyEngine = true); /// Enqueue asynchronous copy command. Error enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size, __tgt_async_info *AsyncInfo, bool CopyTo = true); /// Enqueue fill command. Error enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize, size_t Size); /// Driver related functions. /// Reurn the driver handle for this device. ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); } /// Return context for this device. ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); } /// Return driver API version for this device. ze_api_version_t getDriverAPIVersion() const { return l0Context.getDriverAPIVersion(); } /// Return an event from the driver associated to this device. Expected getEvent() { return l0Context.getEventPool().getEvent(); } /// Release event to the pool associated to this device. Error releaseEvent(ze_event_handle_t Event) { return l0Context.getEventPool().releaseEvent(Event, *this); } StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); } bool supportsLargeMem() const { return l0Context.supportsLargeMem(); } // Allocation related routines. /// Data alloc. Expected dataAlloc( size_t Size, size_t Align, int32_t Kind, intptr_t Offset, bool UserAlloc, bool DevMalloc = false, uint32_t MemAdvice = std::numeric_limits::max(), AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE); /// Data delete. Error dataDelete(void *Ptr); /// Return the memory allocation type for the specified memory location. uint32_t getMemAllocType(const void *Ptr) const; const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; } MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; } MemAllocatorTy &getMemAllocator(int32_t Kind) { if (Kind == TARGET_ALLOC_HOST) return l0Context.getHostMemAllocator(); return getDeviceMemAllocator(); } MemAllocatorTy &getMemAllocator(const void *Ptr) { if (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr)) return l0Context.getHostMemAllocator(); return getDeviceMemAllocator(); } Error makeMemoryResident(void *Mem, size_t Size); // Generic device interface implementation. Expected loadBinaryImpl(std::unique_ptr &&TgtImage, int32_t ImageId) override; Error unloadBinaryImpl(DeviceImageTy *Image) override; Expected allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; Error free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override; /// This plugin does nothing to lock buffers. Do not return an error, just /// return the same pointer as the device pointer. Expected dataLockImpl(void *HstPtr, int64_t Size) override { return HstPtr; } Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } Expected isPinnedPtrImpl(void *, void *&, void *&, size_t &) const override { // Don't need to do anything, this is handled by the driver. return false; } Expected isAccessiblePtrImpl(const void *Ptr, size_t Size) override; Error dataFence(__tgt_async_info *Async) override; Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; Error synchronizeImpl(__tgt_async_info &AsyncInfo, bool ReleaseQueue) override; Error queryAsyncImpl(__tgt_async_info &AsyncInfo, bool ReleaseQueue, bool *IsQueueWorkCompleted) override; Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; Expected hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, AsyncInfoWrapperTy &AsyncInfo) override { return Plugin::error(ErrorCode::UNIMPLEMENTED, "enqueueHostCallImpl not implemented yet"); } // Event routines are used to ensure ordering between dataTransfers. Instead // of adding extra events in the queues, we make sure they're ordered by // using the events from the data submission APIs so we don't need to support // these routines. // They still need to report succes to indicate the event are handled // somewhere waitEvent and syncEvent should remain unimplemented. Expected isEventCompleteImpl(void *EventPtr, AsyncInfoWrapperTy &) override { return true; } Error createEventImpl(void **EventPtrStorage) override { return Plugin::success(); } Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); } Error recordEventImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) override { return Plugin::success(); } Error waitEventImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) override { return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", __func__); } Error syncEventImpl(void *EventPtr) override { return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", __func__); } Expected obtainInfoImpl() override; uint64_t getClockFrequency() const override { return getClockRate(); } uint64_t getHardwareParallelism() const override { return getTotalThreads(); } Error getDeviceMemorySize(uint64_t &DSize) override { DSize = getGlobalMemorySize(); return Plugin::success(); } Error getDeviceStackSize(uint64_t &V) override { V = 0; return Plugin::success(); } Expected constructKernel(const char *Name) override; Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); } Expected createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override; Error releaseInterop(omp_interop_val_t *Interop) override; interop_spec_t selectInteropPreference(int32_t InteropType, int32_t NumPrefers, interop_spec_t *Prefers) override; }; } // namespace llvm::omp::target::plugin #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H