//===--- Level Zero Target RTL Implementation -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // GenericKernel implementation for SPIR-V/Xe machine. // //===----------------------------------------------------------------------===// #include "L0Kernel.h" #include "L0Device.h" #include "L0Plugin.h" #include "L0Program.h" namespace llvm::omp::target::plugin { bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, uint32_t *GroupSizesOut, L0LaunchEnvTy &KEnv) const { if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) return false; // Found matching input parameters. std::copy_n(GroupSizes, 3, GroupSizesOut); KEnv.GroupCounts = GroupCounts; return true; } void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, const uint32_t *GroupSizesIn, L0LaunchEnvTy &KEnv) { NumTeams = NumTeamsIn; ThreadLimit = ThreadLimitIn; std::copy_n(GroupSizesIn, 3, GroupSizes); GroupCounts = KEnv.GroupCounts; } Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) { const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice()); auto &KernelPR = getProperties(); ze_kernel_properties_t KP = {}; KP.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES; KP.pNext = nullptr; ze_kernel_preferred_group_size_properties_t KPrefGRPSize = {}; KPrefGRPSize.stype = ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES; KPrefGRPSize.pNext = nullptr; if (l0Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2) KP.pNext = &KPrefGRPSize; CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP); KernelPR.SIMDWidth = KP.maxSubgroupSize; KernelPR.Width = KP.maxSubgroupSize; if (KP.pNext) KernelPR.Width = KPrefGRPSize.preferredMultiple; if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen)) { KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth); } KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups; return Plugin::success(); } Error L0KernelTy::buildKernel(L0ProgramTy &Program) { const auto *KernelName = getName(); auto Module = Program.findModuleFromKernelName(KernelName); ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, KernelName}; CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel); if (auto Err = readKernelProperties(Program)) return Err; return Plugin::success(); } Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) { auto &Program = L0ProgramTy::makeL0Program(Image); if (auto Err = buildKernel(Program)) return Err; Program.addKernel(this); return Plugin::success(); } void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit, uint32_t *GroupSizes, L0LaunchEnvTy &KEnv) const { const KernelPropertiesTy &KernelPR = getProperties(); const auto DeviceId = Device.getDeviceId(); bool MaxGroupSizeForced = false; bool MaxGroupCountForced = false; uint32_t MaxGroupSize = Device.getMaxGroupSize(); const auto &Option = LevelZeroPluginTy::getOptions(); const auto OptSubscRate = Option.SubscriptionRate; auto &GroupCounts = KEnv.GroupCounts; uint32_t SIMDWidth = KernelPR.SIMDWidth; uint32_t KernelWidth = KernelPR.Width; uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; if (KernelMaxThreadGroupSize < MaxGroupSize) { MaxGroupSize = KernelMaxThreadGroupSize; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Capping maximum team size to %" PRIu32 " due to kernel constraints.\n", MaxGroupSize); } if (ThreadLimit > 0) { MaxGroupSizeForced = true; MaxGroupSize = ThreadLimit; } uint32_t MaxGroupCount = 0; if (NumTeams > 0) { MaxGroupCount = NumTeams; MaxGroupCountForced = true; } if (MaxGroupCountForced) { // If number of teams is specified by the user, then use KernelWidth. // WIs per WG by default, so that it matches // decideLoopKernelGroupArguments() behavior. if (!MaxGroupSizeForced) { MaxGroupSize = KernelWidth; } } else { const uint32_t NumSubslices = Device.getNumSubslices(); uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice(); if (KEnv.HalfNumThreads) NumThreadsPerSubslice /= 2; MaxGroupCount = NumSubslices * NumThreadsPerSubslice; if (MaxGroupSizeForced) { // Set group size for the HW capacity. uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; uint32_t NumGroupsPerSubslice = (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup; MaxGroupCount = NumGroupsPerSubslice * NumSubslices; } else { assert(!MaxGroupSizeForced && !MaxGroupCountForced); assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) && "Invalid maxGroupSize"); // Maximize group size. while (MaxGroupSize >= KernelWidth) { uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) { uint32_t NumGroupsPerSubslice = NumThreadsPerSubslice / NumThreadsPerGroup; MaxGroupCount = NumGroupsPerSubslice * NumSubslices; break; } MaxGroupSize -= KernelWidth; } } } uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1}; uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; if (!MaxGroupCountForced) { GRPCounts[0] *= OptSubscRate; } GroupCounts.groupCountX = GRPCounts[0]; GroupCounts.groupCountY = GRPCounts[1]; GroupCounts.groupCountZ = GRPCounts[2]; std::copy(GRPSizes, GRPSizes + 3, GroupSizes); } Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, int32_t ThreadLimit, uint32_t *GroupSizes, L0LaunchEnvTy &KEnv) const { const auto DeviceId = Device.getDeviceId(); const auto &KernelPR = getProperties(); // Read the most recent global thread limit and max teams. const int32_t NumTeamsICV = 0; const int32_t ThreadLimitICV = 0; bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG); KEnv.HalfNumThreads = LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG; uint32_t KernelWidth = KernelPR.Width; uint32_t SIMDWidth = KernelPR.SIMDWidth; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth); assert(SIMDWidth <= KernelWidth && "Invalid SIMD width."); if (ThreadLimit > 0) { // use thread_limit clause value default. ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit << " (thread_limit clause)"; } else if (ThreadLimitICV > 0) { // else use thread-limit-var ICV. ThreadLimit = ThreadLimitICV; ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit << " (thread-limit-icv)"; } size_t MaxThreadLimit = Device.getMaxGroupSize(); // Set correct max group size if the kernel was compiled with explicit SIMD. if (SIMDWidth == 1) MaxThreadLimit = Device.getNumThreadsPerSubslice(); if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) { MaxThreadLimit = KernelPR.MaxThreadGroupSize; ODBG(OLDT_Kernel) << "Capping maximum team size to " << MaxThreadLimit << " due to kernel constraints."; } if (ThreadLimit > static_cast(MaxThreadLimit)) { ThreadLimit = MaxThreadLimit; ODBG(OLDT_Kernel) << "Max team size exceeds current maximum " << MaxThreadLimit << ". Adjusted"; } // scope code to ease integration with downstream custom code. { if (NumTeams > 0) { ODBG(OLDT_Kernel) << "Number of teams is set to " << NumTeams << " (num_teams clause or no teams construct)"; } else if (NumTeamsICV > 0) { // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV); NumTeams = NumTeamsICV; ODBG(OLDT_Kernel) << "Max number of teams is set to " << NumTeams << " (OMP_NUM_TEAMS)"; } decideKernelGroupArguments(Device, (uint32_t)NumTeams, (uint32_t)ThreadLimit, GroupSizes, KEnv); } return Plugin::success(); } static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device, ze_kernel_handle_t zeKernel, L0LaunchEnvTy &KEnv, CommandModeTy CommandMode) { const auto DeviceId = l0Device.getDeviceId(); auto *IdStr = l0Device.getZeIdCStr(); auto CmdListOrErr = l0Device.getImmCmdList(); if (!CmdListOrErr) return CmdListOrErr.takeError(); const ze_command_list_handle_t CmdList = *CmdListOrErr; // Command queue is not used with immediate command list. INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Using immediate command list for kernel submission.\n"); auto EventOrError = l0Device.getEvent(); if (!EventOrError) return EventOrError.takeError(); ze_event_handle_t Event = *EventOrError; size_t NumWaitEvents = 0; ze_event_handle_t *WaitEvents = nullptr; auto *AsyncQueue = KEnv.AsyncQueue; if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) { if (CommandMode == CommandModeTy::AsyncOrdered) { NumWaitEvents = 1; WaitEvents = &AsyncQueue->WaitEvents.back(); } else { NumWaitEvents = AsyncQueue->WaitEvents.size(); WaitEvents = AsyncQueue->WaitEvents.data(); } } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Kernel depends on %zu data copying events.\n", NumWaitEvents); CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents); KEnv.KernelPR.Mtx.unlock(); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); if (KEnv.IsAsync) { AsyncQueue->WaitEvents.push_back(Event); AsyncQueue->KernelEvent = Event; } else { CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout); if (auto Err = l0Device.releaseEvent(Event)) return Err; } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), IdStr); return Plugin::success(); } static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device, ze_kernel_handle_t zeKernel, L0LaunchEnvTy &KEnv) { const auto DeviceId = l0Device.getDeviceId(); const auto *IdStr = l0Device.getZeIdCStr(); auto CmdListOrErr = l0Device.getCmdList(); if (!CmdListOrErr) return CmdListOrErr.takeError(); ze_command_list_handle_t CmdList = *CmdListOrErr; auto CmdQueueOrErr = l0Device.getCmdQueue(); if (!CmdQueueOrErr) return CmdQueueOrErr.takeError(); const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Using regular command list for kernel submission.\n"); ze_event_handle_t Event = nullptr; CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel, &KEnv.GroupCounts, Event, 0, nullptr); KEnv.KernelPR.Mtx.unlock(); CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(), CmdQueue, 1, &CmdList, nullptr); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout); CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); if (Event) { if (auto Err = l0Device.releaseEvent(Event)) return Err; } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), IdStr); return Plugin::success(); } Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, uint32_t NumThreads[3], uint32_t NumBlocks[3]) const { if (KernelEnvironment.Configuration.ExecMode != OMP_TGT_EXEC_MODE_BARE) { // For non-bare mode, the groups are already set in the launch. KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), NumThreads[0], NumThreads[1], NumThreads[2]); return Plugin::success(); } int32_t NumTeams = NumBlocks[0]; int32_t ThreadLimit = NumThreads[0]; if (NumTeams < 0) NumTeams = 0; if (ThreadLimit < 0) ThreadLimit = 0; uint32_t GroupSizes[3]; auto DeviceId = l0Device.getDeviceId(); auto &KernelPR = KEnv.KernelPR; // Check if we can reuse previous group parameters. bool GroupParamsReused = KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); if (!GroupParamsReused) { if (auto Err = getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv)) return Err; KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0], GroupSizes[1], GroupSizes[2]); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY, KEnv.GroupCounts.groupCountZ); if (!GroupParamsReused) { CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], GroupSizes[1], GroupSizes[2]); } return Plugin::success(); } Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv) const { // Set Kernel Indirect flags. ze_kernel_indirect_access_flags_t Flags = 0; Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags(); Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags(); if (KEnv.KernelPR.IndirectAccessFlags != Flags) { // Combine with common access flags. const auto FinalFlags = l0Device.getIndirectFlags() | Flags; CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags); ODBG(OLDT_Kernel) << "Setting indirect access flags " << reinterpret_cast(FinalFlags); KEnv.KernelPR.IndirectAccessFlags = Flags; } return Plugin::success(); } Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); __tgt_async_info *AsyncInfo = AsyncInfoWrapper; auto zeKernel = getZeKernel(); auto DeviceId = l0Device.getDeviceId(); int32_t NumArgs = KernelArgs.NumArgs; INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n", DPxPTR(zeKernel)); auto &Plugin = l0Device.getPlugin(); auto *IdStr = l0Device.getZeIdCStr(); auto &Options = LevelZeroPluginTy::getOptions(); bool IsAsync = AsyncInfo && l0Device.asyncEnabled(); if (IsAsync && !AsyncInfo->Queue) { AsyncInfo->Queue = reinterpret_cast(Plugin.getAsyncQueue()); if (!AsyncInfo->Queue) IsAsync = false; // Couldn't get a queue, revert to sync. } auto *AsyncQueue = IsAsync ? static_cast(AsyncInfo->Queue) : nullptr; auto &KernelPR = getProperties(); L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR); // Protect from kernel preparation to submission as kernels are shared. KernelPR.Mtx.lock(); if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks)) return Err; // Set kernel arguments. for (int32_t I = 0; I < NumArgs; I++) { // Scope code to ease integration with downstream custom code. { void *Arg = (static_cast(LaunchParams.Data))[I]; CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg), Arg == nullptr ? nullptr : &Arg); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Kernel Pointer argument %" PRId32 " (value: " DPxMOD ") was set successfully for device %s.\n", I, DPxPTR(Arg), IdStr); } } if (auto Err = setIndirectFlags(l0Device, KEnv)) return Err; // The next calls should unlock the KernelLock internally. const bool UseImmCmdList = l0Device.useImmForCompute(); if (UseImmCmdList) return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv, Options.CommandMode); return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv); } } // namespace llvm::omp::target::plugin