author | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2022-07-28 16:32:56 +0100
---|---|---
committer | Tom Stellard <tstellar@redhat.com> | 2022-08-08 11:00:41 -0700
commit | b5151c32f9aab3e9c61e622db9ec671dbd2331d9 (patch) |
tree | 18245eaf9dfb58bf82fedb26c9c7abae125de01e /openmp |
parent | 410bfa00a68b9876381aa1d3d48a2a4f5ee35e2c (diff) |
[openmp][amdgpu] Move global DeviceInfo behind call syntax prior to using D130712
(cherry picked from commit 75aa52106452a1d15ca487af7b408a812012e133)
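The change itself is mechanical: the global keeps its storage, but every use site switches from naming the object to calling an accessor that returns a reference to it. A minimal sketch of the pattern, using the `DeviceInfoState`/`DeviceInfo()` names from the patch (the struct field and the caller are illustrative stand-ins, not the plugin's real code):

```cpp
// Accessor pattern applied by this commit. Only DeviceInfoState and
// DeviceInfo() are names from the patch; the rest is a stand-in.
struct RTLDeviceInfoTy {
  int NumberOfDevices = 0; // stand-in for the plugin's real state
};

// Storage is unchanged: still a global object with static lifetime.
static RTLDeviceInfoTy DeviceInfoState;

// All access funnels through one function, so how the state is created
// can later be changed in exactly one place.
RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }

int exampleCallSite() {
  return DeviceInfo().NumberOfDevices; // was: DeviceInfo.NumberOfDevices
}
```

Because the accessor returns a reference, call sites only change from `DeviceInfo.` to `DeviceInfo().`, and the indirection compiles away once inlined.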
Diffstat (limited to 'openmp')
-rw-r--r-- | openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 215
1 file changed, 109 insertions, 106 deletions
```diff
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 9b4bf41..7552986 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1113,14 +1113,17 @@ public:
 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static RTLDeviceInfoTy DeviceInfo;
+// Putting accesses to DeviceInfo global behind a function call prior
+// to changing to use init_plugin/deinit_plugin calls
+static RTLDeviceInfoTy DeviceInfoState;
+RTLDeviceInfoTy& DeviceInfo() { return DeviceInfoState; }
 
 namespace {
 
 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                      __tgt_async_info *AsyncInfo) {
   assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
 
   // Return success if we are not copying back to host from target.
   if (!HstPtr)
     return OFFLOAD_SUCCESS;
@@ -1129,7 +1132,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
      (long long unsigned)(Elf64_Addr)TgtPtr,
      (long long unsigned)(Elf64_Addr)HstPtr);
 
-  Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
+  Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
                                            DeviceId);
 
   if (Err != HSA_STATUS_SUCCESS) {
@@ -1148,7 +1151,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                    __tgt_async_info *AsyncInfo) {
   assert(AsyncInfo && "AsyncInfo is nullptr");
   hsa_status_t Err;
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   // Return success if we are not doing host to target.
   if (!HstPtr)
     return OFFLOAD_SUCCESS;
@@ -1156,7 +1159,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
-  Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
+  Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
                                            DeviceId);
   if (Err != HSA_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
@@ -1377,7 +1380,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
 
   KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
   std::string KernelName = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
+  auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
   if (KernelInfoTable[DeviceId].find(KernelName) ==
       KernelInfoTable[DeviceId].end()) {
     DP("Kernel %s not found\n", KernelName.c_str());
@@ -1387,7 +1390,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   const atl_kernel_info_t KernelInfoEntry =
       KernelInfoTable[DeviceId][KernelName];
   const uint32_t GroupSegmentSize =
-      KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
+      KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
   const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
   const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
   const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
@@ -1399,12 +1402,12 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
    * Set limit based on ThreadsPerGroup and GroupsPerDevice
    */
   LaunchVals LV =
-      getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env,
+      getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
                     NumTeams,      // From run_region arg
                     ThreadLimit,   // From run_region arg
                     LoopTripcount, // From run_region arg
-                    DeviceInfo.NumTeams[KernelInfo->DeviceId]);
+                    DeviceInfo().NumTeams[KernelInfo->DeviceId]);
 
   const int GridSize = LV.GridSize;
   const int WorkgroupSize = LV.WorkgroupSize;
@@ -1425,7 +1428,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
 
   // Run on the device.
   {
-    hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next();
+    hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
     if (!Queue) {
       return OFFLOAD_FAIL;
     }
@@ -1488,12 +1491,12 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
       ImplArgs->offset_z = 0;
 
       // assign a hostcall buffer for the selected Q
-      if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) {
+      if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
         // hostrpc_assign_buffer is not thread safe, and this function is
         // under a multiple reader lock, not a writer lock.
         static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_lock(&HostcallInitLock);
-        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId],
+        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId],
                                                 Queue, DeviceId);
         pthread_mutex_unlock(&HostcallInitLock);
         if (!Buffer) {
@@ -1527,7 +1530,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
       Packet->kernarg_address = KernArg;
     }
 
-    hsa_signal_t S = DeviceInfo.FreeSignalPool.pop();
+    hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
     if (S.handle == 0) {
       DP("Failed to get signal instance\n");
       return OFFLOAD_FAIL;
@@ -1549,7 +1552,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
     assert(ArgPool);
     ArgPool->deallocate(KernArg);
-    DeviceInfo.FreeSignalPool.push(S);
+    DeviceInfo().FreeSignalPool.push(S);
   }
 
   DP("Kernel completed\n");
@@ -1743,7 +1746,7 @@ hsa_status_t moduleRegisterFromMemoryToPlace(
   };
   return core::RegisterModuleFromMemory(
       KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
-      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
+      DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
       HSAExecutables);
 }
 
@@ -1839,7 +1842,7 @@ struct DeviceEnvironment {
       DP("Setting global device environment after load (%u bytes)\n",
          SI.Size);
       int DeviceId = HostDeviceEnv.DeviceNum;
-      auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
+      auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
       void *StatePtr;
       uint32_t StatePtrSize;
       hsa_status_t Err = interop_hsa_get_symbol_info(
@@ -1855,7 +1858,7 @@ struct DeviceEnvironment {
         return HSA_STATUS_ERROR;
       }
 
-      return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
+      return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
                                                 StatePtrSize, DeviceId);
     }
   }
@@ -1866,7 +1869,7 @@ struct DeviceEnvironment {
 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
   uint64_t Rounded = 4 * ((Size + 3) / 4);
   void *Ptr;
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
+  hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
   if (Err != HSA_STATUS_SUCCESS) {
     return Err;
@@ -1893,8 +1896,8 @@ bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
 
 namespace core {
 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
-  return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
-                                     &DeviceInfo.HSAAgents[0], NULL, Ptr);
+  return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
+                                     &DeviceInfo().HSAAgents[0], NULL, Ptr);
 }
 } // namespace core
@@ -1916,7 +1919,7 @@ static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
 
   auto TripleTargetID = llvm::StringRef(TargetID);
   if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
-    DeviceInfo.TargetID.push_back(TripleTargetID.ltrim('-').str());
+    DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
   }
   return HSA_STATUS_SUCCESS;
 }
@@ -2034,13 +2037,13 @@ int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
 
   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
     __tgt_rtl_init_device(DeviceId);
-    hsa_agent_t agent = DeviceInfo.HSAAgents[DeviceId];
+    hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
     hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
     if (err != HSA_STATUS_SUCCESS) {
       DP("Error iterating ISAs\n");
       return false;
     }
-    if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo.TargetID[DeviceId]))
+    if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
       return false;
   }
   DP("Image has Target ID compatible with the current environment: %s\n",
@@ -2053,8 +2056,8 @@ int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
 
 int __tgt_rtl_number_of_devices() {
   // If the construction failed, no methods are safe to call
-  if (DeviceInfo.ConstructionSucceeded) {
-    return DeviceInfo.NumberOfDevices;
+  if (DeviceInfo().ConstructionSucceeded) {
+    return DeviceInfo().NumberOfDevices;
   }
   DP("AMDGPU plugin construction failed. Zero devices available\n");
   return 0;
@@ -2062,7 +2065,7 @@ int __tgt_rtl_number_of_devices() {
 
 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
   DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo.RequiresFlags = RequiresFlags;
+  DeviceInfo().RequiresFlags = RequiresFlags;
   return RequiresFlags;
 }
@@ -2075,7 +2078,7 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
   // this is per device id init
   DP("Initialize the device id: %d\n", DeviceId);
 
-  hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];
+  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
 
   // Get number of Compute Unit
   uint32_t ComputeUnits = 0;
@@ -2083,39 +2086,39 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
       &ComputeUnits);
   if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.ComputeUnits[DeviceId] = 1;
+    DeviceInfo().ComputeUnits[DeviceId] = 1;
     DP("Error getting compute units : settiing to 1\n");
   } else {
-    DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
-    DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
+    DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
+    DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
   }
 
   char GetInfoName[64]; // 64 max size returned by get info
   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                            (void *)GetInfoName);
   if (Err)
-    DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
+    DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
   else {
-    DeviceInfo.GPUName[DeviceId] = GetInfoName;
+    DeviceInfo().GPUName[DeviceId] = GetInfoName;
   }
 
   if (print_kernel_trace & STARTUP_DETAILS)
     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
-       DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());
+       DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
 
   // Query attributes to determine number of threads/block and blocks/grid.
   uint16_t WorkgroupMaxDim[3];
   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                            &WorkgroupMaxDim);
   if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
+    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
     DP("Error getting grid dims: num groups : %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
-    DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
-    DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]);
+    DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
+    DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
   } else {
-    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
+    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
        "at the hard limit\n",
        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
@@ -2125,22 +2128,22 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
   hsa_dim3_t GridMaxDim;
   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
   if (Err == HSA_STATUS_SUCCESS) {
-    DeviceInfo.ThreadsPerGroup[DeviceId] =
+    DeviceInfo().ThreadsPerGroup[DeviceId] =
         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
-        DeviceInfo.GroupsPerDevice[DeviceId];
+        DeviceInfo().GroupsPerDevice[DeviceId];
 
-    if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) {
-      DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
+    if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
+      DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
-    } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId],
+    } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
                                  RTLDeviceInfoTy::MaxWgSize)) {
       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
    } else {
      DP("Using ROCm Queried thread limit: %d\n",
-        DeviceInfo.ThreadsPerGroup[DeviceId]);
+        DeviceInfo().ThreadsPerGroup[DeviceId]);
    }
   } else {
-    DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
+    DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
     DP("Error getting max block dimension, use default:%d \n",
        RTLDeviceInfoTy::MaxWgSize);
   }
@@ -2151,27 +2154,27 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
   hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
   if (Err == HSA_STATUS_SUCCESS) {
     DP("Queried wavefront size: %d\n", WavefrontSize);
-    DeviceInfo.WarpSize[DeviceId] = WavefrontSize;
+    DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
   } else {
     // TODO: Burn the wavefront size into the code object
     DP("Warning: Unknown wavefront size, assuming 64\n");
-    DeviceInfo.WarpSize[DeviceId] = 64;
+    DeviceInfo().WarpSize[DeviceId] = 64;
   }
 
   // Adjust teams to the env variables
-  if (DeviceInfo.Env.TeamLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId],
-                         DeviceInfo.Env.TeamLimit))) {
+  if (DeviceInfo().Env.TeamLimit > 0 &&
+      (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
+                         DeviceInfo().Env.TeamLimit))) {
     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
-       DeviceInfo.Env.TeamLimit);
+       DeviceInfo().Env.TeamLimit);
   }
 
   // Set default number of teams
-  if (DeviceInfo.Env.NumTeams > 0) {
-    DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams;
+  if (DeviceInfo().Env.NumTeams > 0) {
+    DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
     DP("Default number of teams set according to environment %d\n",
-       DeviceInfo.Env.NumTeams);
+       DeviceInfo().Env.NumTeams);
   } else {
     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
     int TeamsPerCU = DefaultTeamsPerCU;
@@ -2179,45 +2182,45 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
     }
 
-    DeviceInfo.NumTeams[DeviceId] =
-        TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId];
+    DeviceInfo().NumTeams[DeviceId] =
+        TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
     DP("Default number of teams = %d * number of compute units %d\n",
-       TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]);
+       TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
   }
 
-  if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId],
-                        DeviceInfo.GroupsPerDevice[DeviceId])) {
+  if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
+                        DeviceInfo().GroupsPerDevice[DeviceId])) {
     DP("Default number of teams exceeds device limit, capping at %d\n",
-       DeviceInfo.GroupsPerDevice[DeviceId]);
+       DeviceInfo().GroupsPerDevice[DeviceId]);
   }
 
   // Adjust threads to the env variables
-  if (DeviceInfo.Env.TeamThreadLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
-                         DeviceInfo.Env.TeamThreadLimit))) {
+  if (DeviceInfo().Env.TeamThreadLimit > 0 &&
+      (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
+                         DeviceInfo().Env.TeamThreadLimit))) {
     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
-       DeviceInfo.Env.TeamThreadLimit);
+       DeviceInfo().Env.TeamThreadLimit);
   }
 
   // Set default number of threads
-  DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
+  DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
   DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultWgSize);
-  if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
-                        DeviceInfo.ThreadsPerGroup[DeviceId])) {
+  if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
+                        DeviceInfo().ThreadsPerGroup[DeviceId])) {
    DP("Default number of threads exceeds device limit, capping at %d\n",
-      DeviceInfo.ThreadsPerGroup[DeviceId]);
+      DeviceInfo().ThreadsPerGroup[DeviceId]);
   }
 
   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
-     DeviceId, DeviceInfo.GroupsPerDevice[DeviceId],
-     DeviceInfo.ThreadsPerGroup[DeviceId]);
+     DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
+     DeviceInfo().ThreadsPerGroup[DeviceId]);
 
   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
-     DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId],
-     DeviceInfo.GroupsPerDevice[DeviceId],
-     DeviceInfo.GroupsPerDevice[DeviceId] *
-         DeviceInfo.ThreadsPerGroup[DeviceId]);
+     DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
+     DeviceInfo().GroupsPerDevice[DeviceId],
+     DeviceInfo().GroupsPerDevice[DeviceId] *
+         DeviceInfo().ThreadsPerGroup[DeviceId]);
 
   return OFFLOAD_SUCCESS;
 }
@@ -2227,9 +2230,9 @@ __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
 
 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                           __tgt_device_image *Image) {
-  DeviceInfo.LoadRunLock.lock();
+  DeviceInfo().LoadRunLock.lock();
   __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
-  DeviceInfo.LoadRunLock.unlock();
+  DeviceInfo().LoadRunLock.unlock();
   return Res;
 }
@@ -2259,7 +2262,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
   const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
 
-  DeviceInfo.clearOffloadEntriesTable(DeviceId);
+  DeviceInfo().clearOffloadEntriesTable(DeviceId);
 
   // We do not need to set the ELF version because the caller of this function
   // had to do that to decide the right runtime to use
@@ -2268,25 +2271,25 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
     return NULL;
 
   {
-    auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices,
-                                 DeviceInfo.Env.DynamicMemSize, Image, ImgSize);
+    auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
+                                 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
 
-    auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId];
-    auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
+    auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
+    auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
     hsa_status_t Err = moduleRegisterFromMemoryToPlace(
         KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
         [&](void *Data, size_t Size) {
          if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
-            __atomic_store_n(&DeviceInfo.HostcallRequired, true,
+            __atomic_store_n(&DeviceInfo().HostcallRequired, true,
                              __ATOMIC_RELEASE);
          }
          return Env.beforeLoading(Data, Size);
        },
-        DeviceInfo.HSAExecutables);
+        DeviceInfo().HSAExecutables);
 
     check("Module registering", Err);
     if (Err != HSA_STATUS_SUCCESS) {
-      const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str();
+      const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
       const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
 
       if (strcmp(DeviceName, ElfName) != 0) {
@@ -2315,7 +2318,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     void *StatePtr;
     uint32_t StatePtrSize;
-    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
+    auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
     hsa_status_t Err = interop_hsa_get_symbol_info(
        SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
        &StatePtrSize);
@@ -2340,7 +2343,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
      return NULL;
    }
 
-    auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
+    auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
    if (DSS.first.get() == nullptr) {
      assert(DSS.second == 0);
      void *Ptr = NULL;
@@ -2362,7 +2365,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
    }
 
    // write ptr to device memory so it can be used by later kernels
-    Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
+    Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
                                              DeviceId);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("memcpy install of state_ptr failed\n");
@@ -2399,7 +2402,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
      void *Varptr;
      uint32_t Varsize;
 
-      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
+      auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
      hsa_status_t Err = interop_hsa_get_symbol_info(
          SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
@@ -2419,14 +2422,14 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
         DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
      Entry.addr = (void *)Varptr;
 
-      DeviceInfo.addOffloadEntry(DeviceId, Entry);
+      DeviceInfo().addOffloadEntry(DeviceId, Entry);
 
-      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+      if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
          E->flags & OMP_DECLARE_TARGET_LINK) {
        // If unified memory is present any target link variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
-        Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr,
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
                                                  sizeof(void *), DeviceId);
        if (Err != HSA_STATUS_SUCCESS)
          DP("Error when copying USM\n");
@@ -2442,7 +2445,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
    // errors in kernarg_segment_size previously treated as = 0 (or as undef)
    uint32_t KernargSegmentSize = 0;
-    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId];
+    auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
    hsa_status_t Err = HSA_STATUS_SUCCESS;
    if (!E->name) {
      Err = HSA_STATUS_ERROR;
@@ -2589,19 +2592,19 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                   CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo.KernArgPool));
+                                   DeviceInfo().KernArgPool));
    __tgt_offload_entry Entry = *E;
    Entry.addr = (void *)&KernelsList.back();
-    DeviceInfo.addOffloadEntry(DeviceId, Entry);
+    DeviceInfo().addOffloadEntry(DeviceId, Entry);
    DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
  }
 
-  return DeviceInfo.getOffloadEntriesTable(DeviceId);
+  return DeviceInfo().getOffloadEntriesTable(DeviceId);
 }
 
 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
  void *Ptr = NULL;
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
 
  if (Kind != TARGET_ALLOC_DEFAULT) {
    REPORT("Invalid target data allocation kind or requested allocator not "
@@ -2609,7 +2612,7 @@ void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
    return NULL;
  }
 
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
+  hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
  hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)Ptr);
@@ -2619,7 +2622,7 @@ void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
 
 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
                               int64_t Size) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
@@ -2630,7 +2633,7 @@ int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
 
 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
                                     int64_t Size, __tgt_async_info *AsyncInfo) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  if (AsyncInfo) {
    initAsyncInfo(AsyncInfo);
    return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
@@ -2640,7 +2643,7 @@ int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
 
 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
                                 int64_t Size) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
@@ -2653,13 +2656,13 @@ int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
                                       int64_t Size,
                                       __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  initAsyncInfo(AsyncInfo);
  return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
 }
 
 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  hsa_status_t Err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = core::Runtime::Memfree(TgtPtr);
@@ -2676,11 +2679,11 @@ int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                          int32_t ThreadLimit,
                                          uint64_t LoopTripcount) {
 
-  DeviceInfo.LoadRunLock.lock_shared();
+  DeviceInfo().LoadRunLock.lock_shared();
  int32_t Res =
      runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
                      NumTeams, ThreadLimit, LoopTripcount);
-  DeviceInfo.LoadRunLock.unlock_shared();
+  DeviceInfo().LoadRunLock.unlock_shared();
  return Res;
 }
@@ -2703,11 +2706,11 @@ int32_t __tgt_rtl_run_target_team_region_async(
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo);
 
-  DeviceInfo.LoadRunLock.lock_shared();
+  DeviceInfo().LoadRunLock.lock_shared();
  int32_t Res =
      runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
                      NumTeams, ThreadLimit, LoopTripcount);
-  DeviceInfo.LoadRunLock.unlock_shared();
+  DeviceInfo().LoadRunLock.unlock_shared();
  return Res;
 }
@@ -2740,7 +2743,7 @@ void __tgt_rtl_print_device_info(int32_t DeviceId) {
  // TODO: Assertion to see if DeviceId is correct
  // NOTE: We don't need to set context for print device info.
 
-  DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]);
+  DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
 }
 
 } // extern "C"
```
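The comment added at the top of the diff says this indirection is preparation for moving to explicit `init_plugin`/`deinit_plugin` calls. A hypothetical sketch of that follow-up, assuming the state becomes heap-owned and is created and destroyed by the plugin entry points (this is not part of this commit, and the actual follow-up in D130712 may differ):

```cpp
// Hypothetical follow-up (assumption, not this commit): the plugin state
// gets dynamic lifetime while call sites keep using DeviceInfo() unchanged.
#include <cassert>
#include <cstdint>

struct RTLDeviceInfoTy { /* plugin state */ };

static RTLDeviceInfoTy *DeviceInfoState = nullptr; // assumed heap-owned

RTLDeviceInfoTy &DeviceInfo() {
  assert(DeviceInfoState && "plugin used before __tgt_rtl_init_plugin");
  return *DeviceInfoState;
}

extern "C" int32_t __tgt_rtl_init_plugin() {
  DeviceInfoState = new RTLDeviceInfoTy();
  return 0; // OFFLOAD_SUCCESS
}

extern "C" int32_t __tgt_rtl_deinit_plugin() {
  delete DeviceInfoState;
  DeviceInfoState = nullptr;
  return 0; // OFFLOAD_SUCCESS
}
```

Constructing the state in an explicit hook rather than in a global constructor avoids running non-trivial HSA setup and teardown during shared-library load and unload, which is the usual motivation for this kind of staging.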