//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Implementation of the interface to be used by Clang during the codegen of a // target region. // //===----------------------------------------------------------------------===// #include "omptarget.h" #include "OffloadPolicy.h" #include "OpenMP/OMPT/Callback.h" #include "OpenMP/OMPT/Interface.h" #include "PluginManager.h" #include "Shared/Debug.h" #include "Shared/EnvironmentVar.h" #include "Shared/Utils.h" #include "device.h" #include "private.h" #include "rtl.h" #include "Shared/Profile.h" #include "OpenMP/Mapping.h" #include "OpenMP/omp.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/bit.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Object/ObjectFile.h" #include #include #include using llvm::SmallVector; #ifdef OMPT_SUPPORT using namespace llvm::omp::target::ompt; #endif int AsyncInfoTy::synchronize() { int Result = OFFLOAD_SUCCESS; if (!isQueueEmpty()) { switch (SyncType) { case SyncTy::BLOCKING: // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); assert(AsyncInfo.Queue == nullptr && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); break; case SyncTy::NON_BLOCKING: Result = Device.queryAsync(*this); break; } } // Run any pending post-processing function registered on this async object. if (Result == OFFLOAD_SUCCESS && isQueueEmpty()) Result = runPostProcessing(); return Result; } void *&AsyncInfoTy::getVoidPtrLocation() { BufferLocations.push_back(nullptr); return BufferLocations.back(); } bool AsyncInfoTy::isDone() const { return isQueueEmpty(); } int32_t AsyncInfoTy::runPostProcessing() { size_t Size = PostProcessingFunctions.size(); for (size_t I = 0; I < Size; ++I) { const int Result = PostProcessingFunctions[I](); if (Result != OFFLOAD_SUCCESS) return Result; } // Clear the vector up until the last known function, since post-processing // procedures might add new procedures themselves. const auto *PrevBegin = PostProcessingFunctions.begin(); PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size); return OFFLOAD_SUCCESS; } bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; } /* All begin addresses for partially mapped structs must be aligned, up to 16, * in order to ensure proper alignment of members. E.g. * * struct S { * int a; // 4-aligned * int b; // 4-aligned * int *p; // 8-aligned * } s1; * ... * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) * { * s1.b = 5; * for (int i...) s1.p[i] = ...; * } * * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment * requirements for its type. Now, when we allocate memory on the device, in * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. * This means that the chunk of the struct on the device will start at a * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and * address of p will be a misaligned 0x204 (on the host there was no need to add * padding between b and p, so p comes exactly 4 bytes after b). If the device * kernel tries to access s1.p, a misaligned address error occurs (as reported * by the CUDA plugin). By padding the begin address down to a multiple of 8 and * extending the size of the allocated chuck accordingly, the chuck on the * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and * &s1.p=0x208, as they should be to satisfy the alignment requirements. */ static const int64_t MaxAlignment = 16; /// Return the alignment requirement of partially mapped structs, see /// MaxAlignment above. static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) { int LowestOneBit = __builtin_ffsl(reinterpret_cast(HstPtrBase)); uint64_t BaseAlignment = 1 << (LowestOneBit - 1); return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment; } void handleTargetOutcome(bool Success, ident_t *Loc) { switch (OffloadPolicy::get(*PM).Kind) { case OffloadPolicy::DISABLED: if (Success) { FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); } break; case OffloadPolicy::MANDATORY: if (!Success) { if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) { auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor(); for (auto &Device : PM->devices(ExclusiveDevicesAccessor)) dumpTargetPointerMappings(Loc, Device); } else FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html " "for debugging options.\n"); if (!PM->getNumActivePlugins()) { FAILURE_MESSAGE( "No images found compatible with the installed hardware. "); llvm::SmallVector Archs; for (auto &Image : PM->deviceImages()) { const char *Start = reinterpret_cast( Image.getExecutableImage().ImageStart); uint64_t Length = utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd); llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length), /*Identifier=*/""); auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer); if (auto Err = ObjectOrErr.takeError()) { llvm::consumeError(std::move(Err)); continue; } if (auto CPU = (*ObjectOrErr)->tryGetCPUName()) Archs.push_back(*CPU); } fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(), llvm::join(Archs, ",").c_str()); } SourceInfo Info(Loc); if (Info.isAvailible()) fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(), Info.getColumn()); else FAILURE_MESSAGE("Source location information not present. Compile with " "-g or -gline-tables-only.\n"); FATAL_MESSAGE0( 1, "failure of target construct while offloading is mandatory"); } else { if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) { auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor(); for (auto &Device : PM->devices(ExclusiveDevicesAccessor)) dumpTargetPointerMappings(Loc, Device); } } break; } } static int32_t getParentIndex(int64_t Type) { return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name) { DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { DP("Call to %s with non-positive length\n", Name); return NULL; } void *Rc = NULL; if (DeviceNum == omp_get_initial_device()) { Rc = malloc(Size); DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc)); return Rc; } auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); Rc = DeviceOrErr->allocData(Size, nullptr, Kind); DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc)); return Rc; } void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name) { DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum, DPxPTR(DevicePtr)); if (!DevicePtr) { DP("Call to %s with NULL ptr\n", Name); return; } if (DeviceNum == omp_get_initial_device()) { free(DevicePtr); DP("%s deallocated host ptr\n", Name); return; } auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL) FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr. Set " "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations."); DP("omp_target_free deallocated device ptr\n"); } void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, const char *Name) { DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { DP("Call to %s with non-positive length\n", Name); return NULL; } void *RC = NULL; auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); int32_t Err = 0; Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC); if (Err) { DP("Could not lock ptr %p\n", HostPtr); return nullptr; } DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC)); return RC; } void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { DP("Call to %s for device %d unlocking\n", Name, DeviceNum); auto DeviceOrErr = PM->getDevice(DeviceNum); if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr); DP("%s returns\n", Name); } /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, void *ArgMapper, AsyncInfoTy &AsyncInfo, TargetDataFuncPtrTy TargetDataFunction, AttachInfoTy *AttachInfo = nullptr) { DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper)); // The mapper function fills up Components. MapperComponentsTy MapperComponents; MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper); (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType, ArgNames); // Construct new arrays for args_base, args, arg_sizes and arg_types // using the information in MapperComponents and call the corresponding // targetData* function using these new arrays. SmallVector MapperArgsBase(MapperComponents.Components.size()); SmallVector MapperArgs(MapperComponents.Components.size()); SmallVector MapperArgSizes(MapperComponents.Components.size()); SmallVector MapperArgTypes(MapperComponents.Components.size()); SmallVector MapperArgNames(MapperComponents.Components.size()); for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { auto &C = MapperComponents.Components[I]; MapperArgsBase[I] = C.Base; MapperArgs[I] = C.Begin; MapperArgSizes[I] = C.Size; MapperArgTypes[I] = C.Type; MapperArgNames[I] = C.Name; } int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo, AttachInfo, /*FromMapper=*/true); return Rc; } /// Returns a buffer of the requested \p Size, to be used as the source for /// `submitData`. /// /// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's /// getVoidPtrLocation(). /// For larger buffers, creates a dynamic buffer which will be eventually /// deleted by \p AsyncInfo's post-processing callback. static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo, int64_t Size) { constexpr int64_t VoidPtrSize = sizeof(void *); if (Size <= VoidPtrSize) { void *&BufferElement = AsyncInfo.getVoidPtrLocation(); return reinterpret_cast(&BufferElement); } // Create a dynamic buffer for larger data and schedule its deletion. char *DataBuffer = new char[Size]; AsyncInfo.addPostProcessingFunction([DataBuffer]() { delete[] DataBuffer; return OFFLOAD_SUCCESS; }); return DataBuffer; } /// Calculates the target pointee base by applying the host /// pointee begin/base delta to the target pointee begin. /// /// ``` /// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase) /// ``` static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin, void *TgtPteeBegin) { uint64_t Delta = reinterpret_cast(HstPteeBegin) - reinterpret_cast(HstPteeBase); void *TgtPteeBase = reinterpret_cast( reinterpret_cast(TgtPteeBegin) - Delta); DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD "\n", DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); return TgtPteeBase; } /// Utility function to perform a pointer attachment operation. /// /// For something like: /// ```cpp /// int *p; /// ... /// #pragma omp target enter data map(to:p[10:10]) /// ``` /// /// for which the attachment operation gets represented using: /// ``` /// &p, &p[10], sizeof(p), ATTACH /// ``` /// /// (Hst|Tgt)PtrAddr represents &p /// (Hst|Tgt)PteeBase represents &p[0] /// (Hst|Tgt)PteeBegin represents &p[10] /// /// This function first computes the expected TgtPteeBase using: /// `