Diffstat (limited to 'offload')
42 files changed, 1472 insertions, 170 deletions
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index b277380..6e801b1 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -90,18 +90,18 @@ else() set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") if (NOT MSVC) - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++) else() - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) + set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang.exe) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++.exe) endif() # Check for flang if (NOT MSVC) - set(OPENMP_TEST_Fortran_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/flang) + set(OPENMP_TEST_Fortran_COMPILER ${LLVM_TOOLS_BINARY_DIR}/flang) else() - set(OPENMP_TEST_Fortran_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/flang.exe) + set(OPENMP_TEST_Fortran_COMPILER ${LLVM_TOOLS_BINARY_DIR}/flang.exe) endif() # Set fortran test compiler if flang is found diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake index ef8cf34..b564e46 100644 --- a/offload/cmake/OpenMPTesting.cmake +++ b/offload/cmake/OpenMPTesting.cmake @@ -68,9 +68,9 @@ else() message(WARNING "The check targets will not be available!") set(ENABLE_CHECK_TARGETS FALSE) else() - set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/FileCheck) endif() - set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not) + set(OPENMP_NOT_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/not) endif() set(OFFLOAD_DEVICE_INFO_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-offload-device-info) set(OFFLOAD_TBLGEN_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/offload-tblgen) diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h index 8c06ba3..02e097e 100644 --- a/offload/include/OpenMP/InteropAPI.h +++ b/offload/include/OpenMP/InteropAPI.h @@ -160,17 +160,11 @@ struct InteropTableEntry { Interops.push_back(obj); } - template <class ClearFuncTy> void clear(ClearFuncTy f) { - for (auto &Obj : Interops) { - f(Obj); - } - } - /// vector interface int size() const { return Interops.size(); } iterator begin() { return Interops.begin(); } iterator end() { return Interops.end(); } - iterator erase(iterator it) { return Interops.erase(it); } + void clear() { Interops.clear(); } }; struct InteropTblTy diff --git a/offload/include/OpenMP/omp.h b/offload/include/OpenMP/omp.h index 768ca46..8db0a05 100644 --- a/offload/include/OpenMP/omp.h +++ b/offload/include/OpenMP/omp.h @@ -30,6 +30,14 @@ extern "C" { +/// Definitions +///{ + +// See definition in OpenMP (omp.h.var/omp_lib.(F90|h).var) +#define omp_invalid_device -2 + +///} + /// Type declarations ///{ diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h index 45b1961..b547761 100644 --- a/offload/include/PerThreadTable.h +++ b/offload/include/PerThreadTable.h @@ -14,101 +14,256 @@ #define OFFLOAD_PERTHREADTABLE_H #include <list> +#include <llvm/ADT/SmallVector.h> +#include <llvm/Support/Error.h> #include <memory> #include <mutex> +#include <type_traits> + +template <typename ObjectType> class PerThread { + std::mutex Mutex; + llvm::SmallVector<std::shared_ptr<ObjectType>> ThreadDataList; + + ObjectType &getThreadData() { + static thread_local 
std::shared_ptr<ObjectType> ThreadData = nullptr; + if (!ThreadData) { + ThreadData = std::make_shared<ObjectType>(); + std::lock_guard<std::mutex> Lock(Mutex); + ThreadDataList.push_back(ThreadData); + } + return *ThreadData; + } + +public: + // Define default constructors, disable copy and move constructors. + PerThread() = default; + PerThread(const PerThread &) = delete; + PerThread(PerThread &&) = delete; + PerThread &operator=(const PerThread &) = delete; + PerThread &operator=(PerThread &&) = delete; + ~PerThread() { + assert(Mutex.try_lock() && (Mutex.unlock(), true) && + "Cannot be deleted while other threads are adding entries"); + ThreadDataList.clear(); + } + + ObjectType &get() { return getThreadData(); } + + template <class ClearFuncTy> void clear(ClearFuncTy ClearFunc) { + assert(Mutex.try_lock() && (Mutex.unlock(), true) && + "Clear cannot be called while other threads are adding entries"); + for (std::shared_ptr<ObjectType> ThreadData : ThreadDataList) { + if (!ThreadData) + continue; + ClearFunc(*ThreadData); + } + ThreadDataList.clear(); + } +}; + +template <typename ContainerTy> struct ContainerConcepts { + template <typename, template <typename> class, typename = std::void_t<>> + struct has : std::false_type {}; + template <typename Ty, template <typename> class Op> + struct has<Ty, Op, std::void_t<Op<Ty>>> : std::true_type {}; + + template <typename Ty> using IteratorTypeCheck = typename Ty::iterator; + template <typename Ty> using MappedTypeCheck = typename Ty::mapped_type; + template <typename Ty> using ValueTypeCheck = typename Ty::value_type; + template <typename Ty> using KeyTypeCheck = typename Ty::key_type; + template <typename Ty> using SizeTypeCheck = typename Ty::size_type; + + template <typename Ty> + using ClearCheck = decltype(std::declval<Ty>().clear()); + template <typename Ty> + using ReserveCheck = decltype(std::declval<Ty>().reserve(1)); + template <typename Ty> + using ResizeCheck = decltype(std::declval<Ty>().resize(1)); + + static constexpr bool hasIterator = + has<ContainerTy, IteratorTypeCheck>::value; + static constexpr bool hasClear = has<ContainerTy, ClearCheck>::value; + static constexpr bool isAssociative = + has<ContainerTy, MappedTypeCheck>::value; + static constexpr bool hasReserve = has<ContainerTy, ReserveCheck>::value; + static constexpr bool hasResize = has<ContainerTy, ResizeCheck>::value; + + template <typename, template <typename> class, typename = std::void_t<>> + struct has_type { + using type = void; + }; + template <typename Ty, template <typename> class Op> + struct has_type<Ty, Op, std::void_t<Op<Ty>>> { + using type = Op<Ty>; + }; + + using iterator = typename has_type<ContainerTy, IteratorTypeCheck>::type; + using value_type = typename std::conditional_t< + isAssociative, typename has_type<ContainerTy, MappedTypeCheck>::type, + typename has_type<ContainerTy, ValueTypeCheck>::type>; + using key_type = typename std::conditional_t< + isAssociative, typename has_type<ContainerTy, KeyTypeCheck>::type, + typename has_type<ContainerTy, SizeTypeCheck>::type>; +}; // Using an STL container (such as std::vector) indexed by thread ID has // too many race conditions issues so we store each thread entry into a // thread_local variable. -// T is the container type used to store the objects, e.g., std::vector, -// std::set, etc. by each thread. O is the type of the stored objects e.g., -// omp_interop_val_t *, ... 
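// Editorial sketch, not part of the patch: minimal usage of the new
// PerThread<ObjectType> helper defined above. ScratchBuffer, ScratchBuffers
// and releaseScratch are hypothetical names used only for illustration.
struct ScratchBuffer {
  llvm::SmallVector<char, 64> Data;
};

PerThread<ScratchBuffer> ScratchBuffers;

void useScratch(size_t Size) {
  // Each thread lazily creates its own buffer; only that first creation takes
  // the shared mutex, subsequent calls touch thread_local state only.
  ScratchBuffer &Buf = ScratchBuffers.get();
  Buf.Data.resize(Size);
}

void releaseScratch() {
  // Must run only once no other thread can still call get().
  ScratchBuffers.clear([](ScratchBuffer &Buf) { Buf.Data.clear(); });
}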
- -template <typename ContainerType, typename ObjectType> struct PerThreadTable { - using iterator = typename ContainerType::iterator; +// ContainerType is the container type used to store the objects, e.g., +// std::vector, std::set, etc. by each thread. ObjectType is the type of the +// stored objects e.g., omp_interop_val_t *, ... +template <typename ContainerType, typename ObjectType> class PerThreadTable { + using iterator = typename ContainerConcepts<ContainerType>::iterator; struct PerThreadData { - size_t NElements = 0; - std::unique_ptr<ContainerType> ThEntry; + size_t Size = 0; + std::unique_ptr<ContainerType> ThreadEntry; }; - std::mutex Mtx; - std::list<std::shared_ptr<PerThreadData>> ThreadDataList; - - // define default constructors, disable copy and move constructors - PerThreadTable() = default; - PerThreadTable(const PerThreadTable &) = delete; - PerThreadTable(PerThreadTable &&) = delete; - PerThreadTable &operator=(const PerThreadTable &) = delete; - PerThreadTable &operator=(PerThreadTable &&) = delete; - ~PerThreadTable() { - std::lock_guard<std::mutex> Lock(Mtx); - ThreadDataList.clear(); - } + std::mutex Mutex; + llvm::SmallVector<std::shared_ptr<PerThreadData>> ThreadDataList; -private: PerThreadData &getThreadData() { - static thread_local std::shared_ptr<PerThreadData> ThData = nullptr; - if (!ThData) { - ThData = std::make_shared<PerThreadData>(); - std::lock_guard<std::mutex> Lock(Mtx); - ThreadDataList.push_back(ThData); + static thread_local std::shared_ptr<PerThreadData> ThreadData = nullptr; + if (!ThreadData) { + ThreadData = std::make_shared<PerThreadData>(); + std::lock_guard<std::mutex> Lock(Mutex); + ThreadDataList.push_back(ThreadData); } - return *ThData; + return *ThreadData; } protected: ContainerType &getThreadEntry() { - auto &ThData = getThreadData(); - if (ThData.ThEntry) - return *ThData.ThEntry; - ThData.ThEntry = std::make_unique<ContainerType>(); - return *ThData.ThEntry; + PerThreadData &ThreadData = getThreadData(); + if (ThreadData.ThreadEntry) + return *ThreadData.ThreadEntry; + ThreadData.ThreadEntry = std::make_unique<ContainerType>(); + return *ThreadData.ThreadEntry; + } + + size_t &getThreadSize() { + PerThreadData &ThreadData = getThreadData(); + return ThreadData.Size; } - size_t &getThreadNElements() { - auto &ThData = getThreadData(); - return ThData.NElements; + void setSize(size_t Size) { + size_t &SizeRef = getThreadSize(); + SizeRef = Size; } public: + // define default constructors, disable copy and move constructors. 
+ PerThreadTable() = default; + PerThreadTable(const PerThreadTable &) = delete; + PerThreadTable(PerThreadTable &&) = delete; + PerThreadTable &operator=(const PerThreadTable &) = delete; + PerThreadTable &operator=(PerThreadTable &&) = delete; + ~PerThreadTable() { + assert(Mutex.try_lock() && (Mutex.unlock(), true) && + "Cannot be deleted while other threads are adding entries"); + ThreadDataList.clear(); + } + void add(ObjectType obj) { - auto &Entry = getThreadEntry(); - auto &NElements = getThreadNElements(); - NElements++; + ContainerType &Entry = getThreadEntry(); + size_t &SizeRef = getThreadSize(); + SizeRef++; Entry.add(obj); } iterator erase(iterator it) { - auto &Entry = getThreadEntry(); - auto &NElements = getThreadNElements(); - NElements--; + ContainerType &Entry = getThreadEntry(); + size_t &SizeRef = getThreadSize(); + SizeRef--; return Entry.erase(it); } - size_t size() { return getThreadNElements(); } + size_t size() { return getThreadSize(); } // Iterators to traverse objects owned by - // the current thread + // the current thread. iterator begin() { - auto &Entry = getThreadEntry(); + ContainerType &Entry = getThreadEntry(); return Entry.begin(); } iterator end() { - auto &Entry = getThreadEntry(); + ContainerType &Entry = getThreadEntry(); return Entry.end(); } - template <class F> void clear(F f) { - std::lock_guard<std::mutex> Lock(Mtx); - for (auto ThData : ThreadDataList) { - if (!ThData->ThEntry || ThData->NElements == 0) + template <class ClearFuncTy> void clear(ClearFuncTy ClearFunc) { + assert(Mutex.try_lock() && (Mutex.unlock(), true) && + "Clear cannot be called while other threads are adding entries"); + for (std::shared_ptr<PerThreadData> ThreadData : ThreadDataList) { + if (!ThreadData->ThreadEntry || ThreadData->Size == 0) continue; - ThData->ThEntry->clear(f); - ThData->NElements = 0; + if constexpr (ContainerConcepts<ContainerType>::hasIterator && + ContainerConcepts<ContainerType>::hasClear) { + for (auto &Obj : *ThreadData->ThreadEntry) { + if constexpr (ContainerConcepts<ContainerType>::isAssociative) { + ClearFunc(Obj.second); + } else { + ClearFunc(Obj); + } + } + ThreadData->ThreadEntry->clear(); + } else { + static_assert(true, "Container type not supported"); + } + ThreadData->Size = 0; } ThreadDataList.clear(); } + + template <class DeinitFuncTy> llvm::Error deinit(DeinitFuncTy DeinitFunc) { + assert(Mutex.try_lock() && (Mutex.unlock(), true) && + "Deinit cannot be called while other threads are adding entries"); + for (std::shared_ptr<PerThreadData> ThreadData : ThreadDataList) { + if (!ThreadData->ThreadEntry || ThreadData->Size == 0) + continue; + for (auto &Obj : *ThreadData->ThreadEntry) { + if constexpr (ContainerConcepts<ContainerType>::isAssociative) { + if (auto Err = DeinitFunc(Obj.second)) + return Err; + } else { + if (auto Err = DeinitFunc(Obj)) + return Err; + } + } + } + return llvm::Error::success(); + } +}; + +template <typename ContainerType, size_t ReserveSize = 0> +class PerThreadContainer + : public PerThreadTable<ContainerType, typename ContainerConcepts< + ContainerType>::value_type> { + + using IndexType = typename ContainerConcepts<ContainerType>::key_type; + using ObjectType = typename ContainerConcepts<ContainerType>::value_type; + +public: + // Get the object for the given index in the current thread. + ObjectType &get(IndexType Index) { + ContainerType &Entry = this->getThreadEntry(); + + // Specialized code for vector-like containers. 
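    // Editorial aside, not part of the patch: for, e.g., a
    // PerThreadContainer<std::vector<omp_interop_val_t *>, 16>, the branch
    // below grows the calling thread's vector on demand: get(5) on an empty
    // vector reserves 16 slots, resizes to 6 and returns element 5, while
    // associative containers such as std::map skip the resize path and rely
    // on operator[] insertion instead.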
+ if constexpr (ContainerConcepts<ContainerType>::hasResize) { + if (Index >= Entry.size()) { + if constexpr (ContainerConcepts<ContainerType>::hasReserve && + ReserveSize > 0) + Entry.reserve(ReserveSize); + + // If the index is out of bounds, try resize the container. + Entry.resize(Index + 1); + } + } + ObjectType &Ret = Entry[Index]; + this->setSize(Entry.size()); + return Ret; + } }; #endif diff --git a/offload/include/Shared/Debug.h b/offload/include/Shared/Debug.h index 7c3db8d..9e657e6 100644 --- a/offload/include/Shared/Debug.h +++ b/offload/include/Shared/Debug.h @@ -39,9 +39,13 @@ #define OMPTARGET_SHARED_DEBUG_H #include <atomic> +#include <cstdarg> #include <mutex> #include <string> +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/circular_raw_ostream.h" + /// 32-Bit field data attributes controlling information presented to the user. enum OpenMPInfoType : uint32_t { // Print data arguments and attributes upon entering an OpenMP device kernel. @@ -75,17 +79,6 @@ inline std::atomic<uint32_t> &getInfoLevelInternal() { inline uint32_t getInfoLevel() { return getInfoLevelInternal().load(); } -inline uint32_t getDebugLevel() { - static uint32_t DebugLevel = 0; - static std::once_flag Flag{}; - std::call_once(Flag, []() { - if (char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) - DebugLevel = std::stoi(EnvStr); - }); - - return DebugLevel; -} - #undef USED #undef GCC_VERSION @@ -144,46 +137,11 @@ inline uint32_t getDebugLevel() { fprintf(_stdDst, __VA_ARGS__); \ } while (0) -// Debugging messages -#ifdef OMPTARGET_DEBUG -#include <stdio.h> - -#define DEBUGP(prefix, ...) \ - { \ - fprintf(stderr, "%s --> ", prefix); \ - fprintf(stderr, __VA_ARGS__); \ - } - -/// Emit a message for debugging -#define DP(...) \ - do { \ - if (getDebugLevel() > 0) { \ - DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \ - } \ - } while (false) - -/// Emit a message for debugging or failure if debugging is disabled -#define REPORT(...) \ - do { \ - if (getDebugLevel() > 0) { \ - DP(__VA_ARGS__); \ - } else { \ - FAILURE_MESSAGE(__VA_ARGS__); \ - } \ - } while (false) -#else -#define DEBUGP(prefix, ...) \ - {} -#define DP(...) \ - {} -#define REPORT(...) FAILURE_MESSAGE(__VA_ARGS__); -#endif // OMPTARGET_DEBUG - /// Emit a message giving the user extra information about the runtime if #define INFO(_flags, _id, ...) \ do { \ - if (getDebugLevel() > 0) { \ - DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \ + if (::llvm::offload::debug::isDebugEnabled()) { \ + DP(__VA_ARGS__); \ } else if (getInfoLevel() & _flags) { \ INFO_MESSAGE(_id, __VA_ARGS__); \ } \ @@ -198,4 +156,388 @@ inline uint32_t getDebugLevel() { } \ } while (false) +namespace llvm::offload::debug { + +/// A raw_ostream that tracks `\n` and print the prefix after each +/// newline. Based on raw_ldbg_ostream from Support/DebugLog.h +class LLVM_ABI odbg_ostream final : public raw_ostream { +public: + enum IfLevel : uint32_t; + enum OnlyLevel : uint32_t; + +private: + std::string Prefix; + raw_ostream &Os; + uint32_t BaseLevel; + bool ShouldPrefixNextString; + bool ShouldEmitNewLineOnDestruction; + bool NeedEndNewLine = false; + + /// If the stream is muted, writes to it are ignored + bool Muted = false; + + /// Split the line on newlines and insert the prefix before each + /// newline. Forward everything to the underlying stream. 
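  // Editorial example, not part of the patch: with Prefix "omptarget --> ",
  // streaming "first\nsecond" prints
  //   omptarget --> first
  //   omptarget --> second
  // and the destructor appends the final newline, so callers never have to
  // terminate their messages themselves.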
+ void write_impl(const char *Ptr, size_t Size) final { + if (Muted) + return; + + NeedEndNewLine = false; + auto Str = StringRef(Ptr, Size); + auto Eol = Str.find('\n'); + // Handle `\n` occurring in the string, ensure to print the prefix at the + // beginning of each line. + while (Eol != StringRef::npos) { + // Take the line up to the newline (including the newline). + StringRef Line = Str.take_front(Eol + 1); + if (!Line.empty()) + writeWithPrefix(Line); + // We printed a newline, record here to print a prefix. + ShouldPrefixNextString = true; + Str = Str.drop_front(Eol + 1); + Eol = Str.find('\n'); + } + if (!Str.empty()) { + writeWithPrefix(Str); + NeedEndNewLine = true; + } + } + void emitPrefix() { Os.write(Prefix.c_str(), Prefix.size()); } + void writeWithPrefix(StringRef Str) { + if (ShouldPrefixNextString) { + emitPrefix(); + ShouldPrefixNextString = false; + } + Os.write(Str.data(), Str.size()); + } + +public: + explicit odbg_ostream(std::string Prefix, raw_ostream &Os, uint32_t BaseLevel, + bool ShouldPrefixNextString = true, + bool ShouldEmitNewLineOnDestruction = true) + : Prefix(std::move(Prefix)), Os(Os), BaseLevel(BaseLevel), + ShouldPrefixNextString(ShouldPrefixNextString), + ShouldEmitNewLineOnDestruction(ShouldEmitNewLineOnDestruction) { + SetUnbuffered(); + } + ~odbg_ostream() final { + if (ShouldEmitNewLineOnDestruction && NeedEndNewLine) + Os << '\n'; + } + odbg_ostream(const odbg_ostream &) = delete; + odbg_ostream &operator=(const odbg_ostream &) = delete; + odbg_ostream(odbg_ostream &&other) : Os(other.Os) { + Prefix = std::move(other.Prefix); + BaseLevel = other.BaseLevel; + ShouldPrefixNextString = other.ShouldPrefixNextString; + ShouldEmitNewLineOnDestruction = other.ShouldEmitNewLineOnDestruction; + NeedEndNewLine = other.NeedEndNewLine; + Muted = other.Muted; + } + + /// Forward the current_pos method to the underlying stream. + uint64_t current_pos() const final { return Os.tell(); } + + /// Some of the `<<` operators expect an lvalue, so we trick the type + /// system. + odbg_ostream &asLvalue() { return *this; } + + void shouldMute(const IfLevel Filter) { Muted = Filter > BaseLevel; } + void shouldMute(const OnlyLevel Filter) { Muted = BaseLevel != Filter; } +}; + +/// dbgs - Return a circular-buffered debug stream. +[[maybe_unused]] static llvm::raw_ostream &dbgs() { + // Do one-time initialization in a thread-safe way. + static struct dbgstream { + llvm::circular_raw_ostream strm; + + dbgstream() : strm(llvm::errs(), "*** Debug Log Output ***\n", 0) {} + } thestrm; + + return thestrm.strm; +} + +#ifdef OMPTARGET_DEBUG + +struct DebugFilter { + StringRef Type; + uint32_t Level; +}; + +struct DebugSettings { + bool Enabled = false; + uint32_t DefaultLevel = 1; + llvm::SmallVector<DebugFilter> Filters; +}; + +[[maybe_unused]] static DebugFilter parseDebugFilter(StringRef Filter) { + size_t Pos = Filter.find(':'); + if (Pos == StringRef::npos) + return {Filter, 1}; + + StringRef Type = Filter.slice(0, Pos); + uint32_t Level = 1; + if (Filter.slice(Pos + 1, Filter.size()).getAsInteger(10, Level)) + Level = 1; + + return {Type, Level}; +} + +[[maybe_unused]] static DebugSettings &getDebugSettings() { + static DebugSettings Settings; + static std::once_flag Flag{}; + std::call_once(Flag, []() { + // Eventually, we probably should allow the upper layers to set + // debug settings directly according to their own env var or + // other methods. + // For now, mantain compatibility with existing libomptarget env var + // and add a liboffload independent one. 
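    // Editorial examples, not part of the patch, of values accepted by the
    // parsing below:
    //   LIBOMPTARGET_DEBUG=1                  all debug types at level 1
    //   LIBOMPTARGET_DEBUG=3                  all debug types up to level 3
    //   LIBOFFLOAD_DEBUG=Mapping:2,Kernel     "Mapping" up to level 2 plus
    //                                         "Kernel" at level 1
    //   LIBOMPTARGET_DEBUG=0                  debugging stays disabled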
+ char *Env = getenv("LIBOMPTARGET_DEBUG"); + if (!Env) { + Env = getenv("LIBOFFLOAD_DEBUG"); + if (!Env) + return; + } + + StringRef EnvRef(Env); + if (EnvRef == "0") + return; + + Settings.Enabled = true; + if (EnvRef.equals_insensitive("all")) + return; + + if (!EnvRef.getAsInteger(10, Settings.DefaultLevel)) + return; + + Settings.DefaultLevel = 1; + + for (auto &FilterSpec : llvm::split(EnvRef, ',')) { + if (FilterSpec.empty()) + continue; + Settings.Filters.push_back(parseDebugFilter(FilterSpec)); + } + }); + + return Settings; +} + +inline bool isDebugEnabled() { return getDebugSettings().Enabled; } + +[[maybe_unused]] static bool +shouldPrintDebug(const char *Component, const char *Type, uint32_t &Level) { + const auto &Settings = getDebugSettings(); + if (!Settings.Enabled) + return false; + + if (Settings.Filters.empty()) { + if (Level <= Settings.DefaultLevel) { + Level = Settings.DefaultLevel; + return true; + } + return false; + } + + for (const auto &DT : Settings.Filters) { + if (DT.Level < Level) + continue; + if (DT.Type.equals_insensitive(Type) || + DT.Type.equals_insensitive(Component)) { + Level = DT.Level; + return true; + } + } + + return false; +} + +/// Compute the prefix for the debug log in the form of: +/// "Component --> " +[[maybe_unused]] static std::string computePrefix(StringRef Component, + StringRef DebugType) { + std::string Prefix; + raw_string_ostream OsPrefix(Prefix); + OsPrefix << Component << " --> "; + return OsPrefix.str(); +} + +static inline raw_ostream &operator<<(raw_ostream &Os, + const odbg_ostream::IfLevel Filter) { + odbg_ostream &Dbg = static_cast<odbg_ostream &>(Os); + Dbg.shouldMute(Filter); + return Dbg; +} + +static inline raw_ostream &operator<<(raw_ostream &Os, + const odbg_ostream::OnlyLevel Filter) { + odbg_ostream &Dbg = static_cast<odbg_ostream &>(Os); + Dbg.shouldMute(Filter); + return Dbg; +} + +#define ODBG_BASE(Stream, Component, Prefix, Type, Level) \ + for (uint32_t RealLevel = (Level), \ + _c = llvm::offload::debug::isDebugEnabled() && \ + llvm::offload::debug::shouldPrintDebug( \ + (Component), (Type), RealLevel); \ + _c; _c = 0) \ + ::llvm::offload::debug::odbg_ostream{ \ + ::llvm::offload::debug::computePrefix((Prefix), (Type)), (Stream), \ + RealLevel, /*ShouldPrefixNextString=*/true, \ + /*ShouldEmitNewLineOnDestruction=*/true} \ + .asLvalue() + +#define ODBG_STREAM(Stream, Type, Level) \ + ODBG_BASE(Stream, GETNAME(TARGET_NAME), DEBUG_PREFIX, Type, Level) + +#define ODBG_0() ODBG_2("default", 1) +#define ODBG_1(Type) ODBG_2(Type, 1) +#define ODBG_2(Type, Level) \ + ODBG_STREAM(llvm::offload::debug::dbgs(), Type, Level) +#define ODBG_SELECT(Type, Level, NArgs, ...) ODBG_##NArgs + +// Print a debug message of a certain type and verbosity level. If no type +// or level is provided, "default" and "1" are assumed respectively. +// Usage examples: +// ODBG("type1", 2) << "This is a level 2 message of type1"; +// ODBG("Init") << "This is a default level of the init type"; +// ODBG() << "This is a level 1 message of the default type"; +// ODBG("Init", 3) << NumDevices << " were initialized"; +// ODBG("Kernel") << "Launching " << KernelName << " on device " << DeviceId; +#define ODBG(...) ODBG_SELECT(__VA_ARGS__ __VA_OPT__(, ) 2, 1, 0)(__VA_ARGS__) + +// Filter the next elements in the debug stream if the current debug level is +// lower than specified level. 
Example: +// ODBG("Mapping", 2) << "level 2 info " +// << ODBG_IF_LEVEL(3) << " level 3 info" << Arg +// << ODBG_IF_LEVEL(4) << " level 4 info" << &Arg +// << ODBG_RESET_LEVEL() << " more level 2 info"; +#define ODBG_IF_LEVEL(Level) \ + static_cast<llvm::offload::debug::odbg_ostream::IfLevel>(Level) + +// Filter the next elements in the debug stream if the current debug level is +// not exactly the specified level. Example: +// ODBG() << "Starting computation " +// << ODBG_ONLY_LEVEL(1) << "on a device" +// << ODBG_ONLY_LEVEL(2) << "and mapping data on device" << DeviceId; +// << ODBG_ONLY_LEVEL(3) << dumpDetailedMappingInfo(DeviceId); +#define ODBG_ONLY_LEVEL(Level) \ + static_cast<llvm::offload::debug::odbg_ostream::OnlyLevel>(Level) + +// Reset the level back to the original level after ODBG_IF_LEVEL or +// ODBG_ONLY_LEVEL have been used +#define ODBG_RESET_LEVEL() \ + static_cast<llvm::offload::debug::odbg_ostream::IfLevel>(0) + +#else + +inline bool isDebugEnabled() { return false; } + +#define ODBG_NULL \ + for (bool _c = false; _c; _c = false) \ + ::llvm::nulls() + +// Don't print anything if debugging is disabled +#define ODBG_BASE(Stream, Component, Prefix, Type, Level) ODBG_NULL +#define ODBG_STREAM(Stream, Type, Level) ODBG_NULL +#define ODBG_IF_LEVEL(Level) 0 +#define ODBG_ONLY_LEVEL(Level) 0 +#define ODBG_RESET_LEVEL() 0 +#define ODBG(...) ODBG_NULL + +#endif + +} // namespace llvm::offload::debug + +namespace llvm::omp::target::debug { +using namespace llvm::offload::debug; + +enum OmpDebugLevel : uint32_t { + ODL_Default = 1, + ODL_Error = ODL_Default, + ODL_Detailed = 2, + ODL_Verbose = 3, + ODL_VeryVerbose = 4, + ODL_Dumping = 5 +}; + +/* Debug types to use in libomptarget */ +constexpr const char *ODT_Init = "Init"; +constexpr const char *ODT_Mapping = "Mapping"; +constexpr const char *ODT_Kernel = "Kernel"; +constexpr const char *ODT_DataTransfer = "DataTransfer"; +constexpr const char *ODT_Sync = "Sync"; +constexpr const char *ODT_Deinit = "Deinit"; +constexpr const char *ODT_Error = "Error"; +constexpr const char *ODT_KernelArgs = "KernelArgs"; +constexpr const char *ODT_MappingExists = "MappingExists"; +constexpr const char *ODT_DumpTable = "DumpTable"; +constexpr const char *ODT_MappingChanged = "MappingChanged"; +constexpr const char *ODT_PluginKernel = "PluginKernel"; +constexpr const char *ODT_EmptyMapping = "EmptyMapping"; + +static inline odbg_ostream reportErrorStream() { +#ifdef OMPTARGET_DEBUG + if (::llvm::offload::debug::isDebugEnabled()) { + uint32_t RealLevel = ODL_Error; + if (::llvm::offload::debug::shouldPrintDebug(GETNAME(TARGET_NAME), + (ODT_Error), RealLevel)) + return odbg_ostream{ + ::llvm::offload::debug::computePrefix(DEBUG_PREFIX, ODT_Error), + ::llvm::offload::debug::dbgs(), RealLevel}; + else + return odbg_ostream{"", ::llvm::nulls(), 1}; + } +#endif + return odbg_ostream{GETNAME(TARGET_NAME) " error: ", + ::llvm::offload::debug::dbgs(), ODL_Error}; +} + +#ifdef OMPTARGET_DEBUG +// Deprecated debug print macros +[[maybe_unused]] static std::string formatToStr(const char *format, ...) { + va_list args; + va_start(args, format); + size_t len = std::vsnprintf(NULL, 0, format, args); + va_end(args); + llvm::SmallVector<char, 128> vec(len + 1); + va_start(args, format); + std::vsnprintf(&vec[0], len + 1, format, args); + va_end(args); + return &vec[0]; +} + +// helper macro to support old DP and REPORT macros with printf syntax +#define FORMAT_TO_STR(Format, ...) 
\ + ::llvm::omp::target::debug::formatToStr(Format __VA_OPT__(, ) __VA_ARGS__) + +#define DP(...) ODBG() << FORMAT_TO_STR(__VA_ARGS__); + +#define REPORT_INT_OLD(...) \ + do { \ + if (::llvm::offload::debug::isDebugEnabled()) { \ + ODBG(::llvm::omp::target::debug::ODT_Error, \ + ::llvm::omp::target::debug::ODL_Error) \ + << FORMAT_TO_STR(__VA_ARGS__); \ + } else { \ + FAILURE_MESSAGE(__VA_ARGS__); \ + } \ + } while (false) + +#else +#define DP(...) \ + { \ + } +#define REPORT_INT_OLD(...) FAILURE_MESSAGE(__VA_ARGS__); +#endif // OMPTARGET_DEBUG + +// This is used for the new style REPORT macro +#define REPORT_INT() ::llvm::omp::target::debug::reportErrorStream() + +// Make REPORT compatible with old and new syntax +#define REPORT(...) REPORT_INT##__VA_OPT__(_OLD)(__VA_ARGS__) + +} // namespace llvm::omp::target::debug + #endif // OMPTARGET_SHARED_DEBUG_H diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 89aa468..0305899 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -94,6 +94,8 @@ enum OpenMPOffloadingDeclareTargetFlags { OMP_DECLARE_TARGET_INDIRECT = 0x08, /// This is an entry corresponding to a requirement to be registered. OMP_REGISTER_REQUIRES = 0x10, + /// Mark the entry global as being an indirect vtable. + OMP_DECLARE_TARGET_INDIRECT_VTABLE = 0x20, }; enum TargetAllocTy : int32_t { @@ -103,10 +105,6 @@ enum TargetAllocTy : int32_t { TARGET_ALLOC_DEFAULT, }; -inline KernelArgsTy CTorDTorKernelArgs = { - 1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, 0, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0}; - struct DeviceTy; /// The libomptarget wrapper around a __tgt_async_info object directly @@ -274,6 +272,8 @@ extern "C" { void ompx_dump_mapping_tables(void); int omp_get_num_devices(void); int omp_get_device_num(void); +int omp_get_device_from_uid(const char *DeviceUid); +const char *omp_get_uid_from_device(int DeviceNum); int omp_get_initial_device(void); void *omp_target_alloc(size_t Size, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum); diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index e9c1548..6ada191 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -43,6 +43,7 @@ def ol_device_info_t : Enum { TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">, TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, + TaggedEtor<"WORK_GROUP_LOCAL_MEM_SIZE", "uint64_t", "The maximum size of local shared memory per work group in bytes">, ]; list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">); list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 84bc414..eab9627 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -495,6 +495,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast<uint32_t>(Value)); } + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: { + if 
(!std::holds_alternative<uint64_t>(Entry->Value)) + return makeError(ErrorCode::BACKEND_FAILURE, + "plugin returned incorrect type"); + return Info.write(std::get<uint64_t>(Entry->Value)); + } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples @@ -590,6 +597,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits); case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: return Info.write<uint64_t>(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, diff --git a/offload/libomptarget/OffloadRTL.cpp b/offload/libomptarget/OffloadRTL.cpp index 04bd21e..3a18d76 100644 --- a/offload/libomptarget/OffloadRTL.cpp +++ b/offload/libomptarget/OffloadRTL.cpp @@ -19,6 +19,7 @@ #ifdef OMPT_SUPPORT extern void llvm::omp::target::ompt::connectLibrary(); #endif +using namespace llvm::omp::target::debug; static std::mutex PluginMtx; static uint32_t RefCount = 0; @@ -35,7 +36,7 @@ void initRuntime() { RefCount++; if (RefCount == 1) { - DP("Init offload library!\n"); + ODBG(ODT_Init) << "Init offload library!"; #ifdef OMPT_SUPPORT // Initialize OMPT first llvm::omp::target::ompt::connectLibrary(); @@ -54,12 +55,12 @@ void deinitRuntime() { assert(PM && "Runtime not initialized"); if (RefCount == 1) { - DP("Deinit offload library!\n"); + ODBG(ODT_Deinit) << "Deinit offload library!"; // RTL deinitialization has started RTLAlive = false; while (RTLOngoingSyncs > 0) { - DP("Waiting for ongoing syncs to finish, count: %d\n", - RTLOngoingSyncs.load()); + ODBG(ODT_Sync) << "Waiting for ongoing syncs to finish, count:" + << RTLOngoingSyncs.load(); std::this_thread::sleep_for(std::chrono::milliseconds(100)); } PM->deinit(); diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp index dd83a3c..6e85e57 100644 --- a/offload/libomptarget/OpenMP/API.cpp +++ b/offload/libomptarget/OpenMP/API.cpp @@ -40,6 +40,8 @@ EXTERN void ompx_dump_mapping_tables() { using namespace llvm::omp::target::ompt; #endif +using GenericDeviceTy = llvm::omp::target::plugin::GenericDeviceTy; + void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name); void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, @@ -68,6 +70,62 @@ EXTERN int omp_get_device_num(void) { return HostDevice; } +static inline bool is_initial_device_uid(const char *DeviceUid) { + return strcmp(DeviceUid, GenericPluginTy::getHostDeviceUid()) == 0; +} + +EXTERN int omp_get_device_from_uid(const char *DeviceUid) { + TIMESCOPE(); + OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + + if (!DeviceUid) { + DP("Call to omp_get_device_from_uid returning omp_invalid_device\n"); + return omp_invalid_device; + } + if (is_initial_device_uid(DeviceUid)) { + DP("Call to omp_get_device_from_uid returning initial device number %d\n", + omp_get_initial_device()); + return omp_get_initial_device(); + } + + int DeviceNum = omp_invalid_device; + + auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor(); + for (const DeviceTy &Device : PM->devices(ExclusiveDevicesAccessor)) { + const char *Uid = Device.RTL->getDevice(Device.RTLDeviceID).getDeviceUid(); + if (Uid && strcmp(DeviceUid, Uid) == 0) { + DeviceNum = Device.DeviceID; + break; + } + } + + DP("Call to omp_get_device_from_uid returning %d\n", DeviceNum); + return DeviceNum; +} + +EXTERN const char 
*omp_get_uid_from_device(int DeviceNum) { + TIMESCOPE(); + OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + + if (DeviceNum == omp_invalid_device) { + DP("Call to omp_get_uid_from_device returning nullptr\n"); + return nullptr; + } + if (DeviceNum == omp_get_initial_device()) { + DP("Call to omp_get_uid_from_device returning initial device UID\n"); + return GenericPluginTy::getHostDeviceUid(); + } + + auto DeviceOrErr = PM->getDevice(DeviceNum); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); + + const char *Uid = + DeviceOrErr->RTL->getDevice(DeviceOrErr->RTLDeviceID).getDeviceUid(); + DP("Call to omp_get_uid_from_device returning %s\n", Uid); + return Uid; +} + EXTERN int omp_get_initial_device(void) { TIMESCOPE(); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index c8d6b42..cd6d037 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -36,7 +36,7 @@ void PluginManager::init() { return; } - DP("Loading RTLs...\n"); + ODBG("Init") << "Loading RTLs"; // Attempt to create an instance of each supported plugin. #define PLUGIN_TARGET(Name) \ @@ -437,20 +437,22 @@ static int loadImagesOntoDevice(DeviceTy &Device) { llvm::offloading::EntryTy DeviceEntry = Entry; if (Entry.Size) { - if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, - &DeviceEntry.Address) != OFFLOAD_SUCCESS) - REPORT("Failed to load symbol %s\n", Entry.SymbolName); + if (!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE)) + if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, + &DeviceEntry.Address) != OFFLOAD_SUCCESS) + REPORT("Failed to load symbol %s\n", Entry.SymbolName); // If unified memory is active, the corresponding global is a device // reference to the host global. We need to initialize the pointer on // the device to point to the memory on the host. - if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) || - (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) { + if (!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE) && + !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT) && + ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) || + (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY))) if (Device.RTL->data_submit(DeviceId, DeviceEntry.Address, Entry.Address, Entry.Size) != OFFLOAD_SUCCESS) REPORT("Failed to write symbol for USM %s\n", Entry.SymbolName); - } } else if (Entry.Address) { if (Device.RTL->get_function(Binary, Entry.SymbolName, &DeviceEntry.Address) != OFFLOAD_SUCCESS) diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index ee36fbe..659ef68 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -38,6 +38,7 @@ using namespace llvm::omp::target::ompt; #endif using namespace llvm::omp::target::plugin; +using namespace llvm::omp::target::debug; int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const { @@ -48,7 +49,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, void *Event = getEvent(); bool NeedNewEvent = Event == nullptr; if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) { - REPORT("Failed to create event\n"); + REPORT() << "Failed to create event"; return OFFLOAD_FAIL; } @@ -56,7 +57,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, // know if the target support event. 
But if a target doesn't, // recordEvent should always return success. if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { - REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Event)); + REPORT() << "Failed to set dependence on event " << Event; return OFFLOAD_FAIL; } @@ -112,21 +113,58 @@ setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image, llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; for (const auto &Entry : Entries) { if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP || - Entry.Size == 0 || !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT)) + Entry.Size == 0 || + (!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT) && + !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE))) continue; - assert(Entry.Size == sizeof(void *) && "Global not a function pointer?"); - auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); - - void *Ptr; - if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr)) - return error::createOffloadError(error::ErrorCode::INVALID_BINARY, - "failed to load %s", Entry.SymbolName); - - HstPtr = Entry.Address; - if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo)) - return error::createOffloadError(error::ErrorCode::INVALID_BINARY, - "failed to load %s", Entry.SymbolName); + size_t PtrSize = sizeof(void *); + if (Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE) { + // This is a VTable entry, the current entry is the first index of the + // VTable and Entry.Size is the total size of the VTable. Unlike the + // indirect function case below, the Global is not of size Entry.Size and + // is instead of size PtrSize (sizeof(void*)). + void *Vtable; + void *res; + if (Device.RTL->get_global(Binary, PtrSize, Entry.SymbolName, &Vtable)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + + // HstPtr = Entry.Address; + if (Device.retrieveData(&res, Vtable, PtrSize, AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + if (Device.synchronize(AsyncInfo)) + return error::createOffloadError( + error::ErrorCode::INVALID_BINARY, + "failed to synchronize after retrieving %s", Entry.SymbolName); + // Calculate and emplace entire Vtable from first Vtable byte + for (uint64_t i = 0; i < Entry.Size / PtrSize; ++i) { + auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); + HstPtr = reinterpret_cast<void *>( + reinterpret_cast<uintptr_t>(Entry.Address) + i * PtrSize); + DevPtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(res) + + i * PtrSize); + } + } else { + // Indirect function case: Entry.Size should equal PtrSize since we're + // dealing with a single function pointer (not a VTable) + assert(Entry.Size == PtrSize && "Global not a function pointer?"); + auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); + void *Ptr; + if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + + HstPtr = Entry.Address; + if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo)) + return error::createOffloadError(error::ErrorCode::INVALID_BINARY, + "failed to load %s", Entry.SymbolName); + } + if (Device.synchronize(AsyncInfo)) + return error::createOffloadError( + error::ErrorCode::INVALID_BINARY, + "failed to synchronize after retrieving %s", Entry.SymbolName); } // If we do not have any indirect globals we exit early. 
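// Editorial note, not part of the patch: with the vtable flag set, a single
// entry describes the whole table. If the host vtable for some class starts
// at 0x1000, its device copy at 0x7000, and Entry.Size is 24 bytes (three
// slots on a 64-bit target), the loop above records the pairs
//   (0x1000, 0x7000), (0x1008, 0x7008), (0x1010, 0x7010)
// so that __llvm_omp_indirect_call_lookup can translate the address of any
// individual slot, not just the start of the table.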
@@ -278,21 +316,21 @@ int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) { } int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { - DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", - DPxPTR(HstPtr), Size); + ODBG(ODT_Mapping) << "Notifying about new mapping: HstPtr=" << HstPtr + << ", Size=" << Size; if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) { - REPORT("Notifying about data mapping failed.\n"); + REPORT() << "Notifying about data mapping failed."; return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) { - DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr)); + ODBG(ODT_Mapping) << "Notifying about an unmapping: HstPtr=" << HstPtr; if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) { - REPORT("Notifying about data unmapping failed.\n"); + REPORT() << "Notifying about data unmapping failed."; return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 910a5b6..2ebc23e 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -40,6 +40,8 @@ VERS1.0 { omp_get_mapped_ptr; omp_get_num_devices; omp_get_device_num; + omp_get_device_from_uid; + omp_get_uid_from_device; omp_get_initial_device; omp_target_alloc; omp_target_free; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 04b3944..5196509 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2186,6 +2186,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = checkIfAPU()) return Err; + // Retrieve the size of the group memory. + for (const auto *Pool : AllMemoryPools) { + if (Pool->isGroup()) { + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, + MaxBlockSharedMemSize)) + return Err; + break; + } + } + return Plugin::success(); } @@ -2923,6 +2933,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Cacheline Size", TmpUInt); + Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) Info.add("Max Clock Freq", TmpUInt, "MHz", diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 2135e06..1d52c96 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -794,6 +794,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get the unique identifier of the device. const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Get the total shared memory per block (in bytes) that can be used in any + /// kernel. + size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -1251,6 +1255,9 @@ protected: /// Internal representation for OMPT device (initialize & finalize) std::atomic<bool> OmptInitialized; #endif + + /// The total per-block native shared memory that a kernel may use. + size_t MaxBlockSharedMemSize = 0; }; /// Class implementing common functionalities of offload plugins. 
Each plugin diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp index e19f2ef..8d67047 100644 --- a/offload/plugins-nextgen/common/src/RPC.cpp +++ b/offload/plugins-nextgen/common/src/RPC.cpp @@ -83,7 +83,8 @@ static rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device, return rpc::RPC_ERROR; } -static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) { +static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer, + bool &ClientInUse) { uint64_t NumPorts = std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT); rpc::Server Server(NumPorts, Buffer); @@ -92,6 +93,7 @@ static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) { if (!Port) return rpc::RPC_SUCCESS; + ClientInUse = true; rpc::Status Status = handleOffloadOpcodes(Device, *Port, Device.getWarpSize()); @@ -99,7 +101,6 @@ static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) { if (Status == rpc::RPC_UNHANDLED_OPCODE) Status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*Port, Device.getWarpSize()); - Port->close(); return Status; @@ -122,7 +123,11 @@ void RPCServerTy::ServerThread::shutDown() { } void RPCServerTy::ServerThread::run() { + static constexpr auto IdleTime = std::chrono::microseconds(25); + static constexpr auto IdleSleep = std::chrono::microseconds(250); std::unique_lock<decltype(Mutex)> Lock(Mutex); + + auto LastUse = std::chrono::steady_clock::now(); for (;;) { CV.wait(Lock, [&]() { return NumUsers.load(std::memory_order_acquire) > 0 || @@ -133,15 +138,25 @@ void RPCServerTy::ServerThread::run() { return; Lock.unlock(); + bool ClientInUse = false; while (NumUsers.load(std::memory_order_relaxed) > 0 && Running.load(std::memory_order_relaxed)) { + + // Suspend this thread briefly if there is no current work. + auto Now = std::chrono::steady_clock::now(); + if (!ClientInUse && Now - LastUse >= IdleTime) + std::this_thread::sleep_for(IdleSleep); + else if (ClientInUse) + LastUse = Now; + + ClientInUse = false; std::lock_guard<decltype(Mutex)> Lock(BufferMutex); for (const auto &[Buffer, Device] : llvm::zip_equal(Buffers, Devices)) { if (!Buffer || !Device) continue; // If running the server failed, print a message but keep running. 
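        // Editorial note, not part of the patch: runServer now reports via
        // ClientInUse whether any RPC port was opened during a sweep. While
        // clients are active the thread keeps polling at full speed; once it
        // has been idle for IdleTime (25us) it sleeps IdleSleep (250us)
        // between sweeps, trading a little RPC latency for far less host CPU
        // time when no kernel is using RPC.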
- if (runServer(*Device, Buffer) != rpc::RPC_SUCCESS) + if (runServer(*Device, Buffer, ClientInUse) != rpc::RPC_SUCCESS) FAILURE_MESSAGE("Unhandled or invalid RPC opcode!"); } } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 45e580e7..a27c6f3 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy { return Err; HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + uint32_t MaxSharedMem; + if (auto Err = getDeviceAttr( + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem)) + return Err; + MaxBlockSharedMemSize = MaxSharedMem; + return Plugin::success(); } @@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) Info.add("Total Constant Memory", TmpInt, "bytes"); - Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - TmpInt); - if (Res == CUDA_SUCCESS) - Info.add("Max Shared Memory per Block", TmpInt, "bytes"); + Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); if (Res == CUDA_SUCCESS) diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 48de1fef..8fa1c3e 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -443,6 +443,8 @@ struct GenELF64PluginTy final : public GenericPluginTy { if (auto Err = Plugin::check(ffi_init(), "failed to initialize libffi")) return std::move(Err); #endif + ODBG("Init") << "GenELF64 plugin detected " << ODBG_IF_LEVEL(2) + << NUM_DEVICES << " " << ODBG_RESET_LEVEL() << "devices"; return NUM_DEVICES; } diff --git a/offload/test/api/omp_device_uid.c b/offload/test/api/omp_device_uid.c new file mode 100644 index 0000000..2a41d8d --- /dev/null +++ b/offload/test/api/omp_device_uid.c @@ -0,0 +1,76 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +#include <omp.h> +#include <stdio.h> +#include <string.h> + +int test_omp_device_uid(int device_num) { + const char *device_uid = omp_get_uid_from_device(device_num); + if (device_uid == NULL) { + printf("FAIL for device %d: omp_get_uid_from_device returned NULL\n", + device_num); + return 0; + } + + int device_num_from_uid = omp_get_device_from_uid(device_uid); + if (device_num_from_uid != device_num) { + printf( + "FAIL for device %d: omp_get_device_from_uid returned %d (UID: %s)\n", + device_num, device_num_from_uid, device_uid); + return 0; + } + + if (device_num == omp_get_initial_device()) + return 1; + + int success = 1; + +// Note that the following code may be executed on the host if the host is the +// device +#pragma omp target map(tofrom : success) device(device_num) + { + int device_num = omp_get_device_num(); + + // omp_get_uid_from_device() in the device runtime is a dummy function + // returning NULL + const char *device_uid = omp_get_uid_from_device(device_num); + + // omp_get_device_from_uid() in the device runtime is a dummy function + // returning omp_invalid_device. + int device_num_from_uid = omp_get_device_from_uid(device_uid); + + // Depending on whether we're executing on the device or the host, we either + // got NULL as the device UID or the correct device UID. Consequently, + // omp_get_device_from_uid() either returned omp_invalid_device or the + // correct device number (aka omp_get_initial_device()). 
+ if (device_uid ? device_num_from_uid != device_num + : device_num_from_uid != omp_invalid_device) { + printf("FAIL for device %d (target): omp_get_device_from_uid returned %d " + "(UID: %s)\n", + device_num, device_num_from_uid, device_uid); + success = 0; + } + } + + return success; +} + +int main() { + int num_devices = omp_get_num_devices(); + int num_failed = 0; + // (also test initial device aka num_devices) + for (int i = 0; i < num_devices + 1; i++) { + if (!test_omp_device_uid(i)) { + printf("FAIL for device %d\n", i); + num_failed++; + } + } + if (num_failed) { + printf("FAIL\n"); + return 1; + } + printf("PASS\n"); + return 0; +} + +// CHECK: PASS diff --git a/offload/test/api/omp_indirect_call_table_manual.c b/offload/test/api/omp_indirect_call_table_manual.c new file mode 100644 index 0000000..e958d47 --- /dev/null +++ b/offload/test/api/omp_indirect_call_table_manual.c @@ -0,0 +1,107 @@ +// RUN: %libomptarget-compile-run-and-check-generic +#include <assert.h> +#include <omp.h> +#include <stdio.h> + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +typedef struct { + uint64_t Reserved; + uint16_t Version; + uint16_t Kind; // OpenMP==1 + uint32_t Flags; + void *Address; + char *SymbolName; + uint64_t Size; + uint64_t Data; + void *AuxAddr; +} __tgt_offload_entry; + +enum OpenMPOffloadingDeclareTargetFlags { + /// Mark the entry global as having a 'link' attribute. + OMP_DECLARE_TARGET_LINK = 0x01, + /// Mark the entry global as being an indirectly callable function. + OMP_DECLARE_TARGET_INDIRECT = 0x08, + /// This is an entry corresponding to a requirement to be registered. + OMP_REGISTER_REQUIRES = 0x10, + /// Mark the entry global as being an indirect vtable. + OMP_DECLARE_TARGET_INDIRECT_VTABLE = 0x20, +}; + +#pragma omp begin declare variant match(device = {kind(gpu)}) +// Provided by the runtime. +void *__llvm_omp_indirect_call_lookup(void *host_ptr); +#pragma omp declare target to(__llvm_omp_indirect_call_lookup) \ + device_type(nohost) +#pragma omp end declare variant + +#pragma omp begin declare variant match(device = {kind(cpu)}) +// We assume unified addressing on the CPU target. +void *__llvm_omp_indirect_call_lookup(void *host_ptr) { return host_ptr; } +#pragma omp end declare variant + +#pragma omp begin declare target +void foo(int *i) { *i += 1; } +void bar(int *i) { *i += 10; } +void baz(int *i) { *i += 100; } +#pragma omp end declare target + +typedef void (*fptr_t)(int *i); + +// Dispatch Table - declare separately on host and device to avoid +// registering with the library; this also allows us to use separate +// names, which is convenient for debugging. This dispatchTable is +// intended to mimic what Clang emits for C++ vtables. +fptr_t dispatchTable[] = {foo, bar, baz}; +#pragma omp begin declare target device_type(nohost) +fptr_t GPUdispatchTable[] = {foo, bar, baz}; +fptr_t *GPUdispatchTablePtr = GPUdispatchTable; +#pragma omp end declare target + +// Define "manual" OpenMP offload entries, where we emit Clang +// offloading entry structure definitions in the appropriate ELF +// section. 
This allows us to emulate the offloading entries that Clang would +// normally emit for us + +__attribute__((weak, section("llvm_offload_entries"), aligned(8))) +const __tgt_offload_entry __offloading_entry[] = {{ + 0ULL, // Reserved + 1, // Version + 1, // Kind + OMP_DECLARE_TARGET_INDIRECT_VTABLE, // Flags + &dispatchTable, // Address + "GPUdispatchTablePtr", // SymbolName + (size_t)(sizeof(dispatchTable)), // Size + 0ULL, // Data + NULL // AuxAddr +}}; + +// Mimic how Clang emits vtable pointers for C++ classes +typedef struct { + fptr_t *dispatchPtr; +} myClass; + +// --------------------------------------------------------------------------- +int main() { + myClass obj_foo = {dispatchTable + 0}; + myClass obj_bar = {dispatchTable + 1}; + myClass obj_baz = {dispatchTable + 2}; + int aaa = 0; + +#pragma omp target map(aaa) map(to : obj_foo, obj_bar, obj_baz) + { + // Lookup + fptr_t *foo_ptr = __llvm_omp_indirect_call_lookup(obj_foo.dispatchPtr); + fptr_t *bar_ptr = __llvm_omp_indirect_call_lookup(obj_bar.dispatchPtr); + fptr_t *baz_ptr = __llvm_omp_indirect_call_lookup(obj_baz.dispatchPtr); + foo_ptr[0](&aaa); + bar_ptr[0](&aaa); + baz_ptr[0](&aaa); + } + + assert(aaa == 111); + // CHECK: PASS + printf("PASS\n"); + return 0; +} diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index 00f4e2b..c8ba45c 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,6 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.bin_llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c new file mode 100644 index 0000000..4b67a3b --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c @@ -0,0 +1,24 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// Test that when a use_device_addr lookup fails, the +// list-item retains its original address by default. +// +// This is necessary because we must assume that the +// list-item is device-accessible, even if it was not +// previously mapped. + +// XFAIL: * + +#include <stdio.h> +int h[10]; +int *ph = &h[0]; + +void f1() { + printf("%p\n", &h[2]); // CHECK: 0x[[#%x,ADDR:]] +#pragma omp target data use_device_addr(h[2]) + printf("%p\n", &h[2]); // CHECK-NEXT: 0x{{0*}}[[#ADDR]] +#pragma omp target data use_device_addr(ph[2]) + printf("%p\n", &ph[2]); // CHECK-NEXT: 0x{{0*}}[[#ADDR]] +} + +int main() { f1(); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c new file mode 100644 index 0000000..4495a46b --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c @@ -0,0 +1,21 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// Test that when a use_device_addr lookup fails, the +// list-item retains its original address by default. +// +// This is necessary because we must assume that the +// list-item is device-accessible, even if it was not +// previously mapped. 
+ +// XFAIL: * + +#include <stdio.h> +int x; + +void f1() { + printf("%p\n", &x); // CHECK: 0x[[#%x,ADDR:]] +#pragma omp target data use_device_addr(x) + printf("%p\n", &x); // CHECK-NEXT: 0x{{0*}}[[#ADDR]] +} + +int main() { f1(); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c new file mode 100644 index 0000000..e8fa3b6 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c @@ -0,0 +1,32 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// Test that when a use_device_ptr lookup fails, the +// privatized pointer retains its original value by +// default. +// +// This is necessary because we must assume that the +// pointee is device-accessible, even if it was not +// previously mapped. +// +// OpenMP 5.1, sec 2.14.2, target data construct, p 188, l26-31: +// If a list item that appears in a use_device_ptr clause ... does not point to +// a mapped object, it must contain a valid device address for the target +// device, and the list item references are instead converted to references to a +// local device pointer that refers to this device address. +// +// Note: OpenMP 6.1 will have a way to change the +// fallback behavior: preserve or nullify. + +// XFAIL: * + +#include <stdio.h> +int x; +int *xp = &x; + +void f1() { + printf("%p\n", xp); // CHECK: 0x[[#%x,ADDR:]] +#pragma omp target data use_device_ptr(xp) + printf("%p\n", xp); // CHECK-NEXT: 0x{{0*}}[[#ADDR]] +} + +int main() { f1(); } diff --git a/offload/test/offloading/fortran/declare-target-to-allocatable-vars-in-target-with-update.f90 b/offload/test/offloading/fortran/declare-target-to-allocatable-vars-in-target-with-update.f90 new file mode 100644 index 0000000..727a08b --- /dev/null +++ b/offload/test/offloading/fortran/declare-target-to-allocatable-vars-in-target-with-update.f90 @@ -0,0 +1,41 @@ +! Test that checks an allocatable array can be marked implicit +! `declare target to` and functions without issue. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +module test + implicit none + integer, allocatable, dimension(:) :: alloca_arr + !$omp declare target(alloca_arr) +end module test + +program main + use test + implicit none + integer :: cycle, i + + allocate(alloca_arr(10)) + + do i = 1, 10 + alloca_arr(i) = 0 + end do + + !$omp target data map(to:alloca_arr) + do cycle = 1, 2 + !$omp target + do i = 1, 10 + alloca_arr(i) = alloca_arr(i) + i + end do + !$omp end target + + ! NOTE: Technically doesn't affect the results, but there is a + ! regression case that'll cause a runtime crash if this is + ! invoked more than once, so this checks for that. + !$omp target update from(alloca_arr) + end do + !$omp end target data + + print *, alloca_arr +end program + +! CHECK: 2 4 6 8 10 12 14 16 18 20 diff --git a/offload/test/offloading/fortran/declare-target-to-vars-target-region-and-update.f90 b/offload/test/offloading/fortran/declare-target-to-vars-target-region-and-update.f90 new file mode 100644 index 0000000..16433af --- /dev/null +++ b/offload/test/offloading/fortran/declare-target-to-vars-target-region-and-update.f90 @@ -0,0 +1,40 @@ +! Test the implicit `declare target to` interaction with `target update from` +! REQUIRES: flang, amdgpu + +! 
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test
+  implicit none
+  integer :: array(10)
+  !$omp declare target(array)
+end module test
+
+PROGRAM main
+  use test
+  implicit none
+  integer :: i
+
+  do i = 1, 10
+    array(i) = 0
+  end do
+
+  !$omp target
+  do i = 1, 10
+    array(i) = i
+  end do
+  !$omp end target
+
+  !$omp target
+  do i = 1, 10
+    array(i) = array(i) + i
+  end do
+  !$omp end target
+
+  print *, array
+
+  !$omp target update from(array)
+
+  print *, array
+END PROGRAM
+
+! CHECK: 0 0 0 0 0 0 0 0 0 0
+! CHECK: 2 4 6 8 10 12 14 16 18 20
diff --git a/offload/test/offloading/fortran/declare-target-to-zero-index-allocatable-target-map.f90 b/offload/test/offloading/fortran/declare-target-to-zero-index-allocatable-target-map.f90
new file mode 100644
index 0000000..0d650f6
--- /dev/null
+++ b/offload/test/offloading/fortran/declare-target-to-zero-index-allocatable-target-map.f90
@@ -0,0 +1,30 @@
+! Test `declare target to` interaction with an allocatable with a non-default
+! range
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test_0
+  real(4), allocatable :: zero_off(:)
+  !$omp declare target(zero_off)
+end module test_0
+
+program main
+  use test_0
+  implicit none
+
+  allocate(zero_off(0:10))
+
+  zero_off(0) = 30.0
+  zero_off(1) = 40.0
+  zero_off(10) = 25.0
+
+  !$omp target map(tofrom: zero_off)
+  zero_off(0) = zero_off(1)
+  !$omp end target
+
+  print *, zero_off(0)
+  print *, zero_off(1)
+end program
+
+! CHECK: 40.
+! CHECK: 40.
diff --git a/offload/test/offloading/fortran/dtype-member-overlap-map.f90 b/offload/test/offloading/fortran/dtype-member-overlap-map.f90
new file mode 100644
index 0000000..e457014
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-member-overlap-map.f90
@@ -0,0 +1,56 @@
+! Basic offloading test checking the interaction of an overlapping
+! member map.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  integer :: i
+
+  type dtype2
+    integer :: int
+    real :: float
+  end type dtype2
+
+  type dtype1
+    character (LEN=30) :: characters
+    type(dtype2) :: internal_dtype2
+  end type dtype1
+
+  type dtype
+    integer :: elements(10)
+    type(dtype1) :: internal_dtype
+    integer :: value
+  end type dtype
+
+  type (dtype) :: single_dtype
+
+  do i = 1, 10
+    single_dtype%elements(i) = 0
+  end do
+
+  !$omp target map(tofrom: single_dtype%internal_dtype, single_dtype%internal_dtype%internal_dtype2%int)
+  single_dtype%internal_dtype%internal_dtype2%int = 123
+  single_dtype%internal_dtype%characters(1:1) = "Z"
+  !$omp end target
+
+  !$omp target map(to: single_dtype) map(tofrom: single_dtype%internal_dtype%internal_dtype2, single_dtype%value)
+  single_dtype%value = 20
+  do i = 1, 10
+    single_dtype%elements(i) = i
+  end do
+  single_dtype%internal_dtype%internal_dtype2%float = 32.0
+  !$omp end target
+
+  print *, single_dtype%value
+  print *, single_dtype%internal_dtype%internal_dtype2%float
+  print *, single_dtype%elements
+  print *, single_dtype%internal_dtype%internal_dtype2%int
+  print *, single_dtype%internal_dtype%characters(1:1)
+end program main
+
+! CHECK: 20
+! CHECK: 32.
+! CHECK: 0 0 0 0 0 0 0 0 0 0
+! CHECK: 123
+! CHECK: Z
diff --git a/offload/test/offloading/fortran/implicit-derived-enter-exit.f90 b/offload/test/offloading/fortran/implicit-derived-enter-exit.f90
new file mode 100644
index 0000000..0c896e64
--- /dev/null
+++ b/offload/test/offloading/fortran/implicit-derived-enter-exit.f90
@@ -0,0 +1,65 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+module enter_exit_mapper_mod
+  implicit none
+
+  type :: field_type
+    real, allocatable :: values(:)
+  end type field_type
+
+  type :: tile_type
+    type(field_type) :: field
+    integer, allocatable :: neighbors(:)
+  end type tile_type
+
+contains
+  subroutine init_tile(tile)
+    type(tile_type), intent(inout) :: tile
+    integer :: j
+
+    allocate(tile%field%values(4))
+    allocate(tile%neighbors(4))
+    do j = 1, 4
+      tile%field%values(j) = 10.0 * j
+      tile%neighbors(j) = j
+    end do
+  end subroutine init_tile
+
+end module enter_exit_mapper_mod
+
+program implicit_enter_exit
+  use enter_exit_mapper_mod
+  implicit none
+  integer :: j
+  type(tile_type) :: tile
+
+  call init_tile(tile)
+
+  !$omp target enter data map(alloc: tile%field%values)
+
+  !$omp target
+  do j = 1, size(tile%field%values)
+    tile%field%values(j) = 5.0 * j
+  end do
+  !$omp end target
+
+  !$omp target exit data map(from: tile%field%values)
+
+  do j = 1, size(tile%field%values)
+    if (tile%field%values(j) /= 5.0 * j) then
+      print *, "======= Test Failed! ======="
+      stop 1
+    end if
+    if (tile%neighbors(j) /= j) then
+      print *, "======= Test Failed! ======="
+      stop 1
+    end if
+  end do
+
+  print *, "======= Test Passed! ======="
+end program implicit_enter_exit
+
+! CHECK: ======= Test Passed! =======
diff --git a/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90 b/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90
new file mode 100644
index 0000000..cc390cf0
--- /dev/null
+++ b/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90
@@ -0,0 +1,88 @@
+! Basic offloading test with custom OpenMP reduction on derived type
+! REQUIRES: flang, amdgpu
+!
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+module maxtype_mod
+  implicit none
+
+  type maxtype
+    integer::sumval
+    integer::maxval
+  end type maxtype
+
+contains
+
+  subroutine initme(x,n)
+    type(maxtype) :: x,n
+    x%sumval=0
+    x%maxval=0
+  end subroutine initme
+
+  function mycombine(lhs, rhs)
+    type(maxtype) :: lhs, rhs
+    type(maxtype) :: mycombine
+    mycombine%sumval = lhs%sumval + rhs%sumval
+    mycombine%maxval = max(lhs%maxval, rhs%maxval)
+  end function mycombine
+
+end module maxtype_mod
+
+program main
+  use maxtype_mod
+  implicit none
+
+  integer :: n = 100
+  integer :: i
+  integer :: error = 0
+  type(maxtype) :: x(100)
+  type(maxtype) :: res
+  integer :: expected_sum, expected_max
+
+!$omp declare reduction(red_add_max:maxtype:omp_out=mycombine(omp_out,omp_in)) initializer(initme(omp_priv,omp_orig))
+
+  ! Initialize array with test data
+  do i = 1, n
+    x(i)%sumval = i
+    x(i)%maxval = i
+  end do
+
+  ! Initialize reduction variable
+  res%sumval = 0
+  res%maxval = 0
+
+  ! Perform reduction in target region
+  !$omp target parallel do map(to:x) reduction(red_add_max:res)
+  do i = 1, n
+    res = mycombine(res, x(i))
+  end do
+  !$omp end target parallel do
+
+  ! Compute expected values
+  expected_sum = 0
+  expected_max = 0
+  do i = 1, n
+    expected_sum = expected_sum + i
+    expected_max = max(expected_max, i)
+  end do
+
+  ! Check results
+  if (res%sumval /= expected_sum) then
+    error = 1
+  endif
+
+  if (res%maxval /= expected_max) then
+    error = 1
+  endif
+
+  if (error == 0) then
+    print *,"PASSED"
+  else
+    print *,"FAILED"
+  endif
+
+end program main
+
+! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK: PASSED
+
diff --git a/offload/test/offloading/fortran/target-is-device-ptr.f90 b/offload/test/offloading/fortran/target-is-device-ptr.f90
new file mode 100644
index 0000000..d6d8c02
--- /dev/null
+++ b/offload/test/offloading/fortran/target-is-device-ptr.f90
@@ -0,0 +1,60 @@
+! Validate that a device pointer obtained via omp_get_mapped_ptr can be used
+! inside a TARGET region with the is_device_ptr clause.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+module mod
+  implicit none
+  integer, parameter :: n = 4
+contains
+  subroutine kernel(dptr)
+    use iso_c_binding, only : c_ptr, c_f_pointer
+    implicit none
+
+    type(c_ptr) :: dptr
+    integer, dimension(:), pointer :: b
+    integer :: i
+
+    b => null()
+
+    !$omp target is_device_ptr(dptr)
+    call c_f_pointer(dptr, b, [n])
+    do i = 1, n
+      b(i) = b(i) + 1
+    end do
+    !$omp end target
+  end subroutine kernel
+end module mod
+
+program is_device_ptr_target
+  use iso_c_binding, only : c_ptr, c_loc, c_f_pointer
+  use omp_lib, only: omp_get_default_device, omp_get_mapped_ptr
+  use mod, only: kernel, n
+  implicit none
+
+  integer, dimension(n), target :: a
+  integer :: dev
+  type(c_ptr) :: dptr
+
+  a = [2, 4, 6, 8]
+  print '("BEFORE:", I3)', a
+
+  dev = omp_get_default_device()
+
+  !$omp target data map(tofrom: a)
+  dptr = omp_get_mapped_ptr(c_loc(a), dev)
+  call kernel(dptr)
+  !$omp end target data
+
+  print '("AFTER: ", I3)', a
+
+  if (all(a == [3, 5, 7, 9])) then
+    print '("PASS")'
+  else
+    print '("FAIL ", I3)', a
+  end if
+
+end program is_device_ptr_target
+
+!CHECK: PASS
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
index 42d8ae4..b3b72db 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
@@ -18,7 +18,6 @@

 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu

 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
index 09a4dc1..440a6b5 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
@@ -18,7 +18,6 @@

 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu

 int test1(int a) { return a / 2; }

diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index c53e69a..3e95791 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -50,7 +50,6 @@

 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu

 int main() {
   int host_var = 0;
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 644df6e..2939af61 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -16,7 +16,6 @@

 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu

 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
diff --git a/offload/test/offloading/shared_lib_fp_mapping.c b/offload/test/offloading/shared_lib_fp_mapping.c
index c620344..e0af9b7 100644
--- a/offload/test/offloading/shared_lib_fp_mapping.c
+++ b/offload/test/offloading/shared_lib_fp_mapping.c
@@ -7,8 +7,8 @@

 #include <stdio.h>

-extern int func(); // Provided in liba.so, returns 42
-typedef int (*fp_t)();
+extern int func(void); // Provided in liba.so, returns 42
+typedef int (*fp_t)(void);

 int main() {
   int x = 0;
diff --git a/offload/test/offloading/static_linking.c b/offload/test/offloading/static_linking.c
index 7be95a1..273109e 100644
--- a/offload/test/offloading/static_linking.c
+++ b/offload/test/offloading/static_linking.c
@@ -14,7 +14,7 @@ int foo() {
 }
 #else
 #include <stdio.h>
-int foo();
+int foo(void);

 int main() {
   int x = foo();
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 42ffb97..74af3bf 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -206,6 +206,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
   OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
                                          "Global Mem Size", "B"));
   OFFLOAD_ERR(
+      printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
+                                 "Work Group Shared Mem Size", "B"));
+  OFFLOAD_ERR(
       (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
           S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
           "Single Precision Floating Point Capability")));
diff --git a/offload/tools/offload-tblgen/EntryPointGen.cpp b/offload/tools/offload-tblgen/EntryPointGen.cpp
index 4e42e49..4f76100 100644
--- a/offload/tools/offload-tblgen/EntryPointGen.cpp
+++ b/offload/tools/offload-tblgen/EntryPointGen.cpp
@@ -83,13 +83,15 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
   OS << ") {\n";

   // Check offload is initialized
-  if (F.getName() != "olInit")
+  if (F.getName() != "olInit") {
     OS << "if (!llvm::offload::isOffloadInitialized()) return &UninitError;";

-  // Emit pre-call prints
-  OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
-  OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName());
-  OS << TAB_1 "}\n\n";
+    // Emit pre-call prints
+    // Postpone the pre-call print for olInit, as tracing requires liboffload to be initialized
+    OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
+    OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName());
+    OS << TAB_1 "}\n\n";
+  }

   // Perform actual function call to the validation wrapper
   ParamNameList = ParamNameList.substr(0, ParamNameList.size() - 2);
@@ -99,6 +101,10 @@

   // Emit post-call prints
   OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
+  // Emit the postponed pre-call print for olInit
+  if (F.getName() == "olInit")
+    OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName());
+
   if (F.getParams().size() > 0) {
     OS << formatv(TAB_2 "{0} Params = {{", F.getParamStructName());
     for (const auto &Param : F.getParams()) {
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 30eafee..ba29fb1 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
                                     OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
 OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
                                  OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t,
+                                    OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
+                                    0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t,
+                                 OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE);

 TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
   ol_device_type_t DeviceType;
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 79a18c1..2c375eb 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
                             OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
 OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
                             OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t,
+                            OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE);

 TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
   size_t Size = 0;
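
The new use_device_addr/use_device_ptr fallback tests above pin down what happens when the list item does not refer to a mapped object: the privatized pointer or address keeps its original value. The host-only sketch below is not part of the patch; it is a minimal illustration, assuming a discrete device, that contrasts a mapped pointee (x) with an unmapped one (y).

// Sketch only (not from the patch): contrast use_device_ptr for a mapped
// pointee (x) and an unmapped one (y). Under the fallback behavior the new
// tests check for, yp keeps its original host value inside the region.
#include <stdio.h>

int x, y;
int *xp = &x, *yp = &y;

int main(void) {
  printf("host      xp=%p yp=%p\n", (void *)xp, (void *)yp);
#pragma omp target enter data map(to : x)
#pragma omp target data use_device_ptr(xp, yp)
  {
    // xp now refers to the device copy of x; yp has no mapped pointee, so it
    // is expected to fall back to its original value.
    printf("in region xp=%p yp=%p\n", (void *)xp, (void *)yp);
  }
#pragma omp target exit data map(release : x)
  return 0;
}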
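
The declare-target Fortran tests rely on a `declare target` variable having its own device copy: host prints stay stale until a `target update from` copies the data back. A rough C analogue of that pattern, again only an illustration and not part of the patch (the behavior shown assumes a discrete device rather than unified shared memory):

// Sketch only: 'array' is a declare-target variable with separate host and
// device storage on discrete devices; the host copy is refreshed only by the
// 'target update from' directive.
#include <stdio.h>

int array[10];
#pragma omp declare target(array)

int main(void) {
  for (int i = 0; i < 10; ++i)
    array[i] = 0;

#pragma omp target
  for (int i = 0; i < 10; ++i)
    array[i] = i + 1;

  printf("%d\n", array[9]); // still 0 on a discrete device: no copy-back yet

#pragma omp target update from(array)

  printf("%d\n", array[9]); // now 10
  return 0;
}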
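
target-is-device-ptr.f90 combines omp_get_mapped_ptr with the is_device_ptr clause. The same pattern in C, shown as an illustrative sketch rather than anything taken from the patch, uses only OpenMP 5.1 API routines:

// Sketch only: look up the device address of a mapped array with
// omp_get_mapped_ptr and hand it to a target region via is_device_ptr.
#include <omp.h>
#include <stdio.h>

int main(void) {
  int a[4] = {2, 4, 6, 8};
  int dev = omp_get_default_device();

#pragma omp target data map(tofrom : a)
  {
    int *dptr = (int *)omp_get_mapped_ptr(a, dev);
#pragma omp target is_device_ptr(dptr)
    for (int i = 0; i < 4; ++i)
      dptr[i] += 1;
  }

  printf("%d %d %d %d\n", a[0], a[1], a[2], a[3]); // expect: 3 5 7 9
  return 0;
}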
