diff options
Diffstat (limited to 'offload')
140 files changed, 7670 insertions, 522 deletions
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index 2e5d923..111143a 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -136,6 +136,12 @@ struct omp_lock_t { void *Lock; }; +// see definition in openmp/runtime kmp.h +typedef enum omp_severity_t { + severity_warning = 1, + severity_fatal = 2 +} omp_severity_t; + using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, int16_t lane_offset, int16_t shortCircuit); diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 467e44a..8c2828b 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -30,7 +30,8 @@ enum OMPTgtExecModeFlags : unsigned char { OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, OMP_TGT_EXEC_MODE_SPMD = 1 << 1, OMP_TGT_EXEC_MODE_GENERIC_SPMD = - OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD + OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD, + OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD }; static void diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 08ce616..aa5e740 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -45,7 +45,24 @@ using namespace ompx; namespace { -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { +void numThreadsStrictError(int32_t nt_strict, int32_t nt_severity, + const char *nt_message, int32_t requested, + int32_t actual) { + if (nt_message) + printf("%s\n", nt_message); + else + printf("The computed number of threads (%u) does not match the requested " + "number of threads (%d). 
Consider that it might not be supported " + "to select exactly %d threads on this target device.\n", + actual, requested, requested); + if (nt_severity == severity_fatal) + __builtin_trap(); +} + +uint32_t determineNumberOfThreads(int32_t NumThreadsClause, + int32_t nt_strict = false, + int32_t nt_severity = severity_fatal, + const char *nt_message = nullptr) { uint32_t NThreadsICV = NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; uint32_t NumThreads = mapping::getMaxTeamThreads(); @@ -55,13 +72,17 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { // SPMD mode allows any number of threads, for generic mode we round down to a // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (mapping::isSPMDMode()) - return NumThreads; + if (!mapping::isSPMDMode()) { + if (NumThreads < mapping::getWarpSize()) + NumThreads = 1; + else + NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + } - if (NumThreads < mapping::getWarpSize()) - NumThreads = 1; - else - NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + if (NumThreadsClause != -1 && nt_strict && + NumThreads != static_cast<uint32_t>(NumThreadsClause)) + numThreadsStrictError(nt_strict, nt_severity, nt_message, NumThreadsClause, + NumThreads); return NumThreads; } @@ -82,12 +103,14 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { extern "C" { -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { +[[clang::always_inline]] void +__kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads, void *fn, void **args, + const int64_t nargs, int32_t nt_strict = false, + int32_t nt_severity = severity_fatal, + const char *nt_message = nullptr) { uint32_t TId = mapping::getThreadIdInBlock(); - uint32_t NumThreads = determineNumberOfThreads(num_threads); + uint32_t NumThreads = + determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message); uint32_t 
PTeamSize = NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; // Avoid the race between the read of the `icv::Level` above and the write @@ -140,10 +163,11 @@ extern "C" { return; } -[[clang::always_inline]] void -__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { +[[clang::always_inline]] void __kmpc_parallel_51( + IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, + int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs, + int32_t nt_strict = false, int32_t nt_severity = severity_fatal, + const char *nt_message = nullptr) { uint32_t TId = mapping::getThreadIdInBlock(); // Assert the parallelism level is zero if disabled by the user. @@ -156,6 +180,11 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // 3) nested parallel regions if (OMP_UNLIKELY(!if_expr || state::HasThreadState || (config::mayUseNestedParallelism() && icv::Level))) { + // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have + // effect when parallel execution is disabled by a corresponding if clause + // attached to the parallel directive. + if (nt_strict && num_threads > 1) + numThreadsStrictError(nt_strict, nt_severity, nt_message, num_threads, 1); state::DateEnvironmentRAII DERAII(ident); ++icv::Level; invokeMicrotask(TId, 0, fn, args, nargs); @@ -169,12 +198,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // This was moved to its own routine so it could be called directly // in certain situations to avoid resource consumption of unused // logic in parallel_51. 
- __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs); + __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs, nt_strict, + nt_severity, nt_message); return; } - uint32_t NumThreads = determineNumberOfThreads(num_threads); + uint32_t NumThreads = + determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message); uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; @@ -277,6 +308,16 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, __kmpc_end_sharing_variables(); } +[[clang::always_inline]] void __kmpc_parallel_60( + IdentTy *ident, int32_t id, int32_t if_expr, int32_t num_threads, + int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs, + int32_t nt_strict = false, int32_t nt_severity = severity_fatal, + const char *nt_message = nullptr) { + return __kmpc_parallel_51(ident, id, if_expr, num_threads, proc_bind, fn, + wrapper_fn, args, nargs, nt_strict, nt_severity, + nt_message); +} + [[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { // Work function and arguments for L1 parallel region. *WorkFn = state::ParallelRegionFn; diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index a875930..59a2cc3 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -698,7 +698,7 @@ template <typename Ty> class StaticLoopChunker { static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, Ty NumBlocks, Ty BId, Ty NumThreads, Ty TId, Ty NumIters, - bool OneIterationPerThread) { + uint8_t OneIterationPerThread) { Ty KernelIteration = NumBlocks * NumThreads; // Start index in the normalized space. 
@@ -729,7 +729,7 @@ template <typename Ty> class StaticLoopChunker { Ty BlockChunk, Ty NumBlocks, Ty BId, Ty ThreadChunk, Ty NumThreads, Ty TId, Ty NumIters, - bool OneIterationPerThread) { + uint8_t OneIterationPerThread) { Ty KernelIteration = NumBlocks * BlockChunk; // Start index in the chunked space. @@ -767,8 +767,18 @@ template <typename Ty> class StaticLoopChunker { public: /// Worksharing `for`-loop. + /// \param[in] Loc Description of source location + /// \param[in] LoopBody Function which corresponds to loop body + /// \param[in] Arg Pointer to struct which contains loop body args + /// \param[in] NumIters Number of loop iterations + /// \param[in] NumThreads Number of GPU threads + /// \param[in] ThreadChunk Size of thread chunk + /// \param[in] OneIterationPerThread If true/nonzero, each thread executes + /// only one loop iteration or one thread chunk. This avoids an outer loop + /// over all loop iterations/chunks. static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk) { + Ty NumIters, Ty NumThreads, Ty ThreadChunk, + uint8_t OneIterationPerThread) { ASSERT(NumIters >= 0, "Bad iteration count"); ASSERT(ThreadChunk >= 0, "Bad thread count"); @@ -790,12 +800,13 @@ public: // If we know we have more threads than iterations we can indicate that to // avoid an outer loop. - bool OneIterationPerThread = false; if (config::getAssumeThreadsOversubscription()) { - ASSERT(NumThreads >= NumIters, "Broken assumption"); OneIterationPerThread = true; } + if (OneIterationPerThread) + ASSERT(NumThreads >= NumIters, "Broken assumption"); + if (ThreadChunk != 1) NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, ThreadChunk, NumThreads, TId, NumIters, @@ -806,8 +817,17 @@ public: } /// Worksharing `distribute`-loop. 
+ /// \param[in] Loc Description of source location + /// \param[in] LoopBody Function which corresponds to loop body + /// \param[in] Arg Pointer to struct which contains loop body args + /// \param[in] NumIters Number of loop iterations + /// \param[in] BlockChunk Size of block chunk + /// \param[in] OneIterationPerThread If true/nonzero, each thread executes + /// only one loop iteration or one thread chunk. This avoids an outer loop + /// over all loop iterations/chunks. static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk) { + Ty NumIters, Ty BlockChunk, + uint8_t OneIterationPerThread) { ASSERT(icv::Level == 0, "Bad distribute"); ASSERT(icv::ActiveLevel == 0, "Bad distribute"); ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); @@ -831,12 +851,13 @@ public: // If we know we have more blocks than iterations we can indicate that to // avoid an outer loop. - bool OneIterationPerThread = false; if (config::getAssumeTeamsOversubscription()) { - ASSERT(NumBlocks >= NumIters, "Broken assumption"); OneIterationPerThread = true; } + if (OneIterationPerThread) + ASSERT(NumBlocks >= NumIters, "Broken assumption"); + if (BlockChunk != NumThreads) NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, ThreadChunk, NumThreads, TId, NumIters, @@ -852,9 +873,20 @@ public: } /// Worksharing `distribute parallel for`-loop. + /// \param[in] Loc Description of source location + /// \param[in] LoopBody Function which corresponds to loop body + /// \param[in] Arg Pointer to struct which contains loop body args + /// \param[in] NumIters Number of loop iterations + /// \param[in] NumThreads Number of GPU threads + /// \param[in] BlockChunk Size of block chunk + /// \param[in] ThreadChunk Size of thread chunk + /// \param[in] OneIterationPerThread If true/nonzero, each thread executes + /// only one loop iteration or one thread chunk. This avoids an outer loop + /// over all loop iterations/chunks. 
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk) { + Ty BlockChunk, Ty ThreadChunk, + uint8_t OneIterationPerThread) { ASSERT(icv::Level == 1, "Bad distribute"); ASSERT(icv::ActiveLevel == 1, "Bad distribute"); ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); @@ -882,13 +914,14 @@ public: // If we know we have more threads (across all blocks) than iterations we // can indicate that to avoid an outer loop. - bool OneIterationPerThread = false; if (config::getAssumeTeamsOversubscription() & config::getAssumeThreadsOversubscription()) { OneIterationPerThread = true; - ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); } + if (OneIterationPerThread) + ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); + if (BlockChunk != NumThreads || ThreadChunk != 1) NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId, ThreadChunk, NumThreads, TId, NumIters, @@ -907,24 +940,26 @@ public: #define OMP_LOOP_ENTRY(BW, TY) \ [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_for_static_loop##BW( \ - IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY block_chunk, TY thread_chunk) { \ + __kmpc_distribute_for_static_loop##BW( \ + IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ + TY num_threads, TY block_chunk, TY thread_chunk, \ + uint8_t one_iteration_per_thread) { \ ompx::StaticLoopChunker<TY>::DistributeFor( \ - loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \ + loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \ + one_iteration_per_thread); \ } \ [[gnu::flatten, clang::always_inline]] void \ - __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ - void *arg, TY num_iters, \ - TY block_chunk) { \ - ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \ - block_chunk); \ + 
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ + void *arg, TY num_iters, TY block_chunk, \ + uint8_t one_iteration_per_thread) { \ + ompx::StaticLoopChunker<TY>::Distribute( \ + loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \ } \ [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ - TY num_threads, TY thread_chunk) { \ + TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \ ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \ - thread_chunk); \ + thread_chunk, one_iteration_per_thread); \ } extern "C" { diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake index 8e955ff3..ef8cf34 100644 --- a/offload/cmake/OpenMPTesting.cmake +++ b/offload/cmake/OpenMPTesting.cmake @@ -57,7 +57,7 @@ if (${OPENMP_STANDALONE_BUILD}) if (MSVC OR XCODE) set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar") endif() - if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX") set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800") endif() set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.") diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c16..93c1e569 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -417,12 +417,42 @@ struct MapperComponentsTy { typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t, void *); +/// Structure to store information about a single ATTACH map entry. 
+struct AttachMapInfo { + void *PointerBase; + void *PointeeBegin; + int64_t PointerSize; + int64_t MapType; + map_var_info_t Pointername; + + AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size, + int64_t Type, map_var_info_t Name) + : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size), + MapType(Type), Pointername(Name) {} +}; + +/// Structure to track ATTACH entries and new allocations across recursive calls +/// (for handling mappers) to targetDataBegin for a given construct. +struct AttachInfoTy { + /// ATTACH map entries for deferred processing. + llvm::SmallVector<AttachMapInfo> AttachEntries; + + /// Key: host pointer, Value: allocation size. + llvm::DenseMap<void *, int64_t> NewAllocations; + + AttachInfoTy() = default; + + // Delete copy constructor and copy assignment operator to prevent copying + AttachInfoTy(const AttachInfoTy &) = delete; + AttachInfoTy &operator=(const AttachInfoTy &) = delete; +}; + // Function pointer type for targetData* functions (targetDataBegin, // targetDataEnd and targetDataUpdate). 
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, AsyncInfoTy &, - bool); + AttachInfoTy *, bool); void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device, bool toStdOut = false); @@ -431,20 +461,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper = false); + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); +// Process deferred ATTACH map entries collected during targetDataBegin. +int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo); + struct MappingInfoTy { MappingInfoTy(DeviceTy &Device) : Device(Device) {} diff --git a/offload/include/device.h b/offload/include/device.h index f4b10ab..1e85bb1 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -98,6 +98,10 @@ struct DeviceTy { int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo); + // Insert a data fence between previous data operations and the following + // operations if necessary for the device. + int32_t dataFence(AsyncInfoTy &AsyncInfo); + /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. 
int32_t notifyDataMapped(void *HstPtr, int64_t Size); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 6971780..8fd722b 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -33,9 +33,6 @@ #define OFFLOAD_DEVICE_DEFAULT -1 -// Don't format out enums and structs. -// clang-format off - /// return flags of __tgt_target_XXX public APIs enum __tgt_target_return_t : int { /// successful offload executed on a target device @@ -51,39 +48,42 @@ enum __tgt_target_return_t : int { /// Data attributes for each data reference used in an OpenMP target region. enum tgt_map_type { // No flags - OMP_TGT_MAPTYPE_NONE = 0x000, + OMP_TGT_MAPTYPE_NONE = 0x000, // copy data from host to device - OMP_TGT_MAPTYPE_TO = 0x001, + OMP_TGT_MAPTYPE_TO = 0x001, // copy data from device to host - OMP_TGT_MAPTYPE_FROM = 0x002, + OMP_TGT_MAPTYPE_FROM = 0x002, // copy regardless of the reference count - OMP_TGT_MAPTYPE_ALWAYS = 0x004, + OMP_TGT_MAPTYPE_ALWAYS = 0x004, // force unmapping of data - OMP_TGT_MAPTYPE_DELETE = 0x008, + OMP_TGT_MAPTYPE_DELETE = 0x008, // map the pointer as well as the pointee - OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, // pass device base address to kernel - OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, // return base device address of mapped data - OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, // private variable - not mapped - OMP_TGT_MAPTYPE_PRIVATE = 0x080, + OMP_TGT_MAPTYPE_PRIVATE = 0x080, // copy by value - not mapped - OMP_TGT_MAPTYPE_LITERAL = 0x100, + OMP_TGT_MAPTYPE_LITERAL = 0x100, // mapping is implicit - OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, // copy data to device - OMP_TGT_MAPTYPE_CLOSE = 0x400, + OMP_TGT_MAPTYPE_CLOSE = 0x400, // runtime error if not already allocated - OMP_TGT_MAPTYPE_PRESENT = 0x1000, + OMP_TGT_MAPTYPE_PRESENT = 0x1000, // use a separate reference counter so 
that the data cannot be unmapped within // the structured region // This is an OpenMP extension for the sake of OpenACC support. - OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, + OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, + // Attach pointer and pointee, after processing all other maps. + // Applicable to map-entering directives. Does not change ref-count. + OMP_TGT_MAPTYPE_ATTACH = 0x4000, // descriptor for non-contiguous target-update - OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 - OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; /// Flags for offload entries. @@ -105,9 +105,9 @@ enum TargetAllocTy : int32_t { TARGET_ALLOC_DEVICE_NON_BLOCKING, }; -inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; +inline KernelArgsTy CTorDTorKernelArgs = { + 1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, 0, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; diff --git a/offload/liboffload/API/APIDefs.td b/offload/liboffload/API/APIDefs.td index 640932d..ea3896f 100644 --- a/offload/liboffload/API/APIDefs.td +++ b/offload/liboffload/API/APIDefs.td @@ -31,6 +31,13 @@ class IsHandleType<string Type> { !ne(!find(Type, "_handle_t", !sub(!size(Type), 9)), -1)); } +// Does the type end with '_cb_t'? +class IsCallbackType<string Type> { + // size("_cb_t") == 5 + bit ret = !if(!lt(!size(Type), 5), 0, + !ne(!find(Type, "_cb_t", !sub(!size(Type), 5)), -1)); +} + // Does the type end with '*'? 
class IsPointerType<string Type> { bit ret = !ne(!find(Type, "*", !sub(!size(Type), 1)), -1); @@ -58,6 +65,7 @@ class Param<string Type, string Name, string Desc, bits<3> Flags = 0> { TypeInfo type_info = TypeInfo<"", "">; bit IsHandle = IsHandleType<type>.ret; bit IsPointer = IsPointerType<type>.ret; + bit IsCallback = IsCallbackType<type>.ret; } // A parameter whose range is described by other parameters in the function. @@ -81,7 +89,7 @@ class ShouldCheckHandle<Param P> { } class ShouldCheckPointer<Param P> { - bit ret = !and(P.IsPointer, !eq(!and(PARAM_OPTIONAL, P.flags), 0)); + bit ret = !and(!or(P.IsPointer, P.IsCallback), !eq(!and(PARAM_OPTIONAL, P.flags), 0)); } // For a list of returns that contains a specific return code, find and append @@ -137,7 +145,6 @@ defvar DefaultReturns = [Return<PREFIX#"_RESULT_SUCCESS">, Return<PREFIX#"_ERRC_DEVICE_LOST">]; class APIObject { - string name; string desc; } @@ -168,6 +175,10 @@ class Enum : APIObject { // all Etor values must be TaggedEtor records bit is_typed = 0; + // This refers to whether the enumerator is used to name bits of a bit field, + // where consecutive values are bit-shifted rather than incremented. 
+ bit is_bit_field = 0; + list<Etor> etors = []; } diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td index 6eaf604..ac27d85 100644 --- a/offload/liboffload/API/Common.td +++ b/offload/liboffload/API/Common.td @@ -10,77 +10,64 @@ // //===----------------------------------------------------------------------===// -def : Macro { - let name = "OL_VERSION_MAJOR"; +def OL_VERSION_MAJOR : Macro { let desc = "Major version of the Offload API"; let value = "0"; } -def : Macro { - let name = "OL_VERSION_MINOR"; +def OL_VERSION_MINOR : Macro { let desc = "Minor version of the Offload API"; let value = "0"; } -def : Macro { - let name = "OL_VERSION_PATCH"; +def OL_VERSION_PATCH : Macro { let desc = "Patch version of the Offload API"; let value = "1"; } -def : Macro { - let name = "OL_APICALL"; +def OL_APICALL : Macro { let desc = "Calling convention for all API functions"; let condition = "defined(_WIN32)"; let value = "__cdecl"; let alt_value = ""; } -def : Macro { - let name = "OL_APIEXPORT"; +def OL_APIEXPORT : Macro { let desc = "Microsoft-specific dllexport storage-class attribute"; let condition = "defined(_WIN32)"; let value = "__declspec(dllexport)"; let alt_value = ""; } -def : Handle { - let name = "ol_platform_handle_t"; +def ol_platform_handle_t : Handle { let desc = "Handle of a platform instance"; } -def : Handle { - let name = "ol_device_handle_t"; +def ol_device_handle_t : Handle { let desc = "Handle of platform's device object"; } -def : Handle { - let name = "ol_context_handle_t"; +def ol_context_handle_t : Handle { let desc = "Handle of context object"; } -def : Handle { - let name = "ol_queue_handle_t"; +def ol_queue_handle_t : Handle { let desc = "Handle of queue object"; } -def : Handle { - let name = "ol_event_handle_t"; +def ol_event_handle_t : Handle { let desc = "Handle of event object"; } -def : Handle { - let name = "ol_program_handle_t"; +def ol_program_handle_t : Handle { let desc = "Handle of program object"; } -def 
: Handle { - let name = "ol_symbol_handle_t"; +def ol_symbol_handle_t : Handle { let desc = "Handle of an object in a device's memory for a specific program"; } -def ErrorCode : Enum { - let name = "ol_errc_t"; +def ol_errc_t : Enum { let desc = "Defines Return/Error codes"; let etors =[ Etor<"SUCCESS", "success">, @@ -115,8 +102,7 @@ def ErrorCode : Enum { ]; } -def : Struct { - let name = "ol_error_struct_t"; +def ol_error_struct_t : Struct { let desc = "Details of the error condition returned by an API call"; let members = [ StructMember<"ol_errc_t", "Code", "The error code">, @@ -124,20 +110,17 @@ def : Struct { ]; } -def : Typedef { - let name = "ol_result_t"; +def ol_result_t : Typedef { let desc = "Result type returned by all entry points."; - let value = "const ol_error_struct_t*"; + let value = "const struct ol_error_struct_t*"; } -def : Macro { - let name = "OL_SUCCESS"; +def OL_SUCCESS : Macro { let desc = "Success condition"; let value = "NULL"; } -def : Struct { - let name = "ol_code_location_t"; +def ol_code_location_t : Struct { let desc = "Code location information that can optionally be associated with an API call"; let members = [ StructMember<"const char*", "FunctionName", "Function name">, @@ -147,8 +130,7 @@ def : Struct { ]; } -def : Struct { - let name = "ol_dimensions_t"; +def ol_dimensions_t : Struct { let desc = "A three element vector"; let members = [ StructMember<"uint32_t", "x", "X">, @@ -157,8 +139,7 @@ def : Struct { ]; } -def : Function { - let name = "olInit"; +def olInit : Function { let desc = "Perform initialization of the Offload library and plugins"; let details = [ "This must be the first API call made by a user of the Offload library", @@ -168,8 +149,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olShutDown"; +def olShutDown : Function { let desc = "Release the resources in use by Offload"; let details = [ "This decrements an internal reference count. 
When this reaches 0, all resources will be released", diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 857c596..5b54c79 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_device_type_t"; +def ol_device_type_t : Enum { let desc = "Supported device types."; let etors =[ Etor<"DEFAULT", "The default device type as preferred by the runtime">, @@ -22,23 +21,54 @@ def : Enum { ]; } -def DeviceInfo : Enum { - let name = "ol_device_info_t"; +def ol_device_info_t : Enum { let desc = "Supported device info."; let is_typed = 1; - let etors =[ + list<TaggedEtor> basic_etors =[ TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">, TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, + TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">, + TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">, + TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">, + TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">, + TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">, + TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">, + TaggedEtor<"MEMORY_CLOCK_RATE", "uint32_t", "Memory clock frequency in MHz">, + TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits 
used to represent an address in device memory">, + TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, + TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, ]; + list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">); + list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); + let etors = !listconcat(basic_etors, fp_configs, native_vec_widths); +} + +def ol_device_fp_capability_flag_t : Enum { + let desc = "Device floating-point capability flags"; + let is_bit_field = 1; + let etors =[ + Etor<"CORRECTLY_ROUNDED_DIVIDE_SQRT", "Support correctly rounded divide and sqrt">, + Etor<"ROUND_TO_NEAREST", "Support round to nearest">, + Etor<"ROUND_TO_ZERO", "Support round to zero">, + Etor<"ROUND_TO_INF", "Support round to infinity">, + Etor<"INF_NAN", "Support INF to NAN">, + Etor<"DENORM", "Support denorm">, + Etor<"FMA", "Support fused multiply-add">, + Etor<"SOFT_FLOAT", "Basic floating point operations implemented in software">, + ]; +} + +def ol_device_fp_capability_flags_t : Typedef { + let desc = "Device floating-point capability flags"; + let value = "uint32_t"; } -def : FptrTypedef { - let name = "ol_device_iterate_cb_t"; +def ol_device_iterate_cb_t : FptrTypedef { let desc = "User-provided function to be used with `olIterateDevices`"; let params = [ Param<"ol_device_handle_t", "Device", "the device handle of the current iteration", PARAM_IN>, @@ -47,8 +77,7 @@ def : FptrTypedef { let return = "bool"; } -def : Function { - let name = "olIterateDevices"; +def olIterateDevices : Function { let desc = "Iterates over all available devices, calling the callback for each device."; let details = [ "If the 
user-provided callback returns `false`, the iteration is stopped." @@ -62,8 +91,7 @@ def : Function { ]; } -def : Function { - let name = "olGetDeviceInfo"; +def olGetDeviceInfo : Function { let desc = "Queries the given property of the device."; let details = []; let params = [ @@ -86,8 +114,7 @@ def : Function { ]; } -def : Function { - let name = "olGetDeviceInfoSize"; +def olGetDeviceInfoSize : Function { let desc = "Returns the storage size of the given device query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td index 9d217ae..075bf5b 100644 --- a/offload/liboffload/API/Event.td +++ b/offload/liboffload/API/Event.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateEvent"; +def olCreateEvent : Function { let desc = "Enqueue an event to `Queue` and return it."; let details = [ "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.", @@ -23,8 +22,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olDestroyEvent"; +def olDestroyEvent : Function { let desc = "Destroy the event and free all underlying resources."; let details = []; let params = [ @@ -33,8 +31,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olSyncEvent"; +def olSyncEvent : Function { let desc = "Block the calling thread until the event is complete."; let details = []; let params = [ @@ -43,17 +40,16 @@ def : Function { let returns = []; } -def : Enum { - let name = "ol_event_info_t"; +def ol_event_info_t : Enum { let desc = "Supported event info."; let is_typed = 1; let etors = [ - TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device."> + TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device.">, + 
TaggedEtor<"IS_COMPLETE", "bool", "True if and only if the event is complete.">, ]; } -def : Function { - let name = "olGetEventInfo"; +def olGetEventInfo : Function { let desc = "Queries the given property of the event."; let details = [ "`olGetEventInfoSize` can be used to query the storage size " @@ -77,8 +73,7 @@ def : Function { ]; } -def : Function { - let name = "olGetEventInfoSize"; +def olGetEventInfoSize : Function { let desc = "Returns the storage size of the given event query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td index 502fb36..2f5692a 100644 --- a/offload/liboffload/API/Kernel.td +++ b/offload/liboffload/API/Kernel.td @@ -6,12 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file contains Offload API definitions related to launching kernels +// This file contains Offload API definitions related to kernels // //===----------------------------------------------------------------------===// -def : Struct { - let name = "ol_kernel_launch_size_args_t"; +def ol_kernel_launch_size_args_t : Struct { let desc = "Size-related arguments for a kernel launch."; let members = [ StructMember<"size_t", "Dimensions", "Number of work dimensions">, @@ -21,8 +20,7 @@ def : Struct { ]; } -def : Function { - let name = "olLaunchKernel"; +def olLaunchKernel : Function { let desc = "Enqueue a kernel launch with the specified size and parameters."; let details = [ "If a queue is not specified, kernel execution happens synchronously", @@ -42,3 +40,20 @@ def : Function { Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>, ]; } + +def olCalculateOptimalOccupancy : Function { + let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy."; + let details = [ + "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device 
at once.", + ]; + let params = [ + Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, + Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, + Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>, + Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT> + ]; + let returns = [ + Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>, + Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>, + ]; +} diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index 5f71585..cc98b67 100644 --- a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_alloc_type_t"; +def ol_alloc_type_t : Enum { let desc = "Represents the type of allocation made with olMemAlloc."; let etors = [ Etor<"HOST", "Host allocation">, @@ -20,8 +19,7 @@ def : Enum { ]; } -def : Function { - let name = "olMemAlloc"; +def olMemAlloc : Function { let desc = "Creates a memory allocation on the specified device."; let params = [ Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>, @@ -36,8 +34,7 @@ def : Function { ]; } -def : Function { - let name = "olMemFree"; +def olMemFree : Function { let desc = "Frees a memory allocation previously made by olMemAlloc."; let params = [ Param<"void*", "Address", "address of the allocation to free", PARAM_IN>, @@ -45,8 +42,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olMemcpy"; +def olMemcpy : Function { let desc = "Enqueue a memcpy operation."; let details = [ "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.", @@ -63,3 +59,22 @@ def : Function { ]; let returns = []; } + +def olMemFill : Function { + let desc = "Fill memory with copies of the 
given pattern"; + let details = [ + "Filling with patterns larger than 4 bytes may be less performant", + "The destination pointer and queue must be associated with the same device", + "The fill size must be a multiple of the pattern size", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, + Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>, + Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>, + Param<"const void*", "PatternPtr", "pointer to the pattern to fill with (`PatternSize` bytes)", PARAM_IN>, + Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>, + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]> + ]; +} diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td index 97c2cc2..906f899 100644 --- a/offload/liboffload/API/Platform.td +++ b/offload/liboffload/API/Platform.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_platform_info_t"; +def ol_platform_info_t : Enum { let desc = "Supported platform info."; let is_typed = 1; let etors = [ @@ -22,8 +21,7 @@ def : Enum { ]; } -def : Enum { - let name = "ol_platform_backend_t"; +def ol_platform_backend_t : Enum { let desc = "Identifies the native backend of the platform."; let etors =[ Etor<"UNKNOWN", "The backend is not recognized">, @@ -33,8 +31,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetPlatformInfo"; +def olGetPlatformInfo : Function { let desc = "Queries the given property of the platform."; let details = [ "`olGetPlatformInfoSize` can be used to query the storage size " @@ -61,8 +58,7 @@ def : Function { ]; } -def : Function { - let name = "olGetPlatformInfoSize"; +def olGetPlatformInfoSize : Function { let desc = "Returns the storage size of the given platform query."; let details = []; let params = [ diff --git a/offload/liboffload/API/Program.td 
b/offload/liboffload/API/Program.td index 0476fa1..1f48f65 100644 --- a/offload/liboffload/API/Program.td +++ b/offload/liboffload/API/Program.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateProgram"; +def olCreateProgram : Function { let desc = "Create a program for the device from the binary image pointed to by `ProgData`."; let details = [ "The provided `ProgData` will be copied and need not outlive the returned handle", @@ -25,8 +24,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olDestroyProgram"; +def olDestroyProgram : Function { let desc = "Destroy the program and free all underlying resources."; let details = []; let params = [ diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td index 1d9f6f2..ededa9c 100644 --- a/offload/liboffload/API/Queue.td +++ b/offload/liboffload/API/Queue.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Function { - let name = "olCreateQueue"; +def olCreateQueue : Function { let desc = "Create a queue for the given device."; let details = []; let params = [ @@ -21,8 +20,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olDestroyQueue"; +def olDestroyQueue : Function { let desc = "Destroy the queue and free all underlying resources."; let details = [ "Any work previously enqueued to the queue is still performed and any events generated for this queue remain valid." 
@@ -33,8 +31,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olSyncQueue"; +def olSyncQueue : Function { let desc = "Block the calling thread until the enqueued work on a queue is complete."; let details = []; let params = [ @@ -43,8 +40,7 @@ def : Function { let returns = []; } -def : Function { - let name = "olWaitEvents"; +def olWaitEvents : Function { let desc = "Make any future work submitted to this queue wait until the provided events are complete."; let details = [ "All events in `Events` must complete before the queue is unblocked.", @@ -60,8 +56,7 @@ def : Function { ]; } -def : Enum { - let name = "ol_queue_info_t"; +def ol_queue_info_t : Enum { let desc = "Supported queue info."; let is_typed = 1; let etors = [ @@ -70,8 +65,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetQueueInfo"; +def olGetQueueInfo : Function { let desc = "Queries the given property of the queue."; let details = [ "`olGetQueueInfoSize` can be used to query the storage size " @@ -95,8 +89,7 @@ def : Function { ]; } -def : Function { - let name = "olGetQueueInfoSize"; +def olGetQueueInfoSize : Function { let desc = "Returns the storage size of the given queue query."; let details = []; let params = [ @@ -108,3 +101,27 @@ def : Function { Return<"OL_ERRC_INVALID_QUEUE"> ]; } + +def ol_host_function_cb_t : FptrTypedef { + let desc = "Host function for use by `olLaunchHostFunction`."; + let params = [ + Param<"void *", "UserData", "user specified data passed into `olLaunchHostFunction`.", PARAM_IN>, + ]; + let return = "void"; +} + +def olLaunchHostFunction : Function { + let desc = "Enqueue a callback function on the host."; + let details = [ + "The provided function will be called from the same process as the one that called `olLaunchHostFunction`.", + "The callback will not run until all previous work submitted to the queue has completed.", + "The callback must return before any work submitted to the queue after it is started.", + "The callback 
must not call any liboffload API functions or any backend specific functions (such as Cuda or HSA library functions).", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>, + Param<"ol_host_function_cb_t", "Callback", "the callback function to call on the host", PARAM_IN>, + Param<"void *", "UserData", "a pointer that will be passed verbatim to the callback function", PARAM_IN_OPTIONAL>, + ]; + let returns = []; +} diff --git a/offload/liboffload/API/Symbol.td b/offload/liboffload/API/Symbol.td index 2e94d70..c57a2e1 100644 --- a/offload/liboffload/API/Symbol.td +++ b/offload/liboffload/API/Symbol.td @@ -10,8 +10,7 @@ // //===----------------------------------------------------------------------===// -def : Enum { - let name = "ol_symbol_kind_t"; +def ol_symbol_kind_t : Enum { let desc = "The kind of a symbol"; let etors =[ Etor<"KERNEL", "a kernel object">, @@ -19,8 +18,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetSymbol"; +def olGetSymbol : Function { let desc = "Get a symbol (kernel or global variable) identified by `Name` in the given program."; let details = [ "Symbol handles are owned by the program and do not need to be manually destroyed." 
@@ -34,8 +32,7 @@ def : Function { let returns = []; } -def : Enum { - let name = "ol_symbol_info_t"; +def ol_symbol_info_t : Enum { let desc = "Supported symbol info."; let is_typed = 1; let etors = [ @@ -45,8 +42,7 @@ def : Enum { ]; } -def : Function { - let name = "olGetSymbolInfo"; +def olGetSymbolInfo : Function { let desc = "Queries the given property of the symbol."; let details = [ "`olGetSymbolInfoSize` can be used to query the storage size " @@ -73,8 +69,7 @@ def : Function { ]; } -def : Function { - let name = "olGetSymbolInfoSize"; +def olGetSymbolInfoSize : Function { let desc = "Returns the storage size of the given symbol query."; let details = []; let params = [ diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index f5365ca..7e8e297 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -47,10 +47,59 @@ struct ol_device_impl_t { ol_platform_handle_t Platform, InfoTreeNode &&DevInfo) : DeviceNum(DeviceNum), Device(Device), Platform(Platform), Info(std::forward<InfoTreeNode>(DevInfo)) {} + + ~ol_device_impl_t() { + assert(!OutstandingQueues.size() && + "Device object dropped with outstanding queues"); + } + int DeviceNum; GenericDeviceTy *Device; ol_platform_handle_t Platform; InfoTreeNode Info; + + llvm::SmallVector<__tgt_async_info *> OutstandingQueues; + std::mutex OutstandingQueuesMutex; + + /// If the device has any outstanding queues that are now complete, remove it + /// from the list and return it. + /// + /// Queues may be added to the outstanding queue list by olDestroyQueue if + /// they are destroyed but not completed. + __tgt_async_info *getOutstandingQueue() { + // Not locking the `size()` access is fine here - In the worst case we + // either miss a queue that exists or loop through an empty array after + // taking the lock. Both are sub-optimal but not that bad. 
+ if (OutstandingQueues.size()) { + std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex); + + // As queues are pulled and popped from this list, longer running queues + // naturally bubble to the start of the array. Hence looping backwards. + for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend(); + Q++) { + // hasPendingWork yields an Expected<bool>: test the contained value, + // not merely whether the query itself succeeded. + auto Pending = Device->hasPendingWork(*Q); + if (!Pending) { + // This lookup is best-effort; drop the error and keep the queue in + // the list so that destroy() still synchronizes it later. + llvm::consumeError(Pending.takeError()); + continue; + } + if (!*Pending) { + auto OutstandingQueue = *Q; + *Q = OutstandingQueues.back(); + OutstandingQueues.pop_back(); + return OutstandingQueue; + } + } + } + return nullptr; + } + + /// Complete all pending work for this device and perform any needed cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this device handle. + llvm::Error destroy() { + llvm::Error Result = Plugin::success(); + for (auto Q : OutstandingQueues) + if (auto Err = Device->synchronize(Q, /*Release=*/true)) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + OutstandingQueues.clear(); + return Result; + } }; struct ol_platform_impl_t { @@ -58,23 +107,51 @@ struct ol_platform_impl_t { ol_platform_backend_t BackendType) : Plugin(std::move(Plugin)), BackendType(BackendType) {} std::unique_ptr<GenericPluginTy> Plugin; - std::vector<ol_device_impl_t> Devices; + llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices; ol_platform_backend_t BackendType; + + /// Complete all pending work for this platform and perform any needed + /// cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this platform handle. 
+ llvm::Error destroy() { + llvm::Error Result = Plugin::success(); + for (auto &D : Devices) + if (auto Err = D->destroy()) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); + + if (auto Res = Plugin->deinit()) + Result = llvm::joinErrors(std::move(Result), std::move(Res)); + + return Result; + } }; struct ol_queue_impl_t { ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device) - : AsyncInfo(AsyncInfo), Device(Device) {} + : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {} __tgt_async_info *AsyncInfo; ol_device_handle_t Device; + // A unique identifier for the queue + size_t Id; + static std::atomic<size_t> IdCounter; }; +std::atomic<size_t> ol_queue_impl_t::IdCounter(0); struct ol_event_impl_t { - ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue) - : EventInfo(EventInfo), Queue(Queue) {} + ol_event_impl_t(void *EventInfo, ol_device_handle_t Device, + ol_queue_handle_t Queue) + : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) { + } // EventInfo may be null, in which case the event should be considered always // complete void *EventInfo; + ol_device_handle_t Device; + size_t QueueId; + // Events may outlive the queue - don't assume this is always valid. + // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check + // for queue equality instead. 
ol_queue_handle_t Queue; }; @@ -125,12 +202,13 @@ struct OffloadContext { bool TracingEnabled = false; bool ValidationEnabled = true; DenseMap<void *, AllocInfo> AllocInfoMap{}; + std::mutex AllocInfoMapMutex{}; SmallVector<ol_platform_impl_t, 4> Platforms{}; size_t RefCount; ol_device_handle_t HostDevice() { // The host platform is always inserted last - return &Platforms.back().Devices[0]; + return Platforms.back().Devices[0].get(); } static OffloadContext &get() { @@ -189,8 +267,8 @@ Error initPlugins(OffloadContext &Context) { auto Info = Device->obtainInfoImpl(); if (auto Err = Info.takeError()) return Err; - Platform.Devices.emplace_back(DevNum, Device, &Platform, - std::move(*Info)); + Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>( + DevNum, Device, &Platform, std::move(*Info))); } } } @@ -198,7 +276,8 @@ Error initPlugins(OffloadContext &Context) { // Add the special host device auto &HostPlatform = Context.Platforms.emplace_back( ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST}); - HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{}); + HostPlatform.Devices.emplace_back( + std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{})); Context.HostDevice()->Platform = &HostPlatform; Context.TracingEnabled = std::getenv("OFFLOAD_TRACE"); @@ -239,7 +318,7 @@ Error olShutDown_impl() { if (!P.Plugin || !P.Plugin->is_initialized()) continue; - if (auto Res = P.Plugin->deinit()) + if (auto Res = P.destroy()) Result = llvm::joinErrors(std::move(Result), std::move(Res)); } @@ -302,10 +381,57 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, }; // These are not implemented by the plugin interface - if (PropName == OL_DEVICE_INFO_PLATFORM) + switch (PropName) { + case OL_DEVICE_INFO_PLATFORM: return Info.write<void *>(Device->Platform); - if (PropName == OL_DEVICE_INFO_TYPE) + + case OL_DEVICE_INFO_TYPE: return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_GPU); + + case OL_DEVICE_INFO_SINGLE_FP_CONFIG: 
+ case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: { + ol_device_fp_capability_flags_t flags{0}; + flags |= OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | + OL_DEVICE_FP_CAPABILITY_FLAG_FMA; + return Info.write(flags); + } + + case OL_DEVICE_INFO_HALF_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>(0); + + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + return Info.write<uint32_t>(1); + + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + return Info.write<uint32_t>(0); + + // None of the existing plugins specify a limit on a single allocation, + // so return the global memory size instead + case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + [[fallthrough]]; + // AMD doesn't provide the global memory size (trivially) with the device info + // struct, so use the plugin interface + case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: { + uint64_t Mem; + if (auto Err = Device->Device->getDeviceMemorySize(Mem)) + return Err; + return Info.write<uint64_t>(Mem); + } break; + + default: + break; + } + if (PropName >= OL_DEVICE_INFO_LAST) return createOffloadError(ErrorCode::INVALID_ENUMERATION, "getDeviceInfo enum '%i' is invalid", PropName); @@ -316,8 +442,10 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, "plugin did not provide a response for this information"); auto Entry = *EntryOpt; + // Retrieve properties from the plugin interface switch (PropName) { case OL_DEVICE_INFO_NAME: + case OL_DEVICE_INFO_PRODUCT_NAME: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ 
-327,7 +455,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.writeString(std::get<std::string>(Entry->Value).c_str()); } - case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + case OL_DEVICE_INFO_MAX_WORK_SIZE: + case OL_DEVICE_INFO_VENDOR_ID: + case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: + case OL_DEVICE_INFO_ADDRESS_BITS: + case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: { // Uint32 values if (!std::holds_alternative<uint64_t>(Entry->Value)) return makeError(ErrorCode::BACKEND_FAILURE, @@ -339,6 +473,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast<uint32_t>(Value)); } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples ol_dimensions_t Out{0, 0, 0}; @@ -377,6 +512,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, assert(Device == OffloadContext::get().HostDevice()); InfoWriter Info(PropSize, PropValue, PropSizeRet); + constexpr auto uint32_max = std::numeric_limits<uint32_t>::max(); + switch (PropName) { case OL_DEVICE_INFO_PLATFORM: return Info.write<void *>(Device->Platform); @@ -384,14 +521,52 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST); case OL_DEVICE_INFO_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_PRODUCT_NAME: + return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: return Info.writeString(LLVM_VERSION_STRING); case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - return Info.write<uint64_t>(1); + return Info.write<uint32_t>(1); case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1}); + case OL_DEVICE_INFO_MAX_WORK_SIZE: + return Info.write<uint32_t>(uint32_max); + case 
OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: + return Info.write<ol_dimensions_t>( + ol_dimensions_t{uint32_max, uint32_max, uint32_max}); + case OL_DEVICE_INFO_VENDOR_ID: + return Info.write<uint32_t>(0); + case OL_DEVICE_INFO_NUM_COMPUTE_UNITS: + return Info.write<uint32_t>(1); + case OL_DEVICE_INFO_SINGLE_FP_CONFIG: + case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>( + OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | OL_DEVICE_FP_CAPABILITY_FLAG_FMA); + case OL_DEVICE_INFO_HALF_FP_CONFIG: + return Info.write<ol_device_fp_capability_flags_t>(0); + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + return Info.write<uint32_t>(1); + case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + return Info.write<uint32_t>(0); + case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: + // The virtual host device has no clock rates to report; do not fall + // through to the pointer-width value used for ADDRESS_BITS. + return Info.write<uint32_t>(0); + case OL_DEVICE_INFO_ADDRESS_BITS: + return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits); + case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: + return Info.write<uint64_t>(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, "getDeviceInfo enum '%i' is invalid", PropName); @@ -420,7 +595,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device, Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) { for (auto &Platform : OffloadContext::get().Platforms) { for (auto &Device : Platform.Devices) { - if (!Callback(&Device, UserData)) { + if (!Callback(Device.get(), UserData)) { 
break; } } @@ -449,39 +624,78 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, return Alloc.takeError(); *AllocationOut = *Alloc; - OffloadContext::get().AllocInfoMap.insert_or_assign(*Alloc, - AllocInfo{Device, Type}); + { + std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); + OffloadContext::get().AllocInfoMap.insert_or_assign( + *Alloc, AllocInfo{Device, Type}); + } return Error::success(); } Error olMemFree_impl(void *Address) { - if (!OffloadContext::get().AllocInfoMap.contains(Address)) - return createOffloadError(ErrorCode::INVALID_ARGUMENT, - "address is not a known allocation"); - - auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address); - auto Device = AllocInfo.Device; - auto Type = AllocInfo.Type; + ol_device_handle_t Device; + ol_alloc_type_t Type; + { + std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); + if (!OffloadContext::get().AllocInfoMap.contains(Address)) + return createOffloadError(ErrorCode::INVALID_ARGUMENT, + "address is not a known allocation"); + + auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address); + Device = AllocInfo.Device; + Type = AllocInfo.Type; + OffloadContext::get().AllocInfoMap.erase(Address); + } if (auto Res = Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type))) return Res; - OffloadContext::get().AllocInfoMap.erase(Address); - return Error::success(); } Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) { auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device); - if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) + + auto OutstandingQueue = Device->getOutstandingQueue(); + if (OutstandingQueue) { + // The queue is empty, but we still need to sync it to release any temporary + // memory allocations or do other cleanup. 
+ if (auto Err = + Device->Device->synchronize(OutstandingQueue, /*Release=*/false)) + return Err; + CreatedQueue->AsyncInfo = OutstandingQueue; + } else if (auto Err = + Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) { return Err; + } *Queue = CreatedQueue.release(); return Error::success(); } -Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); } +Error olDestroyQueue_impl(ol_queue_handle_t Queue) { + auto *Device = Queue->Device; + // This is safe; as soon as olDestroyQueue is called it is not possible to add + // any more work to the queue, so if it's finished now it will remain finished + // forever. + auto Res = Device->Device->hasPendingWork(Queue->AsyncInfo); + if (!Res) + return Res.takeError(); + + if (!*Res) { + // The queue is complete, so sync it and throw it back into the pool. + if (auto Err = Device->Device->synchronize(Queue->AsyncInfo, + /*Release=*/true)) + return Err; + } else { + // The queue still has outstanding work. Store it so we can check it later. 
+ std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex); + Device->OutstandingQueues.push_back(Queue->AsyncInfo); + } + + return olDestroy(Queue); +} Error olSyncQueue_impl(ol_queue_handle_t Queue) { // Host plugin doesn't have a queue set so it's not safe to call synchronize @@ -509,7 +723,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events, "olWaitEvents asked to wait on a NULL event"); // Do nothing if the event is for this queue or the event is always complete - if (Event->Queue == Queue || !Event->EventInfo) + if (Event->QueueId == Queue->Id || !Event->EventInfo) continue; if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo)) @@ -553,11 +767,11 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName, } Error olSyncEvent_impl(ol_event_handle_t Event) { + // No event info means that this event was complete on creation if (!Event->EventInfo) - // Event always complete return Plugin::success(); - if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo)) return Res; return Error::success(); @@ -565,7 +779,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) { Error olDestroyEvent_impl(ol_event_handle_t Event) { if (Event->EventInfo) - if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo)) + if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo)) return Res; return olDestroy(Event); @@ -575,10 +789,22 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event, ol_event_info_t PropName, size_t PropSize, void *PropValue, size_t *PropSizeRet) { InfoWriter Info(PropSize, PropValue, PropSizeRet); + auto Queue = Event->Queue; switch (PropName) { case OL_EVENT_INFO_QUEUE: - return Info.write<ol_queue_handle_t>(Event->Queue); + return Info.write<ol_queue_handle_t>(Queue); + case OL_EVENT_INFO_IS_COMPLETE: { + // No event info means that this event was complete on creation + 
if (!Event->EventInfo) + return Info.write<bool>(true); + + auto Res = Queue->Device->Device->isEventComplete(Event->EventInfo, + Queue->AsyncInfo); + if (auto Err = Res.takeError()) + return Err; + return Info.write<bool>(*Res); + } default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, "olGetEventInfo enum '%i' is invalid", PropName); @@ -604,7 +830,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) { if (auto Err = Pending.takeError()) return Err; - *EventOut = new ol_event_impl_t(nullptr, Queue); + *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue); if (!*Pending) // Queue is empty, don't record an event and consider the event always // complete @@ -656,6 +882,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, return Error::success(); } +Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize, + const void *PatternPtr, size_t FillSize) { + return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize, + Queue->AsyncInfo); +} + Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData, size_t ProgDataSize, ol_program_handle_t *Program) { // Make a copy of the program binary in case it is released by the caller. 
@@ -696,6 +928,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) { return olDestroy(Program); } +Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device, + ol_symbol_handle_t Kernel, + size_t DynamicMemSize, + size_t *GroupSize) { + if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL) + return createOffloadError(ErrorCode::SYMBOL_KIND, + "provided symbol is not a kernel"); + auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl); + + auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize); + if (auto Err = Res.takeError()) + return Err; + + *GroupSize = *Res; + + return Error::success(); +} + Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, @@ -765,7 +1015,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name, return Error::success(); } case OL_SYMBOL_KIND_GLOBAL_VARIABLE: { - auto &Global = Program->KernelSymbols[Name]; + auto &Global = Program->GlobalSymbols[Name]; if (!Global) { GlobalTy GlobalObj{Name}; if (auto Res = @@ -833,5 +1083,12 @@ Error olGetSymbolInfoSize_impl(ol_symbol_handle_t Symbol, return olGetSymbolInfoImplDetail(Symbol, PropName, 0, nullptr, PropSizeRet); } +Error olLaunchHostFunction_impl(ol_queue_handle_t Queue, + ol_host_function_cb_t Callback, + void *UserData) { + return Queue->Device->Device->enqueueHostCall(Callback, UserData, + Queue->AsyncInfo); +} + } // namespace offload } // namespace llvm diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index f88e30a..6585286 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -191,6 +191,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } +int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) { + return RTL->data_fence(RTLDeviceID, AsyncInfo); +} + int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { 
DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index e9b148d..fe18289 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -30,6 +30,7 @@ #include <cstdint> #include <cstdio> #include <cstdlib> +#include <memory> #ifdef OMPT_SUPPORT using namespace llvm::omp::target::ompt; @@ -165,12 +166,24 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, OMPT_GET_RETURN_ADDRESS);) int Rc = OFFLOAD_SUCCESS; + + // Only allocate AttachInfo for targetDataBegin + std::unique_ptr<AttachInfoTy> AttachInfo; + if (TargetDataFunction == targetDataBegin) + AttachInfo = std::make_unique<AttachInfoTy>(); + Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo, - false /*FromMapper=*/); + AttachInfo.get(), /*FromMapper=*/false); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) { + // Process deferred ATTACH entries BEFORE synchronization + if (AttachInfo && !AttachInfo->AttachEntries.empty()) + Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo); + + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 5b25d95..4c8eba1 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, void *ArgMapper, AsyncInfoTy &AsyncInfo, - TargetDataFuncPtrTy TargetDataFunction) { + TargetDataFuncPtrTy TargetDataFunction, + AttachInfoTy *AttachInfo = nullptr) { DP("Calling the mapper 
function " DPxMOD "\n", DPxPTR(ArgMapper)); // The mapper function fills up Components. @@ -324,17 +325,184 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, - AsyncInfo, /*FromMapper=*/true); + AsyncInfo, AttachInfo, /*FromMapper=*/true); return Rc; } +/// Utility function to perform a pointer attachment operation. +/// +/// For something like: +/// ```cpp +/// int *p; +/// ... +/// #pragma omp target enter data map(to:p[10:10]) +/// ``` +/// +/// for which the attachment operation gets represented using: +/// ``` +/// &p, &p[10], sizeof(p), ATTACH +/// ``` +/// +/// (Hst|Tgt)PtrAddr represents &p +/// (Hst|Tgt)PteeBase represents &p[0] +/// (Hst|Tgt)PteeBegin represents &p[10] +/// +/// This function first computes the expected TgtPteeBase using: +/// `<Select>TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)` +/// +/// and then attaches TgtPteeBase to TgtPtrAddr. +/// +/// \p HstPtrSize represents the size of the pointer p. For C/C++, this +/// should be same as "sizeof(void*)" (say 8). +/// +/// However, for Fortran, pointers/allocatables, which are also eligible for +/// "pointer-attachment", may be implemented using descriptors that contain the +/// address of the pointee in the first 8 bytes, but also contain other +/// information such as lower-bound/upper-bound etc in their subsequent fields. +/// +/// For example, for the following: +/// ```fortran +/// integer, allocatable :: x(:) +/// integer, pointer :: p(:) +/// ... +/// p => x(10: 19) +/// ... +/// !$omp target enter data map(to:p(:)) +/// ``` +/// +/// The map should trigger a pointer-attachment (assuming the pointer-attachment +/// conditions as noted on processAttachEntries are met) between the descriptor +/// for p, and its pointee data. 
+/// +/// Since only the first 8 bytes of the descriptor contain the address of the +/// pointee, an attachment operation on device descriptors involves: +/// * Setting the first 8 bytes of the device descriptor to point to the device +/// address of the pointee. +/// * Copying the remaining information about bounds/offset etc. from the host +/// descriptor to the device descriptor. +/// +/// The function also handles the pointer-attachment portion of PTR_AND_OBJ maps, +/// like: +/// ``` +/// &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ +/// ``` +/// by using `sizeof(void*)` as \p HstPtrSize. +static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, + void **HstPtrAddr, void *HstPteeBase, + void *HstPteeBegin, void **TgtPtrAddr, + void *TgtPteeBegin, int64_t HstPtrSize, + TargetPointerResultTy &PtrTPR) { + assert(PtrTPR.getEntry() && + "Need a valid pointer entry to perform pointer-attachment"); + + int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small"); + + uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) - + reinterpret_cast<uint64_t>(HstPteeBase); + void *TgtPteeBase = reinterpret_cast<void *>( + reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta); + DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD + ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", + DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); + DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD + "\n", + DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + + // Add shadow pointer tracking + // TODO: Support shadow-tracking of larger than VoidPtrSize pointers, + // to support restoration of Fortran descriptors. Currently, this check + // would return false, even if the host Fortran descriptor had been + // updated since its previous map, and we should have updated its + // device counterpart. e.g. + // + // !$omp target enter data map(x(1:100)) ! 
(1) + // p => x(10: 19) + // !$omp target enter data map(p, p(:)) ! (2) + // p => x(5: 9) + // !$omp target enter data map(attach(always): p(:)) ! (3) + // + // While PtrAddr(&desc_p) and PteeBase(&p(1)) are same for (2) and (3), the + // pointer attachment for (3) needs to update the bounds information + // in the descriptor of p on device. + if (!PtrTPR.getEntry()->addShadowPointer( + ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase})) { + DP("Pointer " DPxMOD " is already attached to " DPxMOD "\n", + DPxPTR(TgtPtrAddr), DPxPTR(TgtPteeBase)); + return OFFLOAD_SUCCESS; + } + + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr), + DPxPTR(TgtPteeBase)); + + // Lambda to handle submitData result and perform final steps. + auto HandleSubmitResult = [&](int SubmitResult) -> int { + if (SubmitResult != OFFLOAD_SUCCESS) { + REPORT("Failed to update pointer on device.\n"); + return OFFLOAD_FAIL; + } + + if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) != + OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + }; + + bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize; + if (!IsPtrAFortranDescriptor) { + // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as + // the buffer space for the submission. + void *&BufferElement = AsyncInfo.getVoidPtrLocation(); + BufferElement = TgtPteeBase; + + // Submit the updated pointer value to device + return HandleSubmitResult(Device.submitData( + TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry())); + } + + // For larger "pointers" (like Fortran's descriptors), we create a dynamic + // buffer, which will be eventually destroyed by AsyncInfo's post-processing + // callback. + char *DataBuffer = new char[HstPtrSize]; + + // For such descriptors, to the first VoidPtrSize bytes, we store the + // pointee's device address. 
+ std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *)); + + // And to the remaining bytes, we copy the remaining contents of the host + // descriptor after the initial VoidPtrSize bytes. + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = + reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize; + std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + + DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64 + " additional bytes from host descriptor " DPxMOD ")\n", + HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, + DPxPTR(HstDescriptorFieldsAddr)); + + // Submit the entire buffer to device + int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, + AsyncInfo, PtrTPR.getEntry()); + + AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int { + delete[] DataBuffer; + return OFFLOAD_SUCCESS; + }); + return HandleSubmitResult(SubmitResult); +} + /// Internal function to do the mapping and transfer the data to the device int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper) { + AttachInfoTy *AttachInfo, bool FromMapper) { + assert(AttachInfo && "AttachInfo must be available for targetDataBegin for " + "handling ATTACH map-types."); // process each input. for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. @@ -352,7 +520,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, - targetDataBegin); + targetDataBegin, AttachInfo); if (Rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -369,6 +537,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t DataSize = ArgSizes[I]; map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; + // ATTACH map-types are supposed to be handled after all mapping for the + // construct is done. Defer their processing. + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + AttachInfo->AttachEntries.emplace_back( + /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, + /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], + /*PointeeName=*/HstPtrName); + + DP("Deferring ATTACH map-type processing for argument %d\n", I); + continue; + } + // Adjust for proper alignment if this is a combined entry (for structs). // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. @@ -434,13 +614,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. + if (PointerTpr.Flags.IsNewEntry && !IsHostPtr) + AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *); + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), (PointerTpr.Flags.IsNewEntry ? "" : " not")); PointerHstPtrBegin = HstPtrBase; // modify current entry. - HstPtrBase = *(void **)HstPtrBase; + HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); // No need to update pointee ref count for the first element of the // subelement that comes from mapper. 
UpdateRef = @@ -464,6 +649,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, : "device failure or illegal mapping"); return OFFLOAD_FAIL; } + + // Track new allocation, for eventual use in attachment decision-making. + if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin) + AttachInfo->NewAllocations[HstPtrBegin] = DataSize; + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD " - is%s new\n", DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not")); @@ -476,30 +666,13 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, } if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - - if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{ - (void **)PointerHstPtrBegin, HstPtrBase, - (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) { - DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); - - void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); - TgtPtrBase = ExpectedTgtPtrBase; - - int Ret = - Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *), - AsyncInfo, PointerTpr.getEntry()); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) != - OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + int Ret = performPointerAttachment( + Device, AsyncInfo, reinterpret_cast<void **>(PointerHstPtrBegin), + HstPtrBase, HstPtrBegin, + reinterpret_cast<void **>(PointerTgtPtrBegin), TgtPtrBegin, + sizeof(void *), PointerTpr); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; } // Check if variable can be used on the device: @@ -515,6 +688,189 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, return OFFLOAD_SUCCESS; } +/// Process deferred ATTACH map entries collected 
during targetDataBegin. +/// +/// From OpenMP's perspective, when mapping something that has a base pointer, +/// such as: +/// ```cpp +/// int *p; +/// #pragma omp target enter data map(to: p[10:20]) +/// ``` +/// +/// a pointer-attachment between p and &p[10] should occur if both p and +/// p[10] are present on the device after doing all allocations for all maps +/// on the construct, and one of the following is true: +/// +/// * The pointer p was newly allocated while handling the construct +/// * The pointee p[10:20] was newly allocated while handling the construct +/// * attach(always) map-type modifier was specified (OpenMP 6.1) +/// +/// That's why we collect all attach entries and new memory allocations during +/// targetDataBegin, and use that information to make the decision of whether +/// to perform a pointer-attachment or not here, after maps have been handled. +/// +/// Additionally, once we decide that a pointer-attachment should be performed, +/// we need to make sure that it happens after any previously submitted data +/// transfers have completed, to avoid the possibility of the pending transfers +/// clobbering the attachment. For example: +/// +/// ```cpp +/// int *p = ...; +/// int **pp = &p; +/// map(to: pp[0], p[0]) +/// ``` +/// +/// Which would be represented by: +/// ``` +/// &pp[0], &pp[0], sizeof(pp[0]), TO (1) +/// &p[0], &p[0], sizeof(p[0]), TO (2) +/// +/// &pp, &pp[0], sizeof(pp), ATTACH (3) +/// &p, &p[0], sizeof(p), ATTACH (4) +/// ``` +/// +/// (4) and (1) are both trying to modify the device memory corresponding to +/// `&p`. So, if we decide that (4) should do an attachment, we also need to +/// ensure that (4) happens after (1) is complete. +/// +/// For this purpose, we insert a data_fence before the first +/// pointer-attachment, (3), to ensure that all pending transfers finish first. 
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo) { + // Report all tracked allocations from both main loop and ATTACH processing + if (!AttachInfo.NewAllocations.empty()) { + DP("Tracked %u total new allocations:\n", + (unsigned)AttachInfo.NewAllocations.size()); + for (const auto &Alloc : AttachInfo.NewAllocations) { + DP(" Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n", + DPxPTR(Alloc.first), Alloc.second); + } + } + + if (AttachInfo.AttachEntries.empty()) + return OFFLOAD_SUCCESS; + + DP("Processing %zu deferred ATTACH map entries\n", + AttachInfo.AttachEntries.size()); + + int Ret = OFFLOAD_SUCCESS; + bool IsFirstPointerAttachment = true; + for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size(); + ++EntryIdx) { + const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx]; + + void **HstPtr = reinterpret_cast<void **>(AttachEntry.PointerBase); + + void *HstPteeBase = *HstPtr; + void *HstPteeBegin = AttachEntry.PointeeBegin; + + int64_t PtrSize = AttachEntry.PointerSize; + int64_t MapType = AttachEntry.MapType; + + DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD + ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n", + EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType); + + const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS; + + // Lambda to check if a pointer was newly allocated + auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) { + bool IsNewlyAllocated = + llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) { + void *AllocPtr = Alloc.first; + int64_t AllocSize = Alloc.second; + return Ptr >= AllocPtr && + Ptr < reinterpret_cast<void *>( + reinterpret_cast<char *>(AllocPtr) + AllocSize); + }); + DP("Attach %s " DPxMOD " was newly allocated: %s\n", PtrName, DPxPTR(Ptr), + IsNewlyAllocated ? 
"yes" : "no"); + return IsNewlyAllocated; + }; + + // Only process ATTACH if either the pointee or the pointer was newly + // allocated, or the ALWAYS flag is set. + if (!IsAttachAlways && !WasNewlyAllocated(HstPteeBegin, "pointee") && + !WasNewlyAllocated(HstPtr, "pointer")) { + DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly " + "allocated and no ALWAYS flag\n", + EntryIdx); + continue; + } + + // Lambda to perform target pointer lookup and validation + auto LookupTargetPointer = + [&](void *Ptr, int64_t Size, + const char *PtrType) -> std::optional<TargetPointerResultTy> { + // ATTACH map-type does not change ref-count, or do any allocation + // We just need to do a lookup for the pointer/pointee. + TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin( + Ptr, Size, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false, /*MustContain=*/true); + + DP("Attach %s lookup - IsPresent=%s, IsHostPtr=%s\n", PtrType, + TPR.isPresent() ? "yes" : "no", + TPR.Flags.IsHostPointer ? "yes" : "no"); + + if (!TPR.isPresent()) { + DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx, + PtrType); + return std::nullopt; + } + if (TPR.Flags.IsHostPointer) { + DP("Skipping ATTACH entry %zu: device version of the %s is a host " + "pointer.\n", + EntryIdx, PtrType); + return std::nullopt; + } + + return TPR; + }; + + // Get device version of the pointee (e.g., &p[10]) first, as we can + // release its TPR after extracting the pointer value. + void *TgtPteeBegin = [&]() -> void * { + if (auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee")) + return PteeTPROpt->TargetPointer; + return nullptr; + }(); + + if (!TgtPteeBegin) + continue; + + // Get device version of the pointer (e.g., &p) next. We need to keep its + // TPR for use in shadow-pointer handling during pointer-attachment. 
+ auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer"); + if (!PtrTPROpt) + continue; + TargetPointerResultTy &PtrTPR = *PtrTPROpt; + void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer); + + // Insert a data-fence before the first pointer-attachment. + if (IsFirstPointerAttachment) { + IsFirstPointerAttachment = false; + DP("Inserting a data fence before the first pointer attachment.\n"); + Ret = Device.dataFence(AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to insert data fence.\n"); + return OFFLOAD_FAIL; + } + } + + // Do the pointer-attachment, i.e. update the device pointer to point to + // device pointee. + Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase, + HstPteeBegin, TgtPtrBase, TgtPteeBegin, + PtrSize, PtrTPR); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + DP("ATTACH entry %zu processed successfully\n", EntryIdx); + } + + return OFFLOAD_SUCCESS; +} + namespace { /// This structure contains information to deallocate a target pointer, aka. /// used to fix up the shadow map and potentially delete the entry from the @@ -624,7 +980,8 @@ postProcessingTargetDataEnd(DeviceTy *Device, int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { int Ret = OFFLOAD_SUCCESS; auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>(); // process each input. @@ -635,6 +992,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; + // Ignore ATTACH entries - they should only be honored on map-entering + // directives. They may be encountered here while handling the "end" part of + // "#pragma omp target". 
+ if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { + DP("Ignoring ATTACH entry %d in targetDataEnd\n", I); + continue; + } + if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataEnd, call the // targetDataMapper variant which will call targetDataEnd again @@ -900,7 +1265,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { + void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo, bool FromMapper) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -1213,13 +1579,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + // Create AttachInfo for tracking any ATTACH entries, or new-allocations + // when handling the "begin" mapping for a target construct. 
+ AttachInfoTy AttachInfo; + int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AsyncInfo, + &AttachInfo, false /*FromMapper=*/); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } + // Process collected ATTACH entries + if (!AttachInfo.AttachEntries.empty()) { + Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process ATTACH entries.\n"); + return OFFLOAD_FAIL; + } + } + // List of (first-)private arrays allocated for this target region SmallVector<int> TgtArgsPositions(ArgNum, -1); diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h index 0b3d545..90e5e17 100644 --- a/offload/libomptarget/private.h +++ b/offload/libomptarget/private.h @@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId, const char *Type = nullptr; const char *Implicit = (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? 
"(implicit)" : ""; - if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) + + if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH && + ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS) + Type = "attach:always"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) + Type = "attach"; + else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && + ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) Type = "tofrom"; else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) Type = "to"; diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h index 61f680b..ad135f7 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h @@ -71,9 +71,15 @@ typedef enum { } hsa_isa_info_t; typedef enum { + HSA_MACHINE_MODEL_SMALL = 0, + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +typedef enum { HSA_AGENT_INFO_NAME = 0, HSA_AGENT_INFO_VENDOR_NAME = 1, HSA_AGENT_INFO_FEATURE = 2, + HSA_AGENT_INFO_MACHINE_MODEL = 3, HSA_AGENT_INFO_PROFILE = 4, HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 3117763..29cfe78 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -67,6 +67,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008, HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7961820..c26cfe9 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -570,6 +570,16 @@ 
struct AMDGPUKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + /// + /// TODO: This needs to be implemented for amdgpu + Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const override { + return Plugin::error( + ErrorCode::UNSUPPORTED, + "occupancy calculations for AMDGPU are not yet implemented"); + } + /// Print more elaborate kernel launch info for AMDGPU Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], @@ -914,6 +924,7 @@ private: void *Dst; const void *Src; size_t Size; + size_t NumTimes; }; /// Utility struct holding arguments for freeing buffers to memory managers. @@ -964,9 +975,14 @@ private: StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} /// Schedule a host memory copy action on the slot. - Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { + /// + /// Num times will repeat the copy that many times, sequentially in the dest + /// buffer. + Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size, + size_t NumTimes = 1) { Callbacks.emplace_back(memcpyAction); - ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; + ActionArgs.emplace_back().MemcpyArgs = + MemcpyArgsTy{Dst, Src, Size, NumTimes}; return Plugin::success(); } @@ -1063,6 +1079,20 @@ private: /// Indicate to spread data transfers across all available SDMAs bool UseMultipleSdmaEngines; + /// Wrapper function for implementing host callbacks + static void CallbackWrapper(AMDGPUSignalTy *InputSignal, + AMDGPUSignalTy *OutputSignal, + void (*Callback)(void *), void *UserData) { + // The wait call will not error in this context. 
+ if (InputSignal) + if (auto Err = InputSignal->wait()) + reportFatalInternalError(std::move(Err)); + + Callback(UserData); + + OutputSignal->signal(); + } + /// Return the current number of asynchronous operations on the stream. uint32_t size() const { return NextSlot; } @@ -1192,7 +1222,11 @@ private: assert(Args->Dst && "Invalid destination buffer"); assert(Args->Src && "Invalid source buffer"); - std::memcpy(Args->Dst, Args->Src, Args->Size); + auto BasePtr = Args->Dst; + for (size_t I = 0; I < Args->NumTimes; I++) { + std::memcpy(BasePtr, Args->Src, Args->Size); + BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size; + } return Plugin::success(); } @@ -1397,7 +1431,8 @@ public: /// manager once the operation completes. Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, uint64_t CopySize, - AMDGPUMemoryManagerTy &MemoryManager) { + AMDGPUMemoryManagerTy &MemoryManager, + size_t NumTimes = 1) { // Retrieve available signals for the operation's outputs. AMDGPUSignalTy *OutputSignals[2] = {}; if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) @@ -1419,7 +1454,8 @@ public: // The std::memcpy is done asynchronously using an async handler. We store // the function's information in the action but it is not actually a // post action. - if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) + if (auto Err = + Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes)) return Err; // Make changes on this slot visible to the async handler's thread. @@ -1440,7 +1476,11 @@ public: std::tie(Curr, InputSignal) = consume(OutputSignal); } else { // All preceding operations completed, copy the memory synchronously. - std::memcpy(Inter, Src, CopySize); + auto *InterPtr = Inter; + for (size_t I = 0; I < NumTimes; I++) { + std::memcpy(InterPtr, Src, CopySize); + InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize; + } // Return the second signal because it will not be used. 
OutputSignals[1]->decreaseUseCount(); @@ -1457,11 +1497,11 @@ public: if (InputSignal && InputSignal->load()) { hsa_signal_t InputSignalRaw = InputSignal->get(); return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 1, &InputSignalRaw, - OutputSignal->get()); + Agent, CopySize * NumTimes, 1, + &InputSignalRaw, OutputSignal->get()); } return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, - Agent, CopySize, 0, nullptr, + Agent, CopySize * NumTimes, 0, nullptr, OutputSignal->get()); } @@ -1495,6 +1535,31 @@ public: OutputSignal->get()); } + Error pushHostCallback(void (*Callback)(void *), void *UserData) { + // Retrieve an available signal for the operation's output. + AMDGPUSignalTy *OutputSignal = nullptr; + if (auto Err = SignalManager.getResource(OutputSignal)) + return Err; + OutputSignal->reset(); + OutputSignal->increaseUseCount(); + + AMDGPUSignalTy *InputSignal; + { + std::lock_guard<std::mutex> Lock(Mutex); + + // Consume stream slot and compute dependencies. + InputSignal = consume(OutputSignal).second; + } + + // "Leaking" the thread here is consistent with other work added to the + // queue. The input and output signals will remain valid until the output is + // signaled. + std::thread(CallbackWrapper, InputSignal, OutputSignal, Callback, UserData) + .detach(); + + return Plugin::success(); + } + /// Synchronize with the stream. The current thread waits until all operations /// are finalized and it performs the pending post actions (i.e., releasing /// intermediate buffers). @@ -1519,6 +1584,9 @@ public: /// actions for that and prior events. Error synchronizeOn(AMDGPUEventTy &Event); + /// Return true if the event from this queue is complete + Expected<bool> isEventComplete(const AMDGPUEventTy &Event); + /// Query the stream and complete pending post actions if operations finished. /// Return whether all the operations completed. This operation does not block /// the calling thread. 
@@ -1683,6 +1751,18 @@ Error AMDGPUStreamTy::synchronizeOn(AMDGPUEventTy &Event) { return completeUntil(Event.RecordedSlot); } +Expected<bool> AMDGPUStreamTy::isEventComplete(const AMDGPUEventTy &Event) { + std::lock_guard<std::mutex> Lock(Mutex); + assert(Event.RecordedStream == this && "event is for a different stream"); + + if (Event.RecordedSyncCycle < SyncCycle) { + return true; + } + assert(Event.RecordedSyncCycle == SyncCycle && "event is from the future?"); + + return !Slots[Event.RecordedSlot].Signal->load(); +} + struct AMDGPUStreamManagerTy final : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; @@ -2537,6 +2617,85 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for AMDGPU devices as operations inserted into + /// a queue are in-order. + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // Fast case, where we can use the 4 byte hsa_amd_memory_fill + if (Size % 4 == 0 && + (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) { + uint32_t Pattern; + if (PatternSize == 1) { + auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr); + Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24; + } else if (PatternSize == 2) { + auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr); + Pattern = *Word | (*Word << 16); + } else if (PatternSize == 4) { + Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr); + } else { + // Shouldn't be here if the pattern size is outwith those values + llvm_unreachable("Invalid pattern size"); + } + + if (hasPendingWorkImpl(AsyncInfoWrapper)) { + AMDGPUStreamTy *Stream = nullptr; + if (auto 
Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + struct MemFillArgsTy { + void *Dst; + uint32_t Pattern; + int64_t Size; + }; + auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4}; + auto Fill = [](void *Data) { + MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data); + assert(Args && "Invalid arguments"); + + auto Status = + hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size); + delete Args; + auto Err = + Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + if (Err) { + FATAL_MESSAGE(1, "error performing async fill: %s", + toString(std::move(Err)).data()); + } + }; + + // hsa_amd_memory_fill doesn't signal completion using a signal, so use + // the existing host callback logic to handle that instead + return Stream->pushHostCallback(Fill, Args); + } else { + // If there is no pending work, do the fill synchronously + auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4); + return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"); + } + } + + // Slow case; allocate an appropriate memory size and enqueue copies + void *PinnedPtr = nullptr; + AMDGPUMemoryManagerTy &PinnedMemoryManager = + HostDevice.getPinnedMemoryManager(); + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) + return Err; + + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr, + PatternSize, PinnedMemoryManager, + Size / PatternSize); + } + /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { // TODO: Implement this function. 
@@ -2553,6 +2712,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + AMDGPUStreamTy *Stream = nullptr; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + return Stream->pushHostCallback(Callback, UserData); + }; + /// Create an event. Error createEventImpl(void **EventPtrStorage) override { AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); @@ -2601,6 +2769,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Query.takeError(); } + Expected<bool> isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfo) override { + AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); + auto Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>(); + return Stream && Stream->isEventComplete(*Event); + } + /// Synchronize the current thread with the event. Error syncEventImpl(void *EventPtr) override { AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); @@ -2632,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Product Name", TmpChar); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) @@ -2642,6 +2817,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR); + Info.add("Vendor ID", uint64_t{4130}, "", DeviceInfo::VENDOR_ID); + + hsa_machine_model_t MachineModel; + Status = getDeviceAttrRaw(HSA_AGENT_INFO_MACHINE_MODEL, MachineModel); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Memory Address Size", + uint64_t{MachineModel == HSA_MACHINE_MODEL_SMALL ? 
32u : 64u}, + "bits", DeviceInfo::ADDRESS_BITS); + hsa_device_type_t DevType; Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); if (Status == HSA_STATUS_SUCCESS) { @@ -2692,11 +2876,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Max Clock Freq", TmpUInt, "MHz"); + Info.add("Max Clock Freq", TmpUInt, "MHz", + DeviceInfo::MAX_CLOCK_FREQUENCY); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Max Memory Clock Freq", TmpUInt, "MHz", + DeviceInfo::MEMORY_CLOCK_RATE); Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Compute Units", TmpUInt); + Info.add("Compute Units", TmpUInt, "", DeviceInfo::NUM_COMPUTE_UNITS); Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); if (Status == HSA_STATUS_SUCCESS) @@ -2734,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); if (Status == HSA_STATUS_SUCCESS) - Info.add("Grid Max Size", TmpUInt); + Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE); Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxDim = *Info.add("Grid Max Size per Dimension"); + auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{}, + "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); MaxDim.add("x", GridMaxDim.x); MaxDim.add("y", GridMaxDim.y); MaxDim.add("z", GridMaxDim.z); @@ -2778,7 +2969,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); if (Status == HSA_STATUS_SUCCESS) - PoolNode.add("Size", TmpSt, "bytes"); + PoolNode.add( + "Size", TmpSt, "bytes", + 
(Pool->isGlobal() && Pool->isCoarseGrained()) + ? std::optional<DeviceInfo>{DeviceInfo::GLOBAL_MEM_SIZE} + : std::nullopt); Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, TmpBool); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index c9ab34b..2c01ed2 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -388,6 +388,9 @@ struct GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const = 0; + /// Get the kernel name. const char *getName() const { return Name.c_str(); } @@ -431,6 +434,8 @@ protected: return "Generic"; case OMP_TGT_EXEC_MODE_GENERIC_SPMD: return "Generic-SPMD"; + case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP: + return "SPMD-No-Loop"; } llvm_unreachable("Unknown execution mode!"); } @@ -468,7 +473,8 @@ private: uint32_t BlockLimitClause[3], uint64_t LoopTripCount, uint32_t &NumThreads, bool IsNumThreadsFromUser) const; - /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. + /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop + /// or SPMD mode. bool isGenericSPMDMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; @@ -483,6 +489,10 @@ private: bool isBareMode() const { return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE; } + bool isNoLoopMode() const { + return KernelEnvironment.Configuration.ExecMode == + OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + } /// The kernel name. 
std::string Name; @@ -944,6 +954,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Instert a data fence between previous data operations and the following + /// operations if necessary for the device + virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0; + /// Exchange data between devices (device to device transfer). Calling this /// function is only valid if GenericPlugin::isDataExchangable() passing the /// two devices returns true. @@ -953,6 +967,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Fill data on the device with a pattern from the host + Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Run the kernel associated with \p EntryPtr Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); @@ -965,6 +986,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error initDeviceInfo(__tgt_device_info *DeviceInfo); virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + /// Enqueue a host call to AsyncInfo + Error enqueueHostCall(void (*Callback)(void *), void *UserData, + __tgt_async_info *AsyncInfo); + virtual Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) = 0; + /// Create an event. 
Error createEvent(void **EventPtrStorage); virtual Error createEventImpl(void **EventPtrStorage) = 0; @@ -984,6 +1011,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error waitEventImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Check if the event enqueued to AsyncInfo is complete + Expected<bool> isEventComplete(void *Event, __tgt_async_info *AsyncInfo); + virtual Expected<bool> + isEventCompleteImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Synchronize the current thread with the event. Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; @@ -1448,6 +1480,10 @@ public: int DstDeviceId, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo); + /// Places a fence between previous data movements and following data + /// movements if necessary on the device + int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo); + /// Begin executing a kernel on the given device. int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 083d416..e5a313d 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -662,6 +662,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); } + // Return the number of teams required to cover the loop iterations. + if (isNoLoopMode()) + return LoopTripCount > 0 ? 
(((LoopTripCount - 1) / NumThreads) + 1) : 1; + uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks(); uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max(); if (LoopTripCount > 0) { @@ -1337,16 +1341,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) { Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue) { + if (!AsyncInfo) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "invalid async info queue"); + SmallVector<void *> AllocsToDelete{}; { std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex}; - if (!AsyncInfo || !AsyncInfo->Queue) - return Plugin::error(ErrorCode::INVALID_ARGUMENT, - "invalid async info queue"); - - if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) - return Err; + // This can be false when no work has been added to the AsyncInfo. In which + // case, the device has nothing to synchronize. + if (AsyncInfo->Queue) + if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue)) + return Err; std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations); } @@ -1540,6 +1547,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, return Err; } +Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + auto Err = + dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, @@ -1589,6 +1606,15 @@ Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { return Err; } +Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + + auto Err = enqueueHostCallImpl(Callback, UserData, 
AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) { assert(DeviceInfo && "Invalid device info"); @@ -1648,6 +1674,22 @@ Expected<bool> GenericDeviceTy::hasPendingWork(__tgt_async_info *AsyncInfo) { return Res; } +Expected<bool> GenericDeviceTy::isEventComplete(void *Event, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + auto Res = isEventCompleteImpl(Event, AsyncInfoWrapper); + if (auto Err = Res.takeError()) { + AsyncInfoWrapper.finalize(Err); + return Err; + } + + auto Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + if (Err) + return Err; + return Res; +} + Error GenericDeviceTy::syncEvent(void *EventPtr) { return syncEventImpl(EventPtr); } @@ -2324,3 +2366,15 @@ int32_t GenericPluginTy::async_barrier(omp_interop_val_t *Interop) { } return OFFLOAD_SUCCESS; } + +int32_t GenericPluginTy::data_fence(int32_t DeviceId, + __tgt_async_info *AsyncInfo) { + auto Err = getDevice(DeviceId).dataFence(AsyncInfo); + if (Err) { + REPORT("failure to place data fence on device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 361a781..f5b2d07 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4) DLWRAP(cuMemcpyHtoD, 3) DLWRAP(cuMemcpyHtoDAsync, 4) +DLWRAP(cuMemsetD8Async, 4) +DLWRAP(cuMemsetD16Async, 4) +DLWRAP(cuMemsetD32Async, 4) +DLWRAP(cuMemsetD2D8Async, 6) +DLWRAP(cuMemsetD2D16Async, 6) +DLWRAP(cuMemsetD2D32Async, 6) + DLWRAP(cuMemFree, 1) DLWRAP(cuMemFreeHost, 1) DLWRAP(cuMemFreeAsync, 2) @@ -72,6 +79,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3) DLWRAP(cuDevicePrimaryCtxSetFlags, 2) DLWRAP(cuDevicePrimaryCtxRetain, 2) 
DLWRAP(cuModuleLoadDataEx, 5) +DLWRAP(cuOccupancyMaxPotentialBlockSize, 6) DLWRAP(cuDeviceCanAccessPeer, 3) DLWRAP(cuCtxEnablePeerAccess, 2) @@ -82,6 +90,7 @@ DLWRAP(cuCtxSetLimit, 2) DLWRAP(cuEventCreate, 2) DLWRAP(cuEventRecord, 2) +DLWRAP(cuEventQuery, 1) DLWRAP(cuStreamWaitEvent, 3) DLWRAP(cuEventSynchronize, 1) DLWRAP(cuEventDestroy, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index b6c022c..dec4e33 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01; static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02; typedef void (*CUstreamCallback)(CUstream, CUresult, void *); +typedef size_t (*CUoccupancyB2DSize)(int); CUresult cuCtxGetDevice(CUdevice *); CUresult cuDeviceGet(CUdevice *, int); @@ -321,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); +CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); + CUresult cuMemFree(CUdeviceptr); CUresult cuMemFreeHost(void *); CUresult cuMemFreeAsync(CUdeviceptr, CUstream); @@ -352,6 +363,7 @@ CUresult cuCtxSetLimit(CUlimit, size_t); CUresult cuEventCreate(CUevent *, unsigned int); CUresult cuEventRecord(CUevent, CUstream); +CUresult cuEventQuery(CUevent); CUresult cuStreamWaitEvent(CUstream, CUevent, 
unsigned int); CUresult cuEventSynchronize(CUevent); CUresult cuEventDestroy(CUevent); @@ -372,5 +384,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); #endif diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index f3f3783..af3c746 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + Expected<uint64_t> maxGroupSize(GenericDeviceTy &, + uint64_t DynamicMemSize) const override { + int minGridSize; + int maxBlockSize; + auto Res = cuOccupancyMaxPotentialBlockSize( + &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX); + if (auto Err = Plugin::check( + Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) { + return Err; + } + return maxBlockSize; + } + private: /// The CUDA kernel function to execute. 
CUfunction Func; @@ -844,6 +858,64 @@ struct CUDADeviceTy : public GenericDeviceTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + CUresult Res; + size_t N = Size / PatternSize; + if (PatternSize == 1) { + Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint8_t *>(PatternPtr)), N, + Stream); + } else if (PatternSize == 2) { + Res = cuMemsetD16Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint16_t *>(PatternPtr)), N, + Stream); + } else if (PatternSize == 4) { + Res = cuMemsetD32Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint32_t *>(PatternPtr)), N, + Stream); + } else { + // For larger patterns we can do a series of strided fills to copy the + // pattern efficiently + int64_t MemsetSize = PatternSize % 4u == 0u ? 4u + : PatternSize % 2u == 0u ? 2u + : 1u; + + int64_t NumberOfSteps = PatternSize / MemsetSize; + int64_t Pitch = NumberOfSteps * MemsetSize; + int64_t Height = Size / PatternSize; + + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + if (MemsetSize == 4) { + Res = cuMemsetD2D32Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } else if (MemsetSize == 2) { + Res = cuMemsetD2D16Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } else { + Res = cuMemsetD2D8Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height, + Stream); + } + } + } + + return Plugin::check(Res, "error in cuMemset: %s"); + } + /// Initialize the async info for interoperability purposes. 
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { if (auto Err = setContext()) @@ -856,6 +928,13 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for CUDA devices as operations inserted into + /// a queue are in-order. + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// Initialize the device info for interoperability purposes. Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { assert(Context && "Context is null"); @@ -873,6 +952,19 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + CUresult Res = cuLaunchHostFunc(Stream, Callback, UserData); + return Plugin::check(Res, "error in cuStreamLaunchHostFunc: %s"); + }; + /// Create an event. 
Error createEventImpl(void **EventPtrStorage) override { CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage); @@ -914,9 +1006,33 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::check(Res, "error in cuStreamWaitEvent: %s"); } - // TODO: This should be implementable on CUDA Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { - return true; + CUstream Stream; + if (auto Err = getStream(AsyncInfo, Stream)) + return Err; + + CUresult Ret = cuStreamQuery(Stream); + if (Ret == CUDA_SUCCESS) + return false; + + if (Ret == CUDA_ERROR_NOT_READY) + return true; + + return Plugin::check(Ret, "error in cuStreamQuery: %s"); + } + + Expected<bool> isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &) override { + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + CUresult Ret = cuEventQuery(Event); + if (Ret == CUDA_SUCCESS) + return true; + + if (Ret == CUDA_ERROR_NOT_READY) + return false; + + return Plugin::check(Ret, "error in cuEventQuery: %s"); } /// Synchronize the current thread with the event. 
@@ -944,18 +1060,27 @@ struct CUDADeviceTy : public GenericDeviceTy { Info.add("CUDA OpenMP Device Number", DeviceId); Res = cuDeviceGetName(TmpChar, 1000, Device); - if (Res == CUDA_SUCCESS) + if (Res == CUDA_SUCCESS) { Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); + Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME); + } Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR); + Info.add("Vendor ID", uint64_t{4318}, "", DeviceInfo::VENDOR_ID); + + Info.add("Memory Address Size", std::numeric_limits<CUdeviceptr>::digits, + "bits", DeviceInfo::ADDRESS_BITS); + Res = cuDeviceTotalMem(&TmpSt, Device); if (Res == CUDA_SUCCESS) - Info.add("Global Memory Size", TmpSt, "bytes"); + Info.add("Global Memory Size", TmpSt, "bytes", + DeviceInfo::GLOBAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Number of Multiprocessors", TmpInt); + Info.add("Number of Multiprocessors", TmpInt, "", + DeviceInfo::NUM_COMPUTE_UNITS); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt); if (Res == CUDA_SUCCESS) @@ -995,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) MaxBlock.add("z", TmpInt); - auto &MaxGrid = *Info.add("Maximum Grid Dimensions", ""); + // TODO: I assume CUDA devices have no limit on the amount of threads, + // verify this + Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "", + DeviceInfo::MAX_WORK_SIZE); + + auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "", + DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); if (Res == CUDA_SUCCESS) MaxGrid.add("x", TmpInt); @@ -1016,7 +1147,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Clock Rate", TmpInt, "kHz"); + Info.add("Clock Rate", TmpInt / 1000, "MHz", + 
DeviceInfo::MAX_CLOCK_FREQUENCY); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt); if (Res == CUDA_SUCCESS) @@ -1053,7 +1185,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt); if (Res == CUDA_SUCCESS) - Info.add("Memory Clock Rate", TmpInt, "kHz"); + Info.add("Memory Clock Rate", TmpInt / 1000, "MHz", + DeviceInfo::MEMORY_CLOCK_RATE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt); if (Res == CUDA_SUCCESS) @@ -1317,9 +1450,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { CUresult AttrResult = cuFuncSetAttribute( Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); - return Plugin::check( - AttrResult, - "Error in cuLaunchKernel while setting the memory limits: %s"); + if (auto Err = Plugin::check( + AttrResult, + "error in cuFuncSetAttribute while setting the memory limits: %s")) + return Err; MaxDynCGroupMemLimit = MaxDynCGroupMem; } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ed52135..f440eba 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy { return Plugin::success(); } + /// Return maximum block size for maximum occupancy + Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device, + uint64_t DynamicMemSize) const override { + return Plugin::error( + ErrorCode::UNSUPPORTED, + "occupancy calculations are not implemented for the host device"); + } + private: /// The kernel function to execute. void (*Func)(void); @@ -295,6 +303,28 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + /// Insert a data fence between previous data operations and the following + /// operations. 
This is a no-op for Host devices as operations inserted into + /// a queue are in-order. + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (PatternSize == 1) { + std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size); + } else { + for (unsigned int Step = 0; Step < Size; Step += PatternSize) { + auto *Dst = static_cast<char *>(TgtPtr) + Step; + std::memcpy(Dst, PatternPtr, PatternSize); + } + } + + return Plugin::success(); + } + /// All functions are already synchronous. No need to do anything on this /// synchronization function. Error synchronizeImpl(__tgt_async_info &AsyncInfo, @@ -320,6 +350,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "initDeviceInfoImpl not supported"); } + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override { + Callback(UserData); + return Plugin::success(); + }; + /// This plugin does not support the event API. Do nothing without failing. Error createEventImpl(void **EventPtrStorage) override { *EventPtrStorage = nullptr; @@ -337,6 +373,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy { Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override { return true; } + Expected<bool> isEventCompleteImpl(void *Event, + AsyncInfoWrapperTy &AsyncInfo) override { + return true; + } Error syncEventImpl(void *EventPtr) override { return Plugin::success(); } /// Print information about the device. diff --git a/offload/test/mapping/data_member_ref.cpp b/offload/test/mapping/data_member_ref.cpp index fdb8abc..7947a62 100644 --- a/offload/test/mapping/data_member_ref.cpp +++ b/offload/test/mapping/data_member_ref.cpp @@ -60,7 +60,8 @@ int main() { printf("Host %d %d.\n", Bar.VRef.Data, V.Data); // CHECK: Host 123456. 
printf("Host %d.\n", *Baz.VRef.Data); -#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2) +#pragma omp target map(Baz.VRef.Data) map(*Baz.VRef.Data) map(V1.Data[0 : 0]) \ + map(from : D1, D2) { // CHECK: Device 123456. D1 = *Baz.VRef.Data; diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp index c6c5657..45fd042 100644 --- a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp +++ b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp @@ -44,8 +44,8 @@ int main() { int spp00fa = -1, spp00fca = -1, spp00fb_r = -1; __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]); -#pragma omp target map(tofrom: spp[0][0]) firstprivate(p) \ - map(from: spp00fa, spp00fca, spp00fb_r) +#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) firstprivate(p) \ + map(from : spp00fa, spp00fca, spp00fb_r) { spp00fa = spp[0][0].f.a; spp00fca = spp[0][0].f.c.a; diff --git a/offload/test/mapping/declare_mapper_nested_mappers.cpp b/offload/test/mapping/declare_mapper_nested_mappers.cpp index a9e3f05..a59ed69 100644 --- a/offload/test/mapping/declare_mapper_nested_mappers.cpp +++ b/offload/test/mapping/declare_mapper_nested_mappers.cpp @@ -42,8 +42,8 @@ int main() { int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1; __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]), p1 = reinterpret_cast<__intptr_t>(&y[0]); -#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1) \ - map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r) +#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) \ + firstprivate(p, p1) map(from : spp00fa, spp00fb_r, spp00fg1, spp00fg_r) { spp00fa = spp[0][0].f.a; spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 
1 : 0; diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c index c3b0dd2..869fb8c 100644 --- a/offload/test/mapping/map_ptr_and_star_global.c +++ b/offload/test/mapping/map_ptr_and_star_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c index f0ca84d..cc826b3 100644 --- a/offload/test/mapping/map_ptr_and_star_local.c +++ b/offload/test/mapping/map_ptr_and_star_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c index a3a10b6..839db06 100644 --- a/offload/test/mapping/map_ptr_and_subscript_global.c +++ b/offload/test/mapping/map_ptr_and_subscript_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c index bb44999..68ac9dc 100644 --- a/offload/test/mapping/map_ptr_and_subscript_local.c +++ b/offload/test/mapping/map_ptr_and_subscript_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c index 10e72e0..960eea4 100644 --- a/offload/test/mapping/map_structptr_and_member_global.c +++ b/offload/test/mapping/map_structptr_and_member_global.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/map_structptr_and_member_local.c 
b/offload/test/mapping/map_structptr_and_member_local.c index 9e59551..bd75940 100644 --- a/offload/test/mapping/map_structptr_and_member_local.c +++ b/offload/test/mapping/map_structptr_and_member_local.c @@ -1,5 +1,7 @@ // RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: libc + #include <omp.h> #include <stdio.h> diff --git a/offload/test/mapping/ptr_and_obj_motion.c b/offload/test/mapping/ptr_and_obj_motion.c index 8fa2c98..a94c07aa 100644 --- a/offload/test/mapping/ptr_and_obj_motion.c +++ b/offload/test/mapping/ptr_and_obj_motion.c @@ -17,7 +17,7 @@ void init(double vertexx[]) { } void change(DV *dvptr) { -#pragma omp target map(dvptr->dataptr[0 : 100]) +#pragma omp target map(dvptr->dataptr[0 : 100]) map(alloc : dvptr -> dataptr) { printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]); dvptr->dataptr[77] += 1.0; diff --git a/offload/test/mapping/target_derefence_array_pointrs.cpp b/offload/test/mapping/target_derefence_array_pointrs.cpp index a6dd4069..d213c87 100644 --- a/offload/test/mapping/target_derefence_array_pointrs.cpp +++ b/offload/test/mapping/target_derefence_array_pointrs.cpp @@ -18,23 +18,24 @@ void foo(int **t1d) { for (j = 0; j < 3; j++) (*t1d)[j] = 0; -#pragma omp target map(tofrom : (*t1d)[0 : 3]) +#pragma omp target map(tofrom : (*t1d)[0 : 3]) map(alloc : *t1d) { (*t1d)[1] = 1; } // CHECK: 1 printf("%d\n", (*t1d)[1]); -#pragma omp target map(tofrom : (**t2d)[0 : 3]) +#pragma omp target map(tofrom : (**t2d)[0 : 3]) map(alloc : **t2d, *t2d) { (**t2d)[1] = 2; } // CHECK: 2 printf("%d\n", (**t2d)[1]); -#pragma omp target map(tofrom : (***t3d)[0 : 3]) +#pragma omp target map(tofrom : (***t3d)[0 : 3]) \ + map(alloc : ***t3d, **t3d, *t3d) { (***t3d)[1] = 3; } // CHECK: 3 printf("%d\n", (***t3d)[1]); -#pragma omp target map(tofrom : (**t1d)) +#pragma omp target map(tofrom : (**t1d)) map(alloc : *t1d) { (*t1d)[0] = 4; } // CHECK: 4 printf("%d\n", (*t1d)[0]); -#pragma omp target map(tofrom : (*(*(t1d + a) + b))) +#pragma omp 
target map(tofrom : (*(*(t1d + a) + b))) map(to : *(t1d + a)) { *(*(t1d + a) + b) = 5; } // CHECK: 5 printf("%d\n", *(*(t1d + a) + b)); @@ -49,7 +50,7 @@ void bar() { for (int i = 0; i < 3; i++) { (**a)[1] = i; } -#pragma omp target map((**a)[ : 3]) +#pragma omp target map((**a)[ : 3]) map(alloc : **a, *a) { (**a)[1] = 6; // CHECK: 6 @@ -73,7 +74,8 @@ void zoo(int **f, SSA *sa) { *(f + sa->i + 1) = t; *(sa->sa->i + *(f + sa->i + 1)) = 4; printf("%d\n", *(sa->sa->i + *(1 + sa->i + f))); -#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f))) +#pragma omp target map(*(sa->sa->i + *(1 + sa->i + f))) map(alloc : sa->sa) \ + map(to : sa->i) map(to : sa->sa->i) map(to : *(1 + sa->i + f)) { *(sa->sa->i + *(1 + sa->i + f)) = 7; } // CHECK: 7 printf("%d\n", *(sa->sa->i + *(1 + sa->i + f))); @@ -87,13 +89,13 @@ void xoo() { void yoo(int **x) { *x = (int *)malloc(2 * sizeof(int)); -#pragma omp target map(**x) +#pragma omp target map(**x) map(alloc : *x) { **x = 8; // CHECK: 8 printf("%d\n", **x); } -#pragma omp target map(*(*x + 1)) +#pragma omp target map(*(*x + 1)) map(alloc : *x) { *(*x + 1) = 9; // CHECK: 9 diff --git a/offload/test/mapping/target_has_device_addr.c b/offload/test/mapping/target_has_device_addr.c index e8bfff8..f238832 100644 --- a/offload/test/mapping/target_has_device_addr.c +++ b/offload/test/mapping/target_has_device_addr.c @@ -66,8 +66,9 @@ void zoo() { short **xpp = &xp[0]; x[1] = 111; -#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1]) -#pragma omp target has_device_addr(xpp[1][1]) +#pragma omp target data map(tofrom : xpp[1][1]) map(xpp[1]) \ + use_device_addr(xpp[1]) +#pragma omp target has_device_addr(xpp[1]) { xpp[1][1] = 222; // CHECK: 222 diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp new file mode 100644 index 0000000..3b1a819 --- /dev/null +++ 
b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp @@ -0,0 +1,85 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. +// CHECK: A: 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. 
+// CHECK: B: 1 +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. +// CHECK: D: 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp new file mode 100644 index 0000000..b9ebde4 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp @@ -0,0 +1,143 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. 
+// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. 
+// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp new file mode 100644 index 0000000..e9a1124 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp @@ -0,0 +1,98 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. 
+// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5]) + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa02 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa02 != mapped_ptr_paa02); + +// (A) use_device_addr operand within mapped address range. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_addr operand in extended address range, but not +// mapped address range. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[2]) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) use_device_addr/map: same base-array, different first-location. +// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1]) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) use_device_addr/map: different base-array/pointers. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) use_device_addr operand within mapped range of previous map. +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa[0]) + printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (F) use_device_addr/map: different operands, same base-array. +// CHECK: F: 1 +#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2]) + printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +// (G) use_device_addr/map: different base-array/pointers. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2]) + printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp new file mode 100644 index 0000000..0090cdb --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp @@ -0,0 +1,158 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on an array-section on a reference. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + int *original_ph3 = &ph[3]; + int **original_paa02 = &paa[0][2]; + +// (A) No corresponding map, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (B) use_device_addr/map: different operands, same base-pointer. +// use_device_addr operand within mapped address range. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1]) + { + int *mapped_ptr_ph4 = + (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr, + mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4); + } + +// (C) use_device_addr/map: different base-pointers. +// No corresponding storage, lookup should fail. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3); + } + +// (D) use_device_addr/map: one of two maps with matching base-pointer. +// use_device_addr operand within mapped address range of second map, +// lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4]) + { + int *mapped_ptr_ph3 = + (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding map, lookup should fail +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == (int **)nullptr + 2); + } + +// (F) use_device_addr/map: different operands, same base-array. +// use_device_addr within mapped address range. Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa[0]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + +// (G) use_device_addr/map: different operands, same base-array. +// use_device_addr extends beyond existing mapping. Not spec compliant. +// But the lookup succeeds because we use the base-address for translation. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[0][4]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr( + original_paa02 + 2, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr, + mapped_ptr_paa04 != original_paa02 + 2, + &paa[0][4] == mapped_ptr_paa04); + } + + int *original_paa020 = &paa[0][2][0]; + int **original_paa0 = (int **)&paa[0]; + +// (H) use_device_addr/map: different base-pointers. +// No corresponding storage for use_device_addr opnd, lookup should fail. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0]) + { + int **mapped_ptr_paa020 = + (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device()); + int **mapped_ptr_paa0 = + (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr, + mapped_ptr_paa0 == nullptr, &paa[0] == nullptr); + } + +// (I) use_device_addr/map: one map with different, one with same base-ptr. +// Lookup should succeed. +// CHECK: I: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2]) + { + int **mapped_ptr_paa02 = + (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr, + mapped_ptr_paa02 != original_paa02, + &paa[0][2] == mapped_ptr_paa02); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp new file mode 100644 index 0000000..883297f --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp @@ -0,0 +1,93 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. 
+// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. 
+// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp new file mode 100644 index 0000000..79c6f69 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp @@ -0,0 +1,159 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a variable (not a section). +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int g, h[10]; +int *ph = &h[0]; + +struct S { + int *paa[10][10]; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. 
+// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. +// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. 
+// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. +// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. 
+// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp new file mode 100644 index 0000000..f018c65 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +#pragma omp target enter data map(to : g, h, ph, paa) + void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device()); + void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device()); + void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device()); + void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device()); + + // CHECK-COUNT-8: 1 + printf("%d\n", mapped_ptr_g != nullptr); + printf("%d\n", mapped_ptr_h != nullptr); + printf("%d\n", mapped_ptr_ph != nullptr); + printf("%d\n", mapped_ptr_paa != nullptr); + printf("%d\n", original_addr_g != mapped_ptr_g); + printf("%d\n", original_addr_h != mapped_ptr_h); + printf("%d\n", original_addr_ph != mapped_ptr_ph); + printf("%d\n", original_addr_paa != mapped_ptr_paa); + +// (A) +// CHECK: A: 1 +#pragma omp target data use_device_addr(g) + printf("A: %d\n", mapped_ptr_g == &g); + +// (B) +// CHECK: B: 1 +#pragma omp target data use_device_addr(h) + printf("B: %d\n", mapped_ptr_h == &h); + +// (C) +// CHECK: C: 1 +#pragma omp target data use_device_addr(ph) + printf("C: %d\n", mapped_ptr_ph == &ph); + +// (D) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &ph, not &ph[0/1]. +// CHECK: D: 1 +#pragma omp target data map(ph[1 : 2]) use_device_addr(ph) + printf("D: %d\n", mapped_ptr_ph == &ph); + +// (E) +// CHECK: E: 1 +#pragma omp target data use_device_addr(paa) + printf("E: %d\n", mapped_ptr_paa == &paa); + +// (F) use_device_addr/map with same base-array, paa. +// Address translation should happen for &paa. 
+// CHECK: F: 1 +#pragma omp target data map(paa[0][2]) use_device_addr(paa) + printf("F: %d\n", mapped_ptr_paa == &paa); + +// (G) use_device_addr/map with different base-array/pointer. +// Address translation should happen for &paa. +// CHECK: G: 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + printf("G: %d\n", mapped_ptr_paa == &paa); + +#pragma omp target exit data map(release : g, h, ph, paa) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp new file mode 100644 index 0000000..9360db4 --- /dev/null +++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp @@ -0,0 +1,166 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_addr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int g_ptee; +int &g = g_ptee; + +int h_ptee[10]; +int (&h)[10] = h_ptee; + +int *ph_ptee = &h_ptee[0]; +int *&ph = ph_ptee; +int *paa_ptee[10][10]; + +struct S { + int *(&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa[0][2] = &g; + + void *original_addr_g = &g; + void *original_addr_h = &h; + void *original_addr_ph = &ph; + void *original_addr_paa = &paa; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_g == nullptr, + mapped_ptr_g != original_addr_g, (void *)&g == nullptr); + } + +// (B) Lookup should succeed. +// CHECK: B: 1 1 1 +#pragma omp target data map(g) use_device_addr(g) + { + void *mapped_ptr_g = + omp_get_mapped_ptr(original_addr_g, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_g != nullptr, + mapped_ptr_g != original_addr_g, &g == mapped_ptr_g); + } + +// (C) No corresponding item, lookup should fail. +// CHECK: C: 1 1 1 +#pragma omp target data use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_h == nullptr, + mapped_ptr_h != original_addr_h, (void *)&h == nullptr); + } + +// (D) Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(h) use_device_addr(h) + { + void *mapped_ptr_h = + omp_get_mapped_ptr(original_addr_h, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_h != nullptr, + mapped_ptr_h != original_addr_h, &h == mapped_ptr_h); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (F) Lookup should succeed. 
+// CHECK: F: 1 1 1 +#pragma omp target data map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (G) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: G: 1 1 1 +#pragma omp target data map(ph[0 : 1]) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_ph == nullptr, + mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr); + } + +// (H) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph) + { + void *mapped_ptr_ph = + omp_get_mapped_ptr(original_addr_ph, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_ph != nullptr, + mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph); + } + +// (I) No corresponding item, lookup should fail. +// CHECK: I: 1 1 1 +#pragma omp target data use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("I: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (J) Maps pointee only, but use_device_addr operand is pointer. +// Lookup should fail. +// CHECK: J: 1 1 1 +#pragma omp target data map(paa[0][2][0]) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("J: %d %d %d\n", mapped_ptr_paa == nullptr, + mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr); + } + +// (K) Lookup should succeed. 
+// CHECK: K: 1 1 1 +#pragma omp target data map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("K: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + +// (L) Maps both pointee and pointer. Lookup for pointer should succeed. +// CHECK: L: 1 1 1 +#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa) + { + void *mapped_ptr_paa = + omp_get_mapped_ptr(original_addr_paa, omp_get_default_device()); + printf("L: %d %d %d\n", mapped_ptr_paa != nullptr, + mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c index 5c2bb8a..4a9dbe2 100644 --- a/offload/test/mapping/target_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c @@ -12,7 +12,9 @@ int main() { printf("%d, %p\n", xp[1], &xp[1]); #pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x) #pragma omp target is_device_ptr(xp) - { xp[1] = 222; } + { + xp[1] = 222; + } // CHECK: 222 printf("%d, %p\n", xp[1], &xp[1]); } diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c index 7a5babd..28ec685 100644 --- a/offload/test/mapping/target_wrong_use_device_addr.c +++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c @@ -14,7 +14,7 @@ int main() { // CHECK: host addr=0x[[#%x,HOST_ADDR:]] fprintf(stderr, "host addr=%p\n", x); -#pragma omp target data map(to : x [0:10]) +#pragma omp target data map(to : x[0 : 10]) { // CHECK: omptarget device 0 info: variable x does not have a valid device // counterpart @@ -27,4 +27,3 @@ int main() { return 0; } - diff --git a/offload/test/mapping/array_section_use_device_ptr.c 
b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c index 86e2875..4cfcce2 100644 --- a/offload/test/mapping/array_section_use_device_ptr.c +++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c @@ -20,7 +20,9 @@ int main() { float *A_dev = NULL; #pragma omp target data use_device_ptr(A) - { A_dev = A; } + { + A_dev = A; + } #pragma omp target exit data map(delete : A[FROM : LENGTH]) // CHECK: Success diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp new file mode 100644 index 0000000..a7745de --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp @@ -0,0 +1,100 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// CHECK: A: 1 +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// CHECK: B: 1 +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range. Lookup should succeed. +// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not. 
Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp new file mode 100644 index 0000000..fe3cdb5 --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp @@ -0,0 +1,125 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int h[10]; +int *ph = &h[0]; + +struct S { + int (*paa)[10][10] = &aa; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// CHECK: A: 1 1 1 +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: B: 1 1 1 +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: C: 1 1 1 +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: D: 1 1 1 +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. 
+// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. 
+// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp new file mode 100644 index 0000000..66e65de --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp @@ -0,0 +1,111 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is mapped on a previous enter_data directive. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. +// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. 
+ +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_ph3 = &ph[3]; + void *original_paa102 = &paa[1][0][2]; + +#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5]) + void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device()); + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device()); + + // CHECK-COUNT-4: 1 + printf("%d\n", mapped_ptr_ph3 != nullptr); + printf("%d\n", mapped_ptr_paa102 != nullptr); + printf("%d\n", original_ph3 != mapped_ptr_ph3); + printf("%d\n", original_paa102 != mapped_ptr_paa102); + +// (A) Mapped data is within extended address range. Lookup should succeed. +// EXPECTED: A: 1 +// CHECK: A: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + printf("A: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (B) use_device_ptr/map on pointer, and pointee already exists. +// Lookup should succeed. +// EXPECTED: B: 1 +// CHECK: B: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + printf("B: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 +// CHECK: C: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + printf("C: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 +// CHECK: D: 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + printf("D: %d\n", mapped_ptr_ph3 == &ph[3]); + +// (E) Mapped data is within extended address range. Lookup should succeed. 
+// Lookup should succeed. +// CHECK: E: 1 +#pragma omp target data use_device_ptr(paa) + printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (F) use_device_ptr/map on pointer, and pointee already exists. +// &paa[0] should be in extended address-range of the existing paa[1][...] +// Lookup should succeed. +// FIXME: However, it currently does not. Might need an RT fix. +// EXPECTED: F: 1 +// CHECK: F: 0 +#pragma omp target data map(paa) use_device_ptr(paa) + printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +// (H) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]); + +#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5]) + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp new file mode 100644 index 0000000..419ab3e --- /dev/null +++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp @@ -0,0 +1,136 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic + +// XFAIL: * + +#include <omp.h> +#include <stdio.h> + +// Test for various cases of use_device_ptr on a reference variable. +// The corresponding data is not previously mapped. + +// Note that this tests for the current behavior wherein if a lookup fails, +// the runtime returns nullptr, instead of the original host-address. 
+// That was compatible with OpenMP 5.0, where it was a user error if +// corresponding storage didn't exist, but with 5.1+, the runtime needs to +// return the host address, as it needs to assume that the host-address is +// device-accessible, as the user has guaranteed it. +// Once the runtime returns the original host-address when the lookup fails, the +// test will need to be updated. + +int aa[10][10]; +int (*paa_ptee)[10][10] = &aa; + +int h[10]; +int *ph_ptee = &h[0]; +int *&ph = ph_ptee; + +struct S { + int (*&paa)[10][10] = paa_ptee; + + void f1(int i) { + paa--; + void *original_addr_ph3 = &ph[3]; + void *original_addr_paa102 = &paa[1][0][2]; + +// (A) No corresponding item, lookup should fail. +// EXPECTED: A: 1 1 1 +// CHECK: A: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (B) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// EXPECTED: B: 1 1 1 +// CHECK: B: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr, + mapped_ptr_ph3 != original_addr_ph3, ph == nullptr); + } + +// (C) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: C: 1 1 1 +// CHECK: C: 1 1 0 +// FIXME: ph is not being privatized in the region. 
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (D) map on pointer and pointee. Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// EXPECTED: D: 1 1 1 +// CHECK: D: 1 1 0 +// FIXME: ph is not being privatized in the region. +#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph) + { + void *mapped_ptr_ph3 = + omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device()); + printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr, + mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3); + } + +// (E) No corresponding item, lookup should fail. +// CHECK: E: 1 1 1 +#pragma omp target data use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (F) use_device_ptr/map on pointer, and pointee does not exist. +// Lookup should fail. +// CHECK: F: 1 1 1 +#pragma omp target data map(paa) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr, + mapped_ptr_paa102 != original_addr_paa102, paa == nullptr); + } + +// (G) map on pointee: base-pointer of map matches use_device_ptr operand. +// Lookup should succeed. +// CHECK: G: 1 1 1 +#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + +// (H) map on pointer and pointee. 
Base-pointer of map on pointee matches +// use_device_ptr operand. +// Lookup should succeed. +// CHECK: H: 1 1 1 +#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa) + { + void *mapped_ptr_paa102 = + omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device()); + printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr, + mapped_ptr_paa102 != original_addr_paa102, + &paa[1][0][2] == mapped_ptr_paa102); + } + } +}; + +S s1; +int main() { s1.f1(1); } diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90 new file mode 100644 index 0000000..b9c2d34 --- /dev/null +++ b/offload/test/offloading/fortran/declare-target-automap.f90 @@ -0,0 +1,37 @@ +!Offloading test for AUTOMAP modifier in declare target enter +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program automap_program + use iso_c_binding, only: c_loc + use omp_lib, only: omp_get_default_device, omp_target_is_present + integer, parameter :: N = 10 + integer :: i + integer, allocatable, target :: automap_array(:) + !$omp declare target enter(automap:automap_array) + + ! false since the storage is not present even though the descriptor is present + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 0 + + allocate (automap_array(N)) + ! true since the storage should be allocated and reference count incremented by the allocate + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 1 + + ! since storage is present this should not be a runtime error + !$omp target teams loop + do i = 1, N + automap_array(i) = i + end do + + !$omp target update from(automap_array) + write (*, *) automap_array + ! CHECK: 1 2 3 4 5 6 7 8 9 10 + + deallocate (automap_array) + + ! 
automap_array should have its storage unmapped on device here + write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device()) + ! CHECK: 0 +end program diff --git a/offload/test/offloading/strided_multiple_update.c b/offload/test/offloading/strided_multiple_update.c new file mode 100644 index 0000000..a3e8d10 --- /dev/null +++ b/offload/test/offloading/strided_multiple_update.c @@ -0,0 +1,62 @@ +// This test checks that #pragma omp target update from(data1[0:3:4], +// data2[0:2:5]) correctly updates disjoint strided sections of multiple arrays +// from the device to the host. + +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include <stdio.h> + +int main() { + int len = 12; + double data1[len], data2[len]; + +// Initial values +#pragma omp target map(tofrom : data1[0 : len], data2[0 : len]) + { + for (int i = 0; i < len; i++) { + data1[i] = i; + data2[i] = i * 10; + } + } + + printf("original host array values:\n"); + printf("data1: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data1[i]); + printf("\ndata2: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data2[i]); + printf("\n\n"); + +#pragma omp target data map(to : data1[0 : len], data2[0 : len]) + { +// Modify arrays on device +#pragma omp target + { + for (int i = 0; i < len; i++) + data1[i] += i; + for (int i = 0; i < len; i++) + data2[i] += 100; + } + +// data1[0:3:4] // indices 0,4,8 +// data2[0:2:5] // indices 0,5 +#pragma omp target update from(data1[0 : 3 : 4], data2[0 : 2 : 5]) + } + + printf("device array values after update from:\n"); + printf("data1: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data1[i]); + printf("\ndata2: "); + for (int i = 0; i < len; i++) + printf("%.1f ", data2[i]); + printf("\n\n"); + + // CHECK: data1: 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 + // CHECK: data2: 0.0 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 100.0 110.0 + + // CHECK: data1: 0.0 1.0 2.0 3.0 8.0 5.0 6.0 7.0 16.0 9.0 10.0 11.0 + // 
CHECK: data2: 100.0 10.0 20.0 30.0 40.0 150.0 60.0 70.0 80.0 90.0 100.0 + // 110.0 +} diff --git a/offload/test/offloading/strided_partial_update.c b/offload/test/offloading/strided_partial_update.c new file mode 100644 index 0000000..15d477f --- /dev/null +++ b/offload/test/offloading/strided_partial_update.c @@ -0,0 +1,63 @@ +// This test checks that #pragma omp target update from(data[0:4:3]) correctly +// updates every third element (stride 3) from the device to the host, partially +// across the array + +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include <stdio.h> + +int main() { + int len = 11; + double data[len]; + +#pragma omp target map(tofrom : data[0 : len]) + { + for (int i = 0; i < len; i++) + data[i] = i; + } + + // Initial values + printf("original host array values:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + +#pragma omp target data map(to : data[0 : len]) + { +// Modify arrays on device +#pragma omp target + for (int i = 0; i < len; i++) + data[i] += i; + +#pragma omp target update from(data[0 : 4 : 3]) // indices 0,3,6,9 + } + + printf("device array values after update from:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 2.000000 + // CHECK: 3.000000 + // CHECK: 4.000000 + // CHECK: 5.000000 + // CHECK: 6.000000 + // CHECK: 7.000000 + // CHECK: 8.000000 + // CHECK: 9.000000 + // CHECK: 10.000000 + + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 2.000000 + // CHECK: 6.000000 + // CHECK: 4.000000 + // CHECK: 5.000000 + // CHECK: 12.000000 + // CHECK: 7.000000 + // CHECK: 8.000000 + // CHECK: 18.000000 + // CHECK: 10.000000 +} diff --git a/offload/test/offloading/strided_update.c b/offload/test/offloading/strided_update.c new file mode 100644 index 0000000..fe875b7 --- /dev/null +++ b/offload/test/offloading/strided_update.c @@ -0,0 +1,54 @@ +// This test checks that "update from" clause in 
OpenMP is supported when the +// elements are updated in a non-contiguous manner. This test checks that +// #pragma omp target update from(data[0:4:2]) correctly updates only every +// other element (stride 2) from the device to the host + +// RUN: %libomptarget-compile-run-and-check-generic +#include <omp.h> +#include <stdio.h> + +int main() { + int len = 8; + double data[len]; +#pragma omp target map(tofrom : len, data[0 : len]) + { + for (int i = 0; i < len; i++) { + data[i] = i; + } + } + // Initial values + printf("original host array values:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + +#pragma omp target data map(to : len, data[0 : len]) + { +// Modify arrays on device +#pragma omp target + for (int i = 0; i < len; i++) { + data[i] += i; + } + +#pragma omp target update from(data[0 : 4 : 2]) + } + // CHECK: 0.000000 + // CHECK: 1.000000 + // CHECK: 4.000000 + // CHECK: 3.000000 + // CHECK: 8.000000 + // CHECK: 5.000000 + // CHECK: 12.000000 + // CHECK: 7.000000 + // CHECK-NOT: 2.000000 + // CHECK-NOT: 6.000000 + // CHECK-NOT: 10.000000 + // CHECK-NOT: 14.000000 + + printf("from target array results:\n"); + for (int i = 0; i < len; i++) + printf("%f\n", data[i]); + printf("\n"); + + return 0; +} diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td index e919492..41949db 100644 --- a/offload/test/tools/offload-tblgen/default_returns.td +++ b/offload/test/tools/offload-tblgen/default_returns.td @@ -6,13 +6,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td index c66d5b4..94ea820 
100644 --- a/offload/test/tools/offload-tblgen/entry_points.td +++ b/offload/test/tools/offload-tblgen/entry_points.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td index dec9357..2802c78 100644 --- a/offload/test/tools/offload-tblgen/functions_basic.td +++ b/offload/test/tools/offload-tblgen/functions_basic.td @@ -6,8 +6,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td index aec2012..8d7aa00 100644 --- a/offload/test/tools/offload-tblgen/functions_code_loc.td +++ b/offload/test/tools/offload-tblgen/functions_code_loc.td @@ -7,8 +7,7 @@ include "APIDefs.td" -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td b/offload/test/tools/offload-tblgen/functions_ranged_param.td index d0996b2..1ce8b39 100644 --- a/offload/test/tools/offload-tblgen/functions_ranged_param.td +++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td @@ -8,13 +8,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def some_handle_t : Handle { let desc = "An example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/print_enum.td 
b/offload/test/tools/offload-tblgen/print_enum.td index 97f8696..c7573a9 100644 --- a/offload/test/tools/offload-tblgen/print_enum.td +++ b/offload/test/tools/offload-tblgen/print_enum.td @@ -4,8 +4,7 @@ include "APIDefs.td" -def : Enum { - let name = "my_enum_t"; +def my_enum_t : Enum { let desc = "An example enum"; let etors =[ Etor<"VALUE_ONE", "The first enum value">, diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td index ce1fe4c..74b39f1 100644 --- a/offload/test/tools/offload-tblgen/print_function.td +++ b/offload/test/tools/offload-tblgen/print_function.td @@ -5,13 +5,11 @@ include "APIDefs.td" -def : Handle { - let name = "ol_foo_handle_t"; +def ol_foo_handle_t : Handle { let desc = "Example handle type"; } -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td index 95964e3..b32531a 100644 --- a/offload/test/tools/offload-tblgen/type_tagged_enum.td +++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td @@ -9,13 +9,11 @@ include "APIDefs.td" -def : Handle { - let name = "some_handle_t"; +def some_handle_t: Handle { let desc = "An example handle type"; } -def : Enum { - let name = "my_type_tagged_enum_t"; +def my_type_tagged_enum_t : Enum { let desc = "Example type tagged enum"; let is_typed = 1; let etors = [ @@ -34,8 +32,7 @@ def : Enum { // CHECK-API-NEXT: [some_handle_t] Value three. 
// CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2, -def : Function { - let name = "FunctionA"; +def FunctionA : Function { let desc = "Function A description"; let details = [ "Function A detailed information" ]; let params = [ diff --git a/offload/tools/offload-tblgen/APIGen.cpp b/offload/tools/offload-tblgen/APIGen.cpp index 8c61d1f..1e79c00 100644 --- a/offload/tools/offload-tblgen/APIGen.cpp +++ b/offload/tools/offload-tblgen/APIGen.cpp @@ -131,7 +131,8 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) { OS << formatv("/// @brief {0}\n", Enum.getDesc()); OS << formatv("typedef enum {0} {{\n", Enum.getName()); - uint32_t EtorVal = 0; + // Bitfields start from 1, other enums from 0 + uint32_t EtorVal = Enum.isBitField(); for (const auto &EnumVal : Enum.getValues()) { if (Enum.isTyped()) { OS << MakeComment( @@ -141,7 +142,12 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) { OS << MakeComment(EnumVal.getDesc()); } OS << formatv(TAB_1 "{0}_{1} = {2},\n", Enum.getEnumValNamePrefix(), - EnumVal.getName(), EtorVal++); + EnumVal.getName(), EtorVal); + if (Enum.isBitField()) { + EtorVal <<= 1u; + } else { + ++EtorVal; + } } // Add last_element/force uint32 val @@ -220,31 +226,23 @@ OL_APIEXPORT ol_result_t OL_APICALL {0}WithCodeLoc( void EmitOffloadAPI(const RecordKeeper &Records, raw_ostream &OS) { OS << GenericHeader; OS << FileHeader; - // Generate main API definitions - for (auto *R : Records.getAllDerivedDefinitions("APIObject")) { - if (R->isSubClassOf("Macro")) { - ProcessMacro(MacroRec{R}, OS); - } else if (R->isSubClassOf("Typedef")) { - ProcessTypedef(TypedefRec{R}, OS); - } else if (R->isSubClassOf("Handle")) { - ProcessHandle(HandleRec{R}, OS); - } else if (R->isSubClassOf("Function")) { - ProcessFunction(FunctionRec{R}, OS); - } else if (R->isSubClassOf("Enum")) { - ProcessEnum(EnumRec{R}, OS); - } else if (R->isSubClassOf("Struct")) { - ProcessStruct(StructRec{R}, OS); - } else if (R->isSubClassOf("FptrTypedef")) { - 
ProcessFptrTypedef(FptrTypedefRec{R}, OS); - } - } - // Generate auxiliary definitions (func param structs etc) + // Generate main API definitions + for (auto *R : Records.getAllDerivedDefinitions("Macro")) + ProcessMacro(MacroRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Handle")) + ProcessHandle(HandleRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Enum")) + ProcessEnum(EnumRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Typedef")) + ProcessTypedef(TypedefRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("FptrTypedef")) + ProcessFptrTypedef(FptrTypedefRec{R}, OS); + for (auto *R : Records.getAllDerivedDefinitions("Struct")) + ProcessStruct(StructRec{R}, OS); for (auto *R : Records.getAllDerivedDefinitions("Function")) { ProcessFuncParamStruct(FunctionRec{R}, OS); - } - - for (auto *R : Records.getAllDerivedDefinitions("Function")) { + ProcessFunction(FunctionRec{R}, OS); ProcessFuncWithCodeLocVariant(FunctionRec{R}, OS); } diff --git a/offload/tools/offload-tblgen/MiscGen.cpp b/offload/tools/offload-tblgen/MiscGen.cpp index b90e5cfd..8a8b9ca 100644 --- a/offload/tools/offload-tblgen/MiscGen.cpp +++ b/offload/tools/offload-tblgen/MiscGen.cpp @@ -86,7 +86,7 @@ void EmitOffloadErrcodes(const RecordKeeper &Records, raw_ostream &OS) { )"; - auto ErrorCodeEnum = EnumRec{Records.getDef("ErrorCode")}; + auto ErrorCodeEnum = EnumRec{Records.getDef("ol_errc_t")}; uint32_t EtorVal = 0; for (const auto &EnumVal : ErrorCodeEnum.getValues()) { OS << formatv(TAB_1 "OFFLOAD_ERRC({0}, \"{1}\", {2})\n", EnumVal.getName(), @@ -107,10 +107,16 @@ void EmitOffloadInfo(const RecordKeeper &Records, raw_ostream &OS) { )"; - auto ErrorCodeEnum = EnumRec{Records.getDef("DeviceInfo")}; - uint32_t EtorVal = 0; - for (const auto &EnumVal : ErrorCodeEnum.getValues()) { + auto Enum = EnumRec{Records.getDef("ol_device_info_t")}; + // Bitfields start from 1, other enums from 0 + uint32_t EtorVal = Enum.isBitField(); + for (const auto 
&EnumVal : Enum.getValues()) { OS << formatv(TAB_1 "OFFLOAD_DEVINFO({0}, \"{1}\", {2})\n", - EnumVal.getName(), EnumVal.getDesc(), EtorVal++); + EnumVal.getName(), EnumVal.getDesc(), EtorVal); + if (Enum.isBitField()) { + EtorVal <<= 1u; + } else { + ++EtorVal; + } } } diff --git a/offload/tools/offload-tblgen/RecordTypes.hpp b/offload/tools/offload-tblgen/RecordTypes.hpp index 65c0a4c..2abd9e1 100644 --- a/offload/tools/offload-tblgen/RecordTypes.hpp +++ b/offload/tools/offload-tblgen/RecordTypes.hpp @@ -16,25 +16,30 @@ namespace llvm { namespace offload { namespace tblgen { -class HandleRec { +class APIObject { public: - explicit HandleRec(const Record *rec) : rec(rec) {} - StringRef getName() const { return rec->getValueAsString("name"); } + StringRef getName() const { return rec->getName(); } StringRef getDesc() const { return rec->getValueAsString("desc"); } -private: +protected: + APIObject(const Record *rec) : rec(rec) {} const Record *rec; }; -class MacroRec { +class HandleRec : public APIObject { public: - explicit MacroRec(const Record *rec) : rec(rec) { - auto Name = rec->getValueAsString("name"); + explicit HandleRec(const Record *rec) : APIObject(rec) {}; +}; + +class MacroRec : public APIObject { +public: + explicit MacroRec(const Record *rec) : APIObject(rec) { + auto Name = rec->getName(); auto OpenBrace = Name.find_first_of("("); nameWithoutArgs = Name.substr(0, OpenBrace); } StringRef getName() const { return nameWithoutArgs; } - StringRef getNameWithArgs() const { return rec->getValueAsString("name"); } + StringRef getNameWithArgs() const { return rec->getName(); } StringRef getDesc() const { return rec->getValueAsString("desc"); } std::optional<StringRef> getCondition() const { @@ -46,19 +51,15 @@ public: } private: - const Record *rec; std::string nameWithoutArgs; }; -class TypedefRec { +class TypedefRec : public APIObject { public: - explicit TypedefRec(const Record *rec) : rec(rec) {} - StringRef getName() const { return 
rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } - StringRef getValue() const { return rec->getValueAsString("value"); } + explicit TypedefRec(const Record *rec) : APIObject(rec) {}; -private: - const Record *rec; +public: + StringRef getValue() const { return rec->getValueAsString("value"); } }; class EnumValueRec { @@ -74,15 +75,13 @@ private: const Record *rec; }; -class EnumRec { +class EnumRec : public APIObject { public: - explicit EnumRec(const Record *rec) : rec(rec) { + explicit EnumRec(const Record *rec) : APIObject(rec) { for (const auto *Val : rec->getValueAsListOfDefs("etors")) { vals.emplace_back(EnumValueRec{Val}); } } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } const std::vector<EnumValueRec> &getValues() const { return vals; } std::string getEnumValNamePrefix() const { @@ -92,8 +91,9 @@ public: bool isTyped() const { return rec->getValueAsBit("is_typed"); } + bool isBitField() const { return rec->getValueAsBit("is_bit_field"); } + private: - const Record *rec; std::vector<EnumValueRec> vals; }; @@ -110,22 +110,19 @@ private: const Record *rec; }; -class StructRec { +class StructRec : public APIObject { public: - explicit StructRec(const Record *rec) : rec(rec) { + explicit StructRec(const Record *rec) : APIObject(rec) { for (auto *Member : rec->getValueAsListOfDefs("all_members")) { members.emplace_back(StructMemberRec(Member)); } } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } std::optional<StringRef> getBaseClass() const { return rec->getValueAsOptionalString("base_class"); } const std::vector<StructMemberRec> &getMembers() const { return members; } private: - const Record *rec; std::vector<StructMemberRec> members; }; @@ -205,9 +202,9 @@ private: const Record *rec; }; -class FunctionRec { +class 
FunctionRec : public APIObject { public: - FunctionRec(const Record *rec) : rec(rec) { + FunctionRec(const Record *rec) : APIObject(rec) { for (auto &Ret : rec->getValueAsListOfDefs("all_returns")) rets.emplace_back(Ret); for (auto &Param : rec->getValueAsListOfDefs("params")) @@ -219,11 +216,9 @@ public: llvm::convertToSnakeFromCamelCase(getName())); } - StringRef getName() const { return rec->getValueAsString("name"); } StringRef getClass() const { return rec->getValueAsString("api_class"); } const std::vector<ReturnRec> &getReturns() const { return rets; } const std::vector<ParamRec> &getParams() const { return params; } - StringRef getDesc() const { return rec->getValueAsString("desc"); } std::vector<StringRef> getDetails() const { return rec->getValueAsListOfStrings("details"); } @@ -234,25 +229,19 @@ public: private: std::vector<ReturnRec> rets; std::vector<ParamRec> params; - - const Record *rec; }; -class FptrTypedefRec { +class FptrTypedefRec : public APIObject { public: - explicit FptrTypedefRec(const Record *rec) : rec(rec) { + explicit FptrTypedefRec(const Record *rec) : APIObject(rec) { for (auto &Param : rec->getValueAsListOfDefs("params")) params.emplace_back(Param); } - StringRef getName() const { return rec->getValueAsString("name"); } - StringRef getDesc() const { return rec->getValueAsString("desc"); } StringRef getReturn() const { return rec->getValueAsString("return"); } const std::vector<ParamRec> &getParams() const { return params; } private: std::vector<ParamRec> params; - - const Record *rec; }; } // namespace tblgen diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md new file mode 100644 index 0000000..0202242 --- /dev/null +++ b/offload/unittests/Conformance/README.md @@ -0,0 +1,83 @@ +# GPU Math Conformance Tests + +## Overview + +This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL. 
+ +While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison. + +The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data. + +### Table of Contents + +- [Getting Started](#getting-started) +- [Running the Tests](#running-the-tests) +- [Adding New Tests](#adding-new-tests) + +## Getting Started + +This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets. + +### System Requirements + +Before you begin, ensure your system meets the following requirements: + +- A system with an AMD or NVIDIA GPU. +- The latest proprietary GPU drivers installed. +- The corresponding development SDK for your hardware: + - **AMD:** [ROCm SDK](https://rocm.docs.amd.com) + - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) + +### Building the Dependencies + +The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. Please follow the instructions in the **"Standard runtimes build"** section of that guide: + +- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html) + +> [!IMPORTANT] +> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results. 
+ +## Running the Tests + +### Default Test + +To build and run the conformance test for a given function (e.g., `logf`) against the default C standard math library `llvm-libm` provider, use the following command. This will execute the test on all available and supported platforms. + +```bash +ninja -C build/runtimes/runtimes-bins offload.conformance.logf +``` + +### Testing Other Providers + +Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag. + +- **For `cuda-math` on an NVIDIA GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda + ``` + +- **For `hip-math` on an AMD GPU:** + + ```bash + ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu + ``` + +You can also run all available configurations for a test with: + +```bash +./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all +``` + +## Adding New Tests + +To add a conformance test for a new math function, follow these steps: + +1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`. + +2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy. + + - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage. + - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. 
Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space. + +3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable. diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp index a351e92..d80660b 100644 --- a/offload/unittests/Conformance/device_code/CUDAMath.cpp +++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp @@ -26,6 +26,22 @@ using namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return __nv_powf(Base, __nv_roundf(Exponent)); +} + +static inline double sincosSin(double X) { + double SinX, CosX; + __nv_sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + __nv_sincos(X, &SinX, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float SinX, CosX; __nv_sincosf(X, &SinX, &CosX); @@ -44,6 +60,11 @@ static inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_acos>(NumElements, Out, X); +} + __gpu_kernel void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_acosf>(NumElements, Out, X); @@ -54,6 +75,11 @@ __gpu_kernel void acoshfKernel(const float *X, float *Out, runKernelBody<__nv_acoshf>(NumElements, Out, X); } +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_asin>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_asinf>(NumElements, Out, X); @@ -69,16 +95,31 @@ __gpu_kernel void atanfKernel(const float *X, float *Out, 
runKernelBody<__nv_atanf>(NumElements, Out, X); } +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_atan2f>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_atanhf>(NumElements, Out, X); } +__gpu_kernel void cbrtKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_cbrt>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_cbrtf>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_cos>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_cosf>(NumElements, Out, X); @@ -99,51 +140,127 @@ __gpu_kernel void erffKernel(const float *X, float *Out, runKernelBody<__nv_erff>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_expf>(NumElements, Out, X); } +__gpu_kernel void exp10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp10>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_exp10f>(NumElements, Out, X); } +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_exp2>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_exp2f>(NumElements, Out, X); } +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) 
noexcept { + runKernelBody<__nv_expm1>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_expm1f>(NumElements, Out, X); } +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_hypot>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_hypotf>(NumElements, Out, X, Y); +} + +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_logf>(NumElements, Out, X); } +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log10>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_log10f>(NumElements, Out, X); } +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log1p>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_log1pf>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_log2>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_log2f>(NumElements, Out, X); } +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_powf>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + 
runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_sin>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_sinf>(NumElements, Out, X); } +__gpu_kernel void sincosSinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -164,6 +281,11 @@ __gpu_kernel void sinpifKernel(const float *X, float *Out, runKernelBody<__nv_sinpif>(NumElements, Out, X); } +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__nv_tan>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__nv_tanf>(NumElements, Out, X); diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp index 8476dcb..894652a 100644 --- a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp +++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp @@ -48,29 +48,49 @@ extern const inline uint32_t __oclc_ISA_version = 9000; extern "C" { +double __nv_acos(double); float __nv_acosf(float); float __nv_acoshf(float); +double __nv_asin(double); float __nv_asinf(float); float __nv_asinhf(float); float __nv_atanf(float); +float __nv_atan2f(float, float); float __nv_atanhf(float); +double __nv_cbrt(double); float __nv_cbrtf(float); +double __nv_cos(double); float __nv_cosf(float); float __nv_coshf(float); float __nv_cospif(float); float __nv_erff(float); 
+double __nv_exp(double); float __nv_expf(float); +double __nv_exp10(double); float __nv_exp10f(float); +double __nv_exp2(double); float __nv_exp2f(float); +double __nv_expm1(double); float __nv_expm1f(float); +double __nv_hypot(double, double); +float __nv_hypotf(float, float); +double __nv_log(double); float __nv_logf(float); +double __nv_log10(double); float __nv_log10f(float); +double __nv_log1p(double); float __nv_log1pf(float); +double __nv_log2(double); float __nv_log2f(float); +float __nv_powf(float, float); +float __nv_roundf(float); +double __nv_sin(double); float __nv_sinf(float); +void __nv_sincos(double, double *, double *); void __nv_sincosf(float, float *, float *); float __nv_sinhf(float); float __nv_sinpif(float); +double __nv_tan(double); float __nv_tanf(float); float __nv_tanhf(float); } // extern "C" @@ -81,31 +101,70 @@ float __nv_tanhf(float); extern "C" { +double __ocml_acos_f64(double); float __ocml_acos_f32(float); +float16 __ocml_acos_f16(float16); float __ocml_acosh_f32(float); +float16 __ocml_acosh_f16(float16); +double __ocml_asin_f64(double); float __ocml_asin_f32(float); +float16 __ocml_asin_f16(float16); float __ocml_asinh_f32(float); +float16 __ocml_asinh_f16(float16); float __ocml_atan_f32(float); +float16 __ocml_atan_f16(float16); +float __ocml_atan2_f32(float, float); float __ocml_atanh_f32(float); +float16 __ocml_atanh_f16(float16); +double __ocml_cbrt_f64(double); float __ocml_cbrt_f32(float); +double __ocml_cos_f64(double); float __ocml_cos_f32(float); +float16 __ocml_cos_f16(float16); float __ocml_cosh_f32(float); +float16 __ocml_cosh_f16(float16); float __ocml_cospi_f32(float); float __ocml_erf_f32(float); +double __ocml_exp_f64(double); float __ocml_exp_f32(float); +float16 __ocml_exp_f16(float16); +double __ocml_exp10_f64(double); float __ocml_exp10_f32(float); +float16 __ocml_exp10_f16(float16); +double __ocml_exp2_f64(double); float __ocml_exp2_f32(float); +float16 __ocml_exp2_f16(float16); +double 
__ocml_expm1_f64(double); float __ocml_expm1_f32(float); +float16 __ocml_expm1_f16(float16); +double __ocml_hypot_f64(double, double); +float __ocml_hypot_f32(float, float); +double __ocml_log_f64(double); float __ocml_log_f32(float); +float16 __ocml_log_f16(float16); +double __ocml_log10_f64(double); float __ocml_log10_f32(float); +float16 __ocml_log10_f16(float16); +double __ocml_log1p_f64(double); float __ocml_log1p_f32(float); +double __ocml_log2_f64(double); float __ocml_log2_f32(float); +float16 __ocml_log2_f16(float16); +float __ocml_pow_f32(float, float); +float __ocml_round_f32(float); +double __ocml_sin_f64(double); float __ocml_sin_f32(float); +float16 __ocml_sin_f16(float16); +double __ocml_sincos_f64(double, double *); float __ocml_sincos_f32(float, float *); float __ocml_sinh_f32(float); +float16 __ocml_sinh_f16(float16); float __ocml_sinpi_f32(float); +double __ocml_tan_f64(double); float __ocml_tan_f32(float); +float16 __ocml_tan_f16(float16); float __ocml_tanh_f32(float); +float16 __ocml_tanh_f16(float16); } // extern "C" #endif // HIP_MATH_FOUND diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp index 36efe6b..7cc0ad5 100644 --- a/offload/unittests/Conformance/device_code/HIPMath.cpp +++ b/offload/unittests/Conformance/device_code/HIPMath.cpp @@ -26,6 +26,22 @@ using namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return __ocml_pow_f32(Base, __ocml_round_f32(Exponent)); +} + +static inline double sincosSin(double X) { + double CosX; + double SinX = __ocml_sincos_f64(X, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double CosX; + double SinX = __ocml_sincos_f64(X, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float CosX; float SinX = __ocml_sincos_f32(X, &CosX); @@ -44,51 +60,116 @@ static 
inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acos_f64>(NumElements, Out, X); +} + __gpu_kernel void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_acos_f32>(NumElements, Out, X); } +__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acos_f16>(NumElements, Out, X); +} + __gpu_kernel void acoshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_acosh_f32>(NumElements, Out, X); } +__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_acosh_f16>(NumElements, Out, X); +} + +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asin_f64>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_asin_f32>(NumElements, Out, X); } +__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asin_f16>(NumElements, Out, X); +} + __gpu_kernel void asinhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_asinh_f32>(NumElements, Out, X); } +__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_asinh_f16>(NumElements, Out, X); +} + __gpu_kernel void atanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_atan_f32>(NumElements, Out, X); } +__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_atan_f16>(NumElements, Out, X); +} + +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + 
runKernelBody<__ocml_atan2_f32>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_atanh_f32>(NumElements, Out, X); } +__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_atanh_f16>(NumElements, Out, X); +} + +__gpu_kernel void cbrtKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cbrt_f64>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cos_f64>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cos_f32>(NumElements, Out, X); } +__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cos_f16>(NumElements, Out, X); +} + __gpu_kernel void coshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cosh_f32>(NumElements, Out, X); } +__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_cosh_f16>(NumElements, Out, X); +} + __gpu_kernel void cospifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_cospi_f32>(NumElements, Out, X); @@ -99,51 +180,167 @@ __gpu_kernel void erffKernel(const float *X, float *Out, runKernelBody<__ocml_erf_f32>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp_f64>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp_f32>(NumElements, Out, X); } +__gpu_kernel 
void expf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp_f16>(NumElements, Out, X); +} + +__gpu_kernel void exp10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp10_f64>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp10_f32>(NumElements, Out, X); } +__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp10_f16>(NumElements, Out, X); +} + +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp2_f64>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_exp2_f32>(NumElements, Out, X); } +__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_exp2_f16>(NumElements, Out, X); +} + +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_expm1_f64>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_expm1_f32>(NumElements, Out, X); } +__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_expm1_f16>(NumElements, Out, X); +} + +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_hypot_f64>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_hypot_f32>(NumElements, Out, X, Y); +} + +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + 
runKernelBody<__ocml_log_f64>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log_f32>(NumElements, Out, X); } +__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log_f16>(NumElements, Out, X); +} + +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log10_f64>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log10_f32>(NumElements, Out, X); } +__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log10_f16>(NumElements, Out, X); +} + +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log1p_f64>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log1p_f32>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log2_f64>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_log2_f32>(NumElements, Out, X); } +__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_log2_f16>(NumElements, Out, X); +} + +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_pow_f32>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t 
NumElements) noexcept { + runKernelBody<__ocml_sin_f64>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_sin_f32>(NumElements, Out, X); } +__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_sin_f16>(NumElements, Out, X); +} + +__gpu_kernel void sincosSinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -159,20 +356,40 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out, runKernelBody<__ocml_sinh_f32>(NumElements, Out, X); } +__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_sinh_f16>(NumElements, Out, X); +} + __gpu_kernel void sinpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X); } +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_tan_f64>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_tan_f32>(NumElements, Out, X); } +__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<__ocml_tan_f16>(NumElements, Out, X); +} + __gpu_kernel void tanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<__ocml_tanh_f32>(NumElements, Out, X); } + +__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + 
runKernelBody<__ocml_tanh_f16>(NumElements, Out, X); +} } // extern "C" #endif // HIP_MATH_FOUND diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp index 8869d87..8673d80 100644 --- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp +++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp @@ -25,6 +25,22 @@ using namespace kernels; // Helpers //===----------------------------------------------------------------------===// +static inline float powfRoundedExponent(float Base, float Exponent) { + return powf(Base, roundf(Exponent)); +} + +static inline double sincosSin(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return CosX; +} + static inline float sincosfSin(float X) { float SinX, CosX; sincosf(X, &SinX, &CosX); @@ -43,111 +59,302 @@ static inline float sincosfCos(float X) { extern "C" { +__gpu_kernel void acosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<acos>(NumElements, Out, X); +} + __gpu_kernel void acosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<acosf>(NumElements, Out, X); } +__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<acosf16>(NumElements, Out, X); +} + __gpu_kernel void acoshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<acoshf>(NumElements, Out, X); } +__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<acoshf16>(NumElements, Out, X); +} + +__gpu_kernel void acospif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<acospif16>(NumElements, Out, X); +} + +__gpu_kernel void asinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + 
runKernelBody<asin>(NumElements, Out, X); +} + __gpu_kernel void asinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<asinf>(NumElements, Out, X); } +__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<asinf16>(NumElements, Out, X); +} + __gpu_kernel void asinhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<asinhf>(NumElements, Out, X); } +__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<asinhf16>(NumElements, Out, X); +} + __gpu_kernel void atanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<atanf>(NumElements, Out, X); } +__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<atanf16>(NumElements, Out, X); +} + +__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<atan2f>(NumElements, Out, X, Y); +} + __gpu_kernel void atanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<atanhf>(NumElements, Out, X); } +__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<atanhf16>(NumElements, Out, X); +} + +__gpu_kernel void cbrtKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<cbrt>(NumElements, Out, X); +} + __gpu_kernel void cbrtfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cbrtf>(NumElements, Out, X); } +__gpu_kernel void cosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<cos>(NumElements, Out, X); +} + __gpu_kernel void cosfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cosf>(NumElements, Out, X); } +__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { 
+ runKernelBody<cosf16>(NumElements, Out, X); +} + __gpu_kernel void coshfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<coshf>(NumElements, Out, X); } +__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<coshf16>(NumElements, Out, X); +} + __gpu_kernel void cospifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<cospif>(NumElements, Out, X); } +__gpu_kernel void cospif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<cospif16>(NumElements, Out, X); +} + __gpu_kernel void erffKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<erff>(NumElements, Out, X); } +__gpu_kernel void expKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp>(NumElements, Out, X); +} + __gpu_kernel void expfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<expf>(NumElements, Out, X); } +__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<expf16>(NumElements, Out, X); +} + +__gpu_kernel void exp10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp10>(NumElements, Out, X); +} + __gpu_kernel void exp10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<exp10f>(NumElements, Out, X); } +__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<exp10f16>(NumElements, Out, X); +} + +__gpu_kernel void exp2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<exp2>(NumElements, Out, X); +} + __gpu_kernel void exp2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<exp2f>(NumElements, Out, X); } +__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + 
runKernelBody<exp2f16>(NumElements, Out, X); +} + +__gpu_kernel void expm1Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<expm1>(NumElements, Out, X); +} + __gpu_kernel void expm1fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<expm1f>(NumElements, Out, X); } -__gpu_kernel void hypotf16Kernel(const float16 *X, float16 *Y, float16 *Out, +__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out, size_t NumElements) noexcept { + runKernelBody<expm1f16>(NumElements, Out, X); +} + +__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out, + size_t NumElements) noexcept { + runKernelBody<hypot>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<hypotf>(NumElements, Out, X, Y); +} + +__gpu_kernel void hypotf16Kernel(const float16 *X, const float16 *Y, + float16 *Out, size_t NumElements) noexcept { runKernelBody<hypotf16>(NumElements, Out, X, Y); } +__gpu_kernel void logKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log>(NumElements, Out, X); +} + __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<logf>(NumElements, Out, X); } +__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<logf16>(NumElements, Out, X); +} + +__gpu_kernel void log10Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log10>(NumElements, Out, X); +} + __gpu_kernel void log10fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log10f>(NumElements, Out, X); } +__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<log10f16>(NumElements, Out, X); +} + +__gpu_kernel void log1pKernel(const double *X, double *Out, + size_t NumElements) noexcept { 
+ runKernelBody<log1p>(NumElements, Out, X); +} + __gpu_kernel void log1pfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log1pf>(NumElements, Out, X); } +__gpu_kernel void log2Kernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<log2>(NumElements, Out, X); +} + __gpu_kernel void log2fKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<log2f>(NumElements, Out, X); } +__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<log2f16>(NumElements, Out, X); +} + +__gpu_kernel void powfKernel(const float *X, float *Y, float *Out, + size_t NumElements) noexcept { + runKernelBody<powf>(NumElements, Out, X, Y); +} + +__gpu_kernel void powfRoundedExponentKernel(const float *X, float *Y, + float *Out, + size_t NumElements) noexcept { + runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y); +} + +__gpu_kernel void sinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sin>(NumElements, Out, X); +} + __gpu_kernel void sinfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sinf>(NumElements, Out, X); } +__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<sinf16>(NumElements, Out, X); +} + +__gpu_kernel void sincosSinKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosSin>(NumElements, Out, X); +} + +__gpu_kernel void sincosCosKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<sincosCos>(NumElements, Out, X); +} + __gpu_kernel void sincosfSinKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sincosfSin>(NumElements, Out, X); @@ -163,23 +370,53 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out, runKernelBody<sinhf>(NumElements, Out, X); } +__gpu_kernel void sinhf16Kernel(const float16 *X, float16 
*Out, + size_t NumElements) noexcept { + runKernelBody<sinhf16>(NumElements, Out, X); +} + __gpu_kernel void sinpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<sinpif>(NumElements, Out, X); } +__gpu_kernel void sinpif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<sinpif16>(NumElements, Out, X); +} + +__gpu_kernel void tanKernel(const double *X, double *Out, + size_t NumElements) noexcept { + runKernelBody<tan>(NumElements, Out, X); +} + __gpu_kernel void tanfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanf>(NumElements, Out, X); } +__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanf16>(NumElements, Out, X); +} + __gpu_kernel void tanhfKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanhf>(NumElements, Out, X); } +__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanhf16>(NumElements, Out, X); +} + __gpu_kernel void tanpifKernel(const float *X, float *Out, size_t NumElements) noexcept { runKernelBody<tanpif>(NumElements, Out, X); } + +__gpu_kernel void tanpif16Kernel(const float16 *X, float16 *Out, + size_t NumElements) noexcept { + runKernelBody<tanpif16>(NumElements, Out, X); +} } // extern "C" diff --git a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp index 6f7f7a9..39c6838 100644 --- a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp +++ b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp @@ -8,8 +8,8 @@ /// /// \file /// This file contains the definition of the ExhaustiveGenerator class, a -/// concrete input generator that exhaustively creates inputs from a given -/// sequence of ranges. 
+/// concrete range-based generator that exhaustively creates inputs from a +/// given sequence of ranges. /// //===----------------------------------------------------------------------===// @@ -17,89 +17,62 @@ #define MATHTEST_EXHAUSTIVEGENERATOR_HPP #include "mathtest/IndexedRange.hpp" -#include "mathtest/InputGenerator.hpp" +#include "mathtest/RangeBasedGenerator.hpp" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/Support/Parallel.h" - -#include <algorithm> #include <array> #include <cassert> #include <cstddef> #include <cstdint> +#include <optional> #include <tuple> namespace mathtest { template <typename... InTypes> class [[nodiscard]] ExhaustiveGenerator final - : public InputGenerator<InTypes...> { - static constexpr std::size_t NumInputs = sizeof...(InTypes); - static_assert(NumInputs > 0, "The number of inputs must be at least 1"); + : public RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...> { + + friend class RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>; + + using Base = RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>; + using IndexArrayType = std::array<uint64_t, Base::NumInputs>; + + using Base::RangesTuple; + using Base::Size; public: explicit constexpr ExhaustiveGenerator( const IndexedRange<InTypes> &...Ranges) noexcept - : RangesTuple(Ranges...) { - bool Overflowed = getSizeWithOverflow(Ranges..., Size); + : Base(Ranges...) 
{ + const auto MaybeSize = getInputSpaceSize(Ranges...); + + assert(MaybeSize.has_value() && "The size is too large"); + Size = *MaybeSize; - assert(!Overflowed && "The input space size is too large"); - assert((Size > 0) && "The input space size must be at least 1"); + assert((Size > 0) && "The size must be at least 1"); IndexArrayType DimSizes = {}; std::size_t DimIndex = 0; ((DimSizes[DimIndex++] = Ranges.getSize()), ...); - Strides[NumInputs - 1] = 1; - if constexpr (NumInputs > 1) - for (int Index = static_cast<int>(NumInputs) - 2; Index >= 0; --Index) + Strides[Base::NumInputs - 1] = 1; + if constexpr (Base::NumInputs > 1) + for (int Index = static_cast<int>(Base::NumInputs) - 2; Index >= 0; + --Index) Strides[Index] = Strides[Index + 1] * DimSizes[Index + 1]; } - void reset() noexcept override { NextFlatIndex = 0; } - - [[nodiscard]] std::size_t - fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override { - const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...}; - const std::size_t BufferSize = BufferSizes[0]; - assert((BufferSize != 0) && "Buffer size cannot be zero"); - assert(std::all_of(BufferSizes.begin(), BufferSizes.end(), - [&](std::size_t Size) { return Size == BufferSize; }) && - "All input buffers must have the same size"); - - if (NextFlatIndex >= Size) - return 0; - - const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex); - const auto CurrentFlatIndex = NextFlatIndex; - NextFlatIndex += BatchSize; - - auto BufferPtrsTuple = std::make_tuple(Buffers.data()...); - - llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) { - writeInputs(CurrentFlatIndex, Offset, BufferPtrsTuple); - }); - - return static_cast<std::size_t>(BatchSize); - } - private: - using RangesTupleType = std::tuple<IndexedRange<InTypes>...>; - using IndexArrayType = std::array<uint64_t, NumInputs>; - - static bool getSizeWithOverflow(const IndexedRange<InTypes> &...Ranges, - uint64_t &Size) noexcept { - Size = 1; - bool 
Overflowed = false; - - auto Multiplier = [&](const uint64_t RangeSize) { - if (!Overflowed) - Overflowed = __builtin_mul_overflow(Size, RangeSize, &Size); - }; + [[nodiscard]] constexpr IndexArrayType + getNDIndex(uint64_t FlatIndex) const noexcept { + IndexArrayType NDIndex; - (Multiplier(Ranges.getSize()), ...); + for (std::size_t Index = 0; Index < Base::NumInputs; ++Index) { + NDIndex[Index] = FlatIndex / Strides[Index]; + FlatIndex -= NDIndex[Index] * Strides[Index]; + } - return Overflowed; + return NDIndex; } template <typename BufferPtrsTupleType> @@ -109,31 +82,37 @@ private: writeInputsImpl<0>(NDIndex, Offset, BufferPtrsTuple); } - constexpr IndexArrayType getNDIndex(uint64_t FlatIndex) const noexcept { - IndexArrayType NDIndex; - - for (std::size_t Index = 0; Index < NumInputs; ++Index) { - NDIndex[Index] = FlatIndex / Strides[Index]; - FlatIndex -= NDIndex[Index] * Strides[Index]; - } - - return NDIndex; - } - template <std::size_t Index, typename BufferPtrsTupleType> void writeInputsImpl(IndexArrayType NDIndex, uint64_t Offset, BufferPtrsTupleType BufferPtrsTuple) const noexcept { - if constexpr (Index < NumInputs) { + if constexpr (Index < Base::NumInputs) { const auto &Range = std::get<Index>(RangesTuple); std::get<Index>(BufferPtrsTuple)[Offset] = Range[NDIndex[Index]]; + writeInputsImpl<Index + 1>(NDIndex, Offset, BufferPtrsTuple); } } - uint64_t Size = 1; - RangesTupleType RangesTuple; + [[nodiscard]] static constexpr std::optional<uint64_t> + getInputSpaceSize(const IndexedRange<InTypes> &...Ranges) noexcept { + uint64_t InputSpaceSize = 1; + bool Overflowed = false; + + auto Multiplier = [&](const uint64_t RangeSize) { + if (!Overflowed) + Overflowed = + __builtin_mul_overflow(InputSpaceSize, RangeSize, &InputSpaceSize); + }; + + (Multiplier(Ranges.getSize()), ...); + + if (Overflowed) + return std::nullopt; + + return InputSpaceSize; + } + IndexArrayType Strides = {}; - uint64_t NextFlatIndex = 0; }; } // namespace mathtest diff --git 
a/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp new file mode 100644 index 0000000..436cd05 --- /dev/null +++ b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the definition of the RandomGenerator class, a concrete +/// range-based generator that randomly creates inputs from a given sequence of +/// ranges. +/// +//===----------------------------------------------------------------------===// + +#ifndef MATHTEST_RANDOMGENERATOR_HPP +#define MATHTEST_RANDOMGENERATOR_HPP + +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/RangeBasedGenerator.hpp" + +#include <cstddef> +#include <cstdint> +#include <tuple> + +namespace mathtest { + +template <typename... 
InTypes> +class [[nodiscard]] RandomGenerator final + : public RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...> { + + friend class RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>; + + using Base = RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>; + + using Base::RangesTuple; + using Base::Size; + +public: + explicit constexpr RandomGenerator( + SeedTy BaseSeed, uint64_t Size, + const IndexedRange<InTypes> &...Ranges) noexcept + : Base(Size, Ranges...), BaseSeed(BaseSeed) {} + +private: + [[nodiscard]] static uint64_t getRandomIndex(RandomState &RNG, + uint64_t RangeSize) noexcept { + if (RangeSize == 0) + return 0; + + const uint64_t Threshold = (-RangeSize) % RangeSize; + + uint64_t RandomNumber; + do { + RandomNumber = RNG.next(); + } while (RandomNumber < Threshold); + + return RandomNumber % RangeSize; + } + + template <typename BufferPtrsTupleType> + void writeInputs(uint64_t CurrentFlatIndex, uint64_t Offset, + BufferPtrsTupleType BufferPtrsTuple) const noexcept { + + RandomState RNG(SeedTy{BaseSeed.Value ^ (CurrentFlatIndex + Offset)}); + writeInputsImpl<0>(RNG, Offset, BufferPtrsTuple); + } + + template <std::size_t Index, typename BufferPtrsTupleType> + void writeInputsImpl(RandomState &RNG, uint64_t Offset, + BufferPtrsTupleType BufferPtrsTuple) const noexcept { + if constexpr (Index < Base::NumInputs) { + const auto &Range = std::get<Index>(RangesTuple); + const auto RandomIndex = getRandomIndex(RNG, Range.getSize()); + std::get<Index>(BufferPtrsTuple)[Offset] = Range[RandomIndex]; + + writeInputsImpl<Index + 1>(RNG, Offset, BufferPtrsTuple); + } + } + + SeedTy BaseSeed; +}; +} // namespace mathtest + +#endif // MATHTEST_RANDOMGENERATOR_HPP diff --git a/offload/unittests/Conformance/include/mathtest/RandomState.hpp b/offload/unittests/Conformance/include/mathtest/RandomState.hpp new file mode 100644 index 0000000..322d531 --- /dev/null +++ b/offload/unittests/Conformance/include/mathtest/RandomState.hpp @@ -0,0 +1,53 
@@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the definition of the RandomState class, a fast and +/// lightweight pseudo-random number generator. +/// +/// The implementation is based on the xorshift* generator, seeded using the +/// SplitMix64 generator for robust initialization. For more details on the +/// algorithm, see: https://en.wikipedia.org/wiki/Xorshift +/// +//===----------------------------------------------------------------------===// + +#ifndef MATHTEST_RANDOMSTATE_HPP +#define MATHTEST_RANDOMSTATE_HPP + +#include <cstdint> + +struct SeedTy { + uint64_t Value; +}; + +class [[nodiscard]] RandomState { + uint64_t State; + + [[nodiscard]] static constexpr uint64_t splitMix64(uint64_t X) noexcept { + X += 0x9E3779B97F4A7C15ULL; + X = (X ^ (X >> 30)) * 0xBF58476D1CE4E5B9ULL; + X = (X ^ (X >> 27)) * 0x94D049BB133111EBULL; + X = (X ^ (X >> 31)); + return X ? 
X : 0x9E3779B97F4A7C15ULL; + } + +public: + explicit constexpr RandomState(SeedTy Seed) noexcept + : State(splitMix64(Seed.Value)) {} + + inline uint64_t next() noexcept { + uint64_t X = State; + X ^= X >> 12; + X ^= X << 25; + X ^= X >> 27; + State = X; + return X * 0x2545F4914F6CDD1DULL; + } +}; + +#endif // MATHTEST_RANDOMSTATE_HPP diff --git a/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp new file mode 100644 index 0000000..5e1e113 --- /dev/null +++ b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the definition of the RangeBasedGenerator class, a base +/// class for input generators that operate on a sequence of ranges. +/// +//===----------------------------------------------------------------------===// + +#ifndef MATHTEST_RANGEBASEDGENERATOR_HPP +#define MATHTEST_RANGEBASEDGENERATOR_HPP + +#include "mathtest/IndexedRange.hpp" +#include "mathtest/InputGenerator.hpp" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Parallel.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <tuple> + +namespace mathtest { + +template <typename Derived, typename... InTypes> +class [[nodiscard]] RangeBasedGenerator : public InputGenerator<InTypes...> { +public: + void reset() noexcept override { NextFlatIndex = 0; } + + [[nodiscard]] std::size_t + fill(llvm::MutableArrayRef<InTypes>... 
Buffers) noexcept override { + const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...}; + const std::size_t BufferSize = BufferSizes[0]; + assert((BufferSize != 0) && "Buffer size cannot be zero"); + assert(std::all_of(BufferSizes.begin(), BufferSizes.end(), + [&](std::size_t Size) { return Size == BufferSize; }) && + "All input buffers must have the same size"); + + if (NextFlatIndex >= Size) + return 0; + + const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex); + const auto CurrentFlatIndex = NextFlatIndex; + NextFlatIndex += BatchSize; + + auto BufferPtrsTuple = std::make_tuple(Buffers.data()...); + + llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) { + static_cast<Derived *>(this)->writeInputs(CurrentFlatIndex, Offset, + BufferPtrsTuple); + }); + + return static_cast<std::size_t>(BatchSize); + } + +protected: + using RangesTupleType = std::tuple<IndexedRange<InTypes>...>; + + static constexpr std::size_t NumInputs = sizeof...(InTypes); + static_assert(NumInputs > 0, "The number of inputs must be at least 1"); + + explicit constexpr RangeBasedGenerator( + const IndexedRange<InTypes> &...Ranges) noexcept + : RangesTuple(Ranges...) 
{} + + explicit constexpr RangeBasedGenerator( + uint64_t Size, const IndexedRange<InTypes> &...Ranges) noexcept + : RangesTuple(Ranges...), Size(Size) {} + + RangesTupleType RangesTuple; + uint64_t Size = 0; + +private: + uint64_t NextFlatIndex = 0; +}; +} // namespace mathtest + +#endif // MATHTEST_RANGEBASEDGENERATOR_HPP diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp index a0068c3..6c3425f 100644 --- a/offload/unittests/Conformance/lib/DeviceContext.cpp +++ b/offload/unittests/Conformance/lib/DeviceContext.cpp @@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{}; [[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) { std::size_t PropSize = 0; - OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize)); + OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, + &PropSize)); if (PropSize == 0) return ""; std::string PropValue(PropSize, '\0'); - OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize, + OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize, PropValue.data())); PropValue.pop_back(); // Remove the null terminator diff --git a/offload/unittests/Conformance/tests/AcosTest.cpp b/offload/unittests/Conformance/tests/AcosTest.cpp new file mode 100644 index 0000000..bc0d1d2 --- /dev/null +++ b/offload/unittests/Conformance/tests/AcosTest.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the acos function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'acos' function to select the double version +constexpr auto acosd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(acos); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<acosd> { + static constexpr llvm::StringRef Name = "acos"; + static constexpr llvm::StringRef KernelName = "acosKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the acos function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/-1.0, + /*End=*/1.0, + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<acosd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Acosf16Test.cpp b/offload/unittests/Conformance/tests/Acosf16Test.cpp new file mode 100644 index 0000000..ce11cc2 --- /dev/null +++ b/offload/unittests/Conformance/tests/Acosf16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the acosf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 acosf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<acosf16> { + static constexpr llvm::StringRef Name = "acosf16"; + static constexpr llvm::StringRef KernelName = "acosf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the acosf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(-1.0), + /*End=*/float16(1.0), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<acosf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/AcosfTest.cpp b/offload/unittests/Conformance/tests/AcosfTest.cpp index e69ee3b..65b2d18 100644 --- a/offload/unittests/Conformance/tests/AcosfTest.cpp +++ b/offload/unittests/Conformance/tests/AcosfTest.cpp @@ -40,7 +40,9 @@ int main(int argc, const char **argv) { using namespace mathtest; - IndexedRange<float> Range; + IndexedRange<float> Range(/*Begin=*/-1.0f, + /*End=*/1.0f, + /*Inclusive=*/true); ExhaustiveGenerator<float> Generator(Range); const auto Configs = cl::getTestConfigs(); diff --git a/offload/unittests/Conformance/tests/Acoshf16Test.cpp b/offload/unittests/Conformance/tests/Acoshf16Test.cpp new file mode 100644 index 0000000..8043447 --- /dev/null +++ b/offload/unittests/Conformance/tests/Acoshf16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the acoshf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 acoshf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<acoshf16> { + static constexpr llvm::StringRef Name = "acoshf16"; + static constexpr llvm::StringRef KernelName = "acoshf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the acoshf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(1.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<acoshf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Acospif16Test.cpp b/offload/unittests/Conformance/tests/Acospif16Test.cpp new file mode 100644 index 0000000..c5871e2 --- /dev/null +++ b/offload/unittests/Conformance/tests/Acospif16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the acospif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 acospif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<acospif16> { + static constexpr llvm::StringRef Name = "acospif16"; + static constexpr llvm::StringRef KernelName = "acospif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the acospif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(-1.0), + /*End=*/float16(1.0), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<acospif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/AsinTest.cpp b/offload/unittests/Conformance/tests/AsinTest.cpp new file mode 100644 index 0000000..aaaa37a --- /dev/null +++ b/offload/unittests/Conformance/tests/AsinTest.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the asin function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'asin' function to select the double version +constexpr auto asind // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(asin); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<asind> { + static constexpr llvm::StringRef Name = "asin"; + static constexpr llvm::StringRef KernelName = "asinKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the asin function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/-1.0, + /*End=*/1.0, + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<asind>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Asinf16Test.cpp b/offload/unittests/Conformance/tests/Asinf16Test.cpp new file mode 100644 index 0000000..5784d6b --- /dev/null +++ b/offload/unittests/Conformance/tests/Asinf16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the asinf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 asinf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<asinf16> { + static constexpr llvm::StringRef Name = "asinf16"; + static constexpr llvm::StringRef KernelName = "asinf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the asinf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(-1.0), + /*End=*/float16(1.0), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<asinf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/AsinfTest.cpp b/offload/unittests/Conformance/tests/AsinfTest.cpp index 991f79b..aeee648 100644 --- a/offload/unittests/Conformance/tests/AsinfTest.cpp +++ b/offload/unittests/Conformance/tests/AsinfTest.cpp @@ -40,7 +40,9 @@ int main(int argc, const char **argv) { using namespace mathtest; - IndexedRange<float> Range; + IndexedRange<float> Range(/*Begin=*/-1.0f, + /*End=*/1.0f, + /*Inclusive=*/true); ExhaustiveGenerator<float> Generator(Range); const auto Configs = cl::getTestConfigs(); diff --git a/offload/unittests/Conformance/tests/Asinhf16Test.cpp b/offload/unittests/Conformance/tests/Asinhf16Test.cpp new file mode 100644 index 0000000..0af9bcb --- /dev/null +++ b/offload/unittests/Conformance/tests/Asinhf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the asinhf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 asinhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<asinhf16> { + static constexpr llvm::StringRef Name = "asinhf16"; + static constexpr llvm::StringRef KernelName = "asinhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the asinhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<asinhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Atan2fTest.cpp b/offload/unittests/Conformance/tests/Atan2fTest.cpp new file mode 100644 index 0000000..4a46f9a --- /dev/null +++ b/offload/unittests/Conformance/tests/Atan2fTest.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the atan2f function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace mathtest { + +template <> struct FunctionConfig<atan2f> { + static constexpr llvm::StringRef Name = "atan2f"; + static constexpr llvm::StringRef KernelName = "atan2fKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 6; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the atan2f function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<float> RangeX; + IndexedRange<float> RangeY; + RandomGenerator<float, float> Generator(SeedTy{Seed}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<atan2f>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Atanf16Test.cpp b/offload/unittests/Conformance/tests/Atanf16Test.cpp new file mode 100644 index 0000000..3d3fa38 --- /dev/null +++ b/offload/unittests/Conformance/tests/Atanf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the atanf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 atanf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<atanf16> { + static constexpr llvm::StringRef Name = "atanf16"; + static constexpr llvm::StringRef KernelName = "atanf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the atanf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<atanf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Atanhf16Test.cpp b/offload/unittests/Conformance/tests/Atanhf16Test.cpp new file mode 100644 index 0000000..86a0f82 --- /dev/null +++ b/offload/unittests/Conformance/tests/Atanhf16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the atanhf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 atanhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<atanhf16> { + static constexpr llvm::StringRef Name = "atanhf16"; + static constexpr llvm::StringRef KernelName = "atanhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the atanhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(-1.0), + /*End=*/float16(1.0), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<atanhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/CMakeLists.txt b/offload/unittests/Conformance/tests/CMakeLists.txt index 8c0109ba..ad94df8 100644 --- a/offload/unittests/Conformance/tests/CMakeLists.txt +++ b/offload/unittests/Conformance/tests/CMakeLists.txt @@ -3,30 +3,72 @@ if(NOT TARGET libc) return() endif() +add_conformance_test(acos AcosTest.cpp) add_conformance_test(acosf AcosfTest.cpp) +add_conformance_test(acosf16 Acosf16Test.cpp) add_conformance_test(acoshf AcoshfTest.cpp) +add_conformance_test(acoshf16 Acoshf16Test.cpp) +add_conformance_test(acospif16 Acospif16Test.cpp) +add_conformance_test(asin AsinTest.cpp) add_conformance_test(asinf AsinfTest.cpp) +add_conformance_test(asinf16 Asinf16Test.cpp) add_conformance_test(asinhf AsinhfTest.cpp) +add_conformance_test(asinhf16 Asinhf16Test.cpp) add_conformance_test(atanf AtanfTest.cpp) +add_conformance_test(atanf16 Atanf16Test.cpp) +add_conformance_test(atan2f Atan2fTest.cpp) add_conformance_test(atanhf AtanhfTest.cpp) +add_conformance_test(atanhf16 Atanhf16Test.cpp) +add_conformance_test(cbrt CbrtTest.cpp) add_conformance_test(cbrtf CbrtfTest.cpp) +add_conformance_test(cos CosTest.cpp) add_conformance_test(cosf CosfTest.cpp) +add_conformance_test(cosf16 Cosf16Test.cpp) add_conformance_test(coshf CoshfTest.cpp) +add_conformance_test(coshf16 Coshf16Test.cpp) add_conformance_test(cospif CospifTest.cpp) +add_conformance_test(cospif16 Cospif16Test.cpp) add_conformance_test(erff ErffTest.cpp) +add_conformance_test(exp ExpTest.cpp) add_conformance_test(expf ExpfTest.cpp) +add_conformance_test(expf16 Expf16Test.cpp) +add_conformance_test(exp10 Exp10Test.cpp) add_conformance_test(exp10f Exp10fTest.cpp) +add_conformance_test(exp10f16 Exp10f16Test.cpp) +add_conformance_test(exp2 Exp2Test.cpp) add_conformance_test(exp2f Exp2fTest.cpp) +add_conformance_test(exp2f16 Exp2f16Test.cpp) +add_conformance_test(expm1 Expm1Test.cpp) add_conformance_test(expm1f Expm1fTest.cpp) 
+add_conformance_test(expm1f16 Expm1f16Test.cpp) +add_conformance_test(hypot HypotTest.cpp) +add_conformance_test(hypotf HypotfTest.cpp) add_conformance_test(hypotf16 Hypotf16Test.cpp) +add_conformance_test(log LogTest.cpp) add_conformance_test(logf LogfTest.cpp) +add_conformance_test(logf16 Logf16Test.cpp) +add_conformance_test(log10 Log10Test.cpp) add_conformance_test(log10f Log10fTest.cpp) +add_conformance_test(log10f16 Log10f16Test.cpp) +add_conformance_test(log1p Log1pTest.cpp) add_conformance_test(log1pf Log1pfTest.cpp) +add_conformance_test(log2 Log2Test.cpp) add_conformance_test(log2f Log2fTest.cpp) +add_conformance_test(log2f16 Log2f16Test.cpp) +add_conformance_test(powf PowfTest.cpp) +add_conformance_test(sin SinTest.cpp) add_conformance_test(sinf SinfTest.cpp) +add_conformance_test(sinf16 Sinf16Test.cpp) +add_conformance_test(sincos SincosTest.cpp) add_conformance_test(sincosf SincosfTest.cpp) add_conformance_test(sinhf SinhfTest.cpp) +add_conformance_test(sinhf16 Sinhf16Test.cpp) add_conformance_test(sinpif SinpifTest.cpp) +add_conformance_test(sinpif16 Sinpif16Test.cpp) +add_conformance_test(tan TanTest.cpp) add_conformance_test(tanf TanfTest.cpp) +add_conformance_test(tanf16 Tanf16Test.cpp) add_conformance_test(tanhf TanhfTest.cpp) +add_conformance_test(tanhf16 Tanhf16Test.cpp) add_conformance_test(tanpif TanpifTest.cpp) +add_conformance_test(tanpif16 Tanpif16Test.cpp) diff --git a/offload/unittests/Conformance/tests/CbrtTest.cpp b/offload/unittests/Conformance/tests/CbrtTest.cpp new file mode 100644 index 0000000..3a6523b --- /dev/null +++ b/offload/unittests/Conformance/tests/CbrtTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the cbrt function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'cbrt' function to select the double version +constexpr auto cbrtd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(cbrt); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<cbrtd> { + static constexpr llvm::StringRef Name = "cbrt"; + static constexpr llvm::StringRef KernelName = "cbrtKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the cbrt function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<cbrtd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/CosTest.cpp b/offload/unittests/Conformance/tests/CosTest.cpp new file mode 100644 index 0000000..e3d3d3d --- /dev/null +++ b/offload/unittests/Conformance/tests/CosTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the cos function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'cos' function to select the double version +constexpr auto cosd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(cos); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<cosd> { + static constexpr llvm::StringRef Name = "cos"; + static constexpr llvm::StringRef KernelName = "cosKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the cos function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<cosd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Cosf16Test.cpp b/offload/unittests/Conformance/tests/Cosf16Test.cpp new file mode 100644 index 0000000..680e4b9 --- /dev/null +++ b/offload/unittests/Conformance/tests/Cosf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the cosf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 cosf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<cosf16> { + static constexpr llvm::StringRef Name = "cosf16"; + static constexpr llvm::StringRef KernelName = "cosf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the cosf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<cosf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Coshf16Test.cpp b/offload/unittests/Conformance/tests/Coshf16Test.cpp new file mode 100644 index 0000000..1b378b5 --- /dev/null +++ b/offload/unittests/Conformance/tests/Coshf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the coshf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 coshf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<coshf16> { + static constexpr llvm::StringRef Name = "coshf16"; + static constexpr llvm::StringRef KernelName = "coshf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the coshf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<coshf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Cospif16Test.cpp b/offload/unittests/Conformance/tests/Cospif16Test.cpp new file mode 100644 index 0000000..84aa682 --- /dev/null +++ b/offload/unittests/Conformance/tests/Cospif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the cospif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 cospif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<cospif16> { + static constexpr llvm::StringRef Name = "cospif16"; + static constexpr llvm::StringRef KernelName = "cospif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the cospif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<cospif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp10Test.cpp b/offload/unittests/Conformance/tests/Exp10Test.cpp new file mode 100644 index 0000000..05af478 --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp10Test.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp10 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp10' function to select the double version +constexpr auto exp10d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp10); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<exp10d> { + static constexpr llvm::StringRef Name = "exp10"; + static constexpr llvm::StringRef KernelName = "exp10Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp10 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp10d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp10f16Test.cpp b/offload/unittests/Conformance/tests/Exp10f16Test.cpp new file mode 100644 index 0000000..7d61ad0 --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp10f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp10f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 exp10f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<exp10f16> { + static constexpr llvm::StringRef Name = "exp10f16"; + static constexpr llvm::StringRef KernelName = "exp10f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the exp10f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp10f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp2Test.cpp b/offload/unittests/Conformance/tests/Exp2Test.cpp new file mode 100644 index 0000000..bb2fa10 --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp2Test.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp2 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp2' function to select the double version +constexpr auto exp2d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp2); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<exp2d> { + static constexpr llvm::StringRef Name = "exp2"; + static constexpr llvm::StringRef KernelName = "exp2Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp2 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<exp2d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Exp2f16Test.cpp b/offload/unittests/Conformance/tests/Exp2f16Test.cpp new file mode 100644 index 0000000..9ea9256 --- /dev/null +++ b/offload/unittests/Conformance/tests/Exp2f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp2f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 exp2f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<exp2f16> { + static constexpr llvm::StringRef Name = "exp2f16"; + static constexpr llvm::StringRef KernelName = "exp2f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp2f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<exp2f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/ExpTest.cpp b/offload/unittests/Conformance/tests/ExpTest.cpp new file mode 100644 index 0000000..9aa52b1 --- /dev/null +++ b/offload/unittests/Conformance/tests/ExpTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the exp function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'exp' function to select the double version +constexpr auto expd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(exp); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<expd> { + static constexpr llvm::StringRef Name = "exp"; + static constexpr llvm::StringRef KernelName = "expKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the exp function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<expd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expf16Test.cpp b/offload/unittests/Conformance/tests/Expf16Test.cpp new file mode 100644 index 0000000..8938815 --- /dev/null +++ b/offload/unittests/Conformance/tests/Expf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 expf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<expf16> { + static constexpr llvm::StringRef Name = "expf16"; + static constexpr llvm::StringRef KernelName = "expf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the expf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expm1Test.cpp b/offload/unittests/Conformance/tests/Expm1Test.cpp new file mode 100644 index 0000000..a27944b --- /dev/null +++ b/offload/unittests/Conformance/tests/Expm1Test.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expm1 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'expm1' function to select the double version +constexpr auto expm1d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(expm1); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<expm1d> { + static constexpr llvm::StringRef Name = "expm1"; + static constexpr llvm::StringRef KernelName = "expm1Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the expm1 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expm1d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Expm1f16Test.cpp b/offload/unittests/Conformance/tests/Expm1f16Test.cpp new file mode 100644 index 0000000..447196bb --- /dev/null +++ b/offload/unittests/Conformance/tests/Expm1f16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the expm1f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 expm1f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<expm1f16> { + static constexpr llvm::StringRef Name = "expm1f16"; + static constexpr llvm::StringRef KernelName = "expm1f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the expm1f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<expm1f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/HypotTest.cpp b/offload/unittests/Conformance/tests/HypotTest.cpp new file mode 100644 index 0000000..0417ad9 --- /dev/null +++ b/offload/unittests/Conformance/tests/HypotTest.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the hypot function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'hypot' function to select the double version +constexpr auto hypotd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double, double)>(hypot); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<hypotd> { + static constexpr llvm::StringRef Name = "hypot"; + static constexpr llvm::StringRef KernelName = "hypotKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the hypot function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> RangeX; + IndexedRange<double> RangeY; + RandomGenerator<double, double> Generator(SeedTy{Seed}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<hypotd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/HypotfTest.cpp b/offload/unittests/Conformance/tests/HypotfTest.cpp new file mode 100644 index 0000000..98a4e90 --- /dev/null +++ b/offload/unittests/Conformance/tests/HypotfTest.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the hypotf function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace mathtest { + +template <> struct FunctionConfig<hypotf> { + static constexpr llvm::StringRef Name = "hypotf"; + static constexpr llvm::StringRef KernelName = "hypotfKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the hypotf function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<float> RangeX; + IndexedRange<float> RangeY; + RandomGenerator<float, float> Generator(SeedTy{Seed}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<hypotf>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log10Test.cpp b/offload/unittests/Conformance/tests/Log10Test.cpp new file mode 100644 index 0000000..bf46f11 --- /dev/null +++ b/offload/unittests/Conformance/tests/Log10Test.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log10 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log10' function to select the double version +constexpr auto log10d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log10); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log10d> { + static constexpr llvm::StringRef Name = "log10"; + static constexpr llvm::StringRef KernelName = "log10Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log10 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log10d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log10f16Test.cpp b/offload/unittests/Conformance/tests/Log10f16Test.cpp new file mode 100644 index 0000000..605e1ae --- /dev/null +++ b/offload/unittests/Conformance/tests/Log10f16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log10f16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 log10f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<log10f16> { + static constexpr llvm::StringRef Name = "log10f16"; + static constexpr llvm::StringRef KernelName = "log10f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the log10f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log10f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log1pTest.cpp b/offload/unittests/Conformance/tests/Log1pTest.cpp new file mode 100644 index 0000000..023b67e --- /dev/null +++ b/offload/unittests/Conformance/tests/Log1pTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log1p function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log1p' function to select the double version +constexpr auto log1pd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log1p); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log1pd> { + static constexpr llvm::StringRef Name = "log1p"; + static constexpr llvm::StringRef KernelName = "log1pKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log1p function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/-1.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log1pd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log2Test.cpp b/offload/unittests/Conformance/tests/Log2Test.cpp new file mode 100644 index 0000000..2ae7e5c --- /dev/null +++ b/offload/unittests/Conformance/tests/Log2Test.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log2 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log2' function to select the double version +constexpr auto log2d // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log2); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<log2d> { + static constexpr llvm::StringRef Name = "log2"; + static constexpr llvm::StringRef KernelName = "log2Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log2 function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<log2d>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Log2f16Test.cpp b/offload/unittests/Conformance/tests/Log2f16Test.cpp new file mode 100644 index 0000000..5ce4696 --- /dev/null +++ b/offload/unittests/Conformance/tests/Log2f16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log2f16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 log2f16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<log2f16> { + static constexpr llvm::StringRef Name = "log2f16"; + static constexpr llvm::StringRef KernelName = "log2f16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log2f16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<log2f16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/LogTest.cpp b/offload/unittests/Conformance/tests/LogTest.cpp new file mode 100644 index 0000000..ae568e2 --- /dev/null +++ b/offload/unittests/Conformance/tests/LogTest.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the log function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <limits> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'log' function to select the double version +constexpr auto logd // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(log); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<logd> { + static constexpr llvm::StringRef Name = "log"; + static constexpr llvm::StringRef KernelName = "logKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 3; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the log function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range(/*Begin=*/0.0, + /*End=*/std::numeric_limits<double>::infinity(), + /*Inclusive=*/true); + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<logd>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Logf16Test.cpp b/offload/unittests/Conformance/tests/Logf16Test.cpp new file mode 100644 index 0000000..372dccb --- /dev/null +++ b/offload/unittests/Conformance/tests/Logf16Test.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the logf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/Numerics.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 logf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<logf16> { + static constexpr llvm::StringRef Name = "logf16"; + static constexpr llvm::StringRef KernelName = "logf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the logf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range(/*Begin=*/float16(0.0), + /*End=*/getMaxOrInf<float16>(), + /*Inclusive=*/true); + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<logf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/PowfTest.cpp b/offload/unittests/Conformance/tests/PowfTest.cpp new file mode 100644 index 0000000..246801e --- /dev/null +++ b/offload/unittests/Conformance/tests/PowfTest.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the powf function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +static inline float powfRoundedExponent(float Base, float Exponent) { + return powf(Base, roundf(Exponent)); +} + +namespace mathtest { + +template <> struct FunctionConfig<powf> { + static constexpr llvm::StringRef Name = "powf (real exponents)"; + static constexpr llvm::StringRef KernelName = "powfKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 16; +}; + +template <> struct FunctionConfig<powfRoundedExponent> { + static constexpr llvm::StringRef Name = "powf (integer exponents)"; + static constexpr llvm::StringRef KernelName = "powfRoundedExponentKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 65, Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 16; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the powf function"); + + using namespace mathtest; + + uint64_t Size = 1ULL << 32; + IndexedRange<float> RangeX; + IndexedRange<float> RangeY; + RandomGenerator<float, float> Generator0(SeedTy{42}, Size, RangeX, RangeY); + RandomGenerator<float, float> Generator1(SeedTy{51}, Size, RangeX, RangeY); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool RealExponentsPassed = + runTests<powf>(Generator0, Configs, DeviceBinaryDir, IsVerbose); + bool IntegerExponentsPassed = runTests<powfRoundedExponent>( + Generator1, Configs, DeviceBinaryDir, IsVerbose); + + return (RealExponentsPassed && IntegerExponentsPassed) ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/SinTest.cpp b/offload/unittests/Conformance/tests/SinTest.cpp new file mode 100644 index 0000000..36897d7 --- /dev/null +++ b/offload/unittests/Conformance/tests/SinTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sin function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'sin' function to select the double version +constexpr auto sind // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(sin); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<sind> { + static constexpr llvm::StringRef Name = "sin"; + static constexpr llvm::StringRef KernelName = "sinKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sin function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<sind>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/SincosTest.cpp b/offload/unittests/Conformance/tests/SincosTest.cpp new file mode 100644 index 0000000..a3d1650 --- /dev/null +++ b/offload/unittests/Conformance/tests/SincosTest.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sincos function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +static inline double sincosSin(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return SinX; +} + +static inline double sincosCos(double X) { + double SinX, CosX; + sincos(X, &SinX, &CosX); + return CosX; +} + +namespace mathtest { + +template <> struct FunctionConfig<sincosSin> { + static constexpr llvm::StringRef Name = "sincos (sin part)"; + static constexpr llvm::StringRef KernelName = "sincosSinKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; + +template <> struct FunctionConfig<sincosCos> { + static constexpr llvm::StringRef Name = "sincos (cos part)"; + static constexpr llvm::StringRef KernelName = "sincosCosKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 
7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 4; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sincos function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool SinPartPassed = + runTests<sincosSin>(Generator, Configs, DeviceBinaryDir, IsVerbose); + bool CosPartPassed = + runTests<sincosCos>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return (SinPartPassed && CosPartPassed) ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinf16Test.cpp b/offload/unittests/Conformance/tests/Sinf16Test.cpp new file mode 100644 index 0000000..4c5fb22 --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinf16> { + static constexpr llvm::StringRef Name = "sinf16"; + static constexpr llvm::StringRef KernelName = "sinf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sinf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinhf16Test.cpp b/offload/unittests/Conformance/tests/Sinhf16Test.cpp new file mode 100644 index 0000000..fe6f7dd --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinhf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinhf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinhf16> { + static constexpr llvm::StringRef Name = "sinhf16"; + static constexpr llvm::StringRef KernelName = "sinhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the sinhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Sinpif16Test.cpp b/offload/unittests/Conformance/tests/Sinpif16Test.cpp new file mode 100644 index 0000000..ff9c93c --- /dev/null +++ b/offload/unittests/Conformance/tests/Sinpif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the sinpif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 sinpif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<sinpif16> { + static constexpr llvm::StringRef Name = "sinpif16"; + static constexpr llvm::StringRef KernelName = "sinpif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the sinpif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<sinpif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/TanTest.cpp b/offload/unittests/Conformance/tests/TanTest.cpp new file mode 100644 index 0000000..3a9a058 --- /dev/null +++ b/offload/unittests/Conformance/tests/TanTest.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tan function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/RandomGenerator.hpp" +#include "mathtest/RandomState.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +namespace { + +// Disambiguate the overloaded 'tan' function to select the double version +constexpr auto tand // NOLINT(readability-identifier-naming) + = static_cast<double (*)(double)>(tan); +} // namespace + +namespace mathtest { + +template <> struct FunctionConfig<tand> { + static constexpr llvm::StringRef Name = "tan"; + static constexpr llvm::StringRef KernelName = "tanKernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 68, Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 5; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tan function"); + + using namespace mathtest; + + uint64_t Seed = 42; + uint64_t Size = 1ULL << 32; + IndexedRange<double> Range; + RandomGenerator<double> Generator(SeedTy{Seed}, Size, Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = runTests<tand>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanf16Test.cpp b/offload/unittests/Conformance/tests/Tanf16Test.cpp new file mode 100644 index 0000000..eae9818 --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanf16Test.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanf16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanf16> { + static constexpr llvm::StringRef Name = "tanf16"; + static constexpr llvm::StringRef KernelName = "tanf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + // Note: The minimum accuracy at the source is 2.5 ULP, but we round it + // down to ensure conformance. 
+ static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tanf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanhf16Test.cpp b/offload/unittests/Conformance/tests/Tanhf16Test.cpp new file mode 100644 index 0000000..1a11f3d --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanhf16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanhf16 function. 
+/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanhf16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanhf16> { + static constexpr llvm::StringRef Name = "tanhf16"; + static constexpr llvm::StringRef KernelName = "tanhf16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions(argc, argv, + "Conformance test of the tanhf16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/Conformance/tests/Tanpif16Test.cpp b/offload/unittests/Conformance/tests/Tanpif16Test.cpp new file mode 100644 index 0000000..7637480 --- /dev/null +++ b/offload/unittests/Conformance/tests/Tanpif16Test.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the conformance test of the tanpif16 function. +/// +//===----------------------------------------------------------------------===// + +#include "mathtest/CommandLineExtras.hpp" +#include "mathtest/ExhaustiveGenerator.hpp" +#include "mathtest/IndexedRange.hpp" +#include "mathtest/TestConfig.hpp" +#include "mathtest/TestRunner.hpp" +#include "mathtest/TypeExtras.hpp" + +#include "llvm/ADT/StringRef.h" + +#include <cstdlib> +#include <math.h> + +using namespace mathtest; + +extern "C" float16 tanpif16(float16); + +namespace mathtest { + +template <> struct FunctionConfig<tanpif16> { + static constexpr llvm::StringRef Name = "tanpif16"; + static constexpr llvm::StringRef KernelName = "tanpif16Kernel"; + + // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4, + // Table 69 (Full Profile), Khronos Registry [July 10, 2025]. + static constexpr uint64_t UlpTolerance = 2; +}; +} // namespace mathtest + +int main(int argc, const char **argv) { + llvm::cl::ParseCommandLineOptions( + argc, argv, "Conformance test of the tanpif16 function"); + + using namespace mathtest; + + IndexedRange<float16> Range; + ExhaustiveGenerator<float16> Generator(Range); + + const auto Configs = cl::getTestConfigs(); + const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR; + const bool IsVerbose = cl::IsVerbose; + + bool Passed = + runTests<tanpif16>(Generator, Configs, DeviceBinaryDir, IsVerbose); + + return Passed ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 8f0267e..b2d51442 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -20,10 +20,12 @@ add_offload_unittest("init" target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER) add_offload_unittest("kernel" + kernel/olCalculateOptimalOccupancy.cpp kernel/olLaunchKernel.cpp) add_offload_unittest("memory" memory/olMemAlloc.cpp + memory/olMemFill.cpp memory/olMemFree.cpp memory/olMemcpy.cpp) @@ -41,7 +43,8 @@ add_offload_unittest("queue" queue/olDestroyQueue.cpp queue/olGetQueueInfo.cpp queue/olGetQueueInfoSize.cpp - queue/olWaitEvents.cpp) + queue/olWaitEvents.cpp + queue/olLaunchHostFunction.cpp) add_offload_unittest("symbol" symbol/olGetSymbol.cpp diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp index ef092cd..8007713 100644 --- a/offload/unittests/OffloadAPI/common/Environment.cpp +++ b/offload/unittests/OffloadAPI/common/Environment.cpp @@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out, raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) { size_t Size; - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size); + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size); std::vector<char> Name(Size); - olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data()); + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()); Out << Name.data(); return Out; } diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp index 43240fa..0538e60f 100644 --- a/offload/unittests/OffloadAPI/common/Fixtures.hpp +++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp @@ -26,6 +26,20 @@ } while (0) #endif +#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED +#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL) \ + do { \ 
+ ol_result_t Res = ACTUAL; \ + if (Res && Res->Code == OL_ERRC_UNSUPPORTED) { \ + GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test"; \ + return; \ + } else if (Res && Res->Code != OL_ERRC_SUCCESS) { \ + GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": " \ + << Res->Details; \ + } \ + } while (0) +#endif + // TODO: rework this so the EXPECTED/ACTUAL results are readable #ifndef ASSERT_ERROR #define ASSERT_ERROR(EXPECTED, ACTUAL) \ @@ -75,6 +89,40 @@ template <typename Fn> inline void threadify(Fn body) { } } +/// Enqueues a task to the queue that can be manually resolved. +// It will block until `trigger` is called. +struct ManuallyTriggeredTask { + std::mutex M; + std::condition_variable CV; + bool Flag = false; + ol_event_handle_t CompleteEvent; + + ol_result_t enqueue(ol_queue_handle_t Queue) { + if (auto Err = olLaunchHostFunction( + Queue, + [](void *That) { + static_cast<ManuallyTriggeredTask *>(That)->wait(); + }, + this)) + return Err; + + return olCreateEvent(Queue, &CompleteEvent); + } + + void wait() { + std::unique_lock<std::mutex> lk(M); + CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; }); + EXPECT_TRUE(Flag); + } + + ol_result_t trigger() { + Flag = true; + CV.notify_one(); + + return olSyncEvent(CompleteEvent); + } +}; + struct OffloadTest : ::testing::Test { ol_device_handle_t Host = TestEnvironment::getHostDevice(); }; diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 5657320..8cb0b80 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -13,6 +13,38 @@ using olGetDeviceInfoTest = OffloadDeviceTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoTest); +#define OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Dev, \ + Expr) \ + TEST_P(olGetDeviceInfoTest, Test##Dev##TestName) { \ + PropType Value; \ + 
ASSERT_SUCCESS(olGetDeviceInfo(Dev, PropName, sizeof(Value), &Value)); \ + Expr; \ + } + +#define OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, {}) + +#define OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, {}) + +#define OL_DEVICE_INFO_TEST_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName) \ + OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName) + +#define OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, \ + LowBound) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, \ + ASSERT_GT(Value, LowBound)) + +#define OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, \ + LowBound) \ + OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, \ + ASSERT_GT(Value, LowBound)) + +#define OL_DEVICE_INFO_TEST_VALUE_GT(TestName, PropType, PropName, LowBound) \ + OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, LowBound) \ + OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, LowBound) + TEST_P(olGetDeviceInfoTest, SuccessType) { ol_device_type_t DeviceType; ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_TYPE, @@ -54,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessProductName) { + size_t Size = 0; + ASSERT_SUCCESS( + olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + +TEST_P(olGetDeviceInfoTest, HostProductName) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); + 
ASSERT_GT(Size, 0ul); + std::vector<char> Name; + Name.resize(Size); + ASSERT_SUCCESS( + olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data())); + ASSERT_EQ(std::strlen(Name.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); @@ -77,12 +132,8 @@ TEST_P(olGetDeviceInfoTest, SuccessDriverVersion) { ASSERT_EQ(std::strlen(DriverVersion.data()), Size - 1); } -TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSize) { - uint32_t Value; - ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(Value), &Value)); - ASSERT_GT(Value, 0u); -} +OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkGroupSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, 0); TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) { ol_dimensions_t Value{0, 0, 0}; @@ -94,6 +145,59 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) { ASSERT_GT(Value.z, 0u); } +OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_SIZE, 0); + +TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) { + ol_dimensions_t Value{0, 0, 0}; + ASSERT_SUCCESS(olGetDeviceInfo(Device, + OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, + sizeof(Value), &Value)); + ASSERT_GT(Value.x, 0u); + ASSERT_GT(Value.y, 0u); + ASSERT_GT(Value.z, 0u); +} + +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t, + OL_DEVICE_INFO_VENDOR_ID, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); +OL_DEVICE_INFO_TEST_VALUE_GT(NumComputeUnits, uint32_t, + OL_DEVICE_INFO_NUM_COMPUTE_UNITS, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(SingleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_SINGLE_FP_CONFIG, 0); +OL_DEVICE_INFO_TEST_SUCCESS(HalfFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_HALF_FP_CONFIG); +OL_DEVICE_INFO_TEST_VALUE_GT(DoubleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_DOUBLE_FP_CONFIG, 0); 
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthChar, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthShort, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthInt, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthLong, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthFloat, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthDouble, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, 0); +OL_DEVICE_INFO_TEST_SUCCESS(NativeVectorWidthHalf, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF); +OL_DEVICE_INFO_TEST_VALUE_GT(MaxClockFrequency, uint32_t, + OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(MemoryClockRate, uint32_t, + OL_DEVICE_INFO_MEMORY_CLOCK_RATE, 0); +OL_DEVICE_INFO_TEST_VALUE_GT(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS, + 0); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE); + TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) { ol_device_type_t DeviceType; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index 4e29978..c4a3c2d 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -13,48 +13,76 @@ using olGetDeviceInfoSizeTest = OffloadDeviceTest; 
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoSizeTest); -TEST_P(olGetDeviceInfoSizeTest, SuccessType) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_TYPE, &Size)); - ASSERT_EQ(Size, sizeof(ol_device_type_t)); -} +#define OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, Expr) \ + TEST_P(olGetDeviceInfoSizeTest, Success##TestName) { \ + size_t Size = 0; \ + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, PropName, &Size)); \ + Expr; \ + } -TEST_P(olGetDeviceInfoSizeTest, SuccessPlatform) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PLATFORM, &Size)); - ASSERT_EQ(Size, sizeof(ol_platform_handle_t)); -} +#define OL_DEVICE_INFO_SIZE_TEST_EQ(TestName, PropType, PropName) \ + OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, \ + ASSERT_EQ(Size, sizeof(PropType))); -TEST_P(olGetDeviceInfoSizeTest, SuccessName) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size)); - ASSERT_NE(Size, 0ul); -} - -TEST_P(olGetDeviceInfoSizeTest, SuccessVendor) { - size_t Size = 0; - ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); - ASSERT_NE(Size, 0ul); -} +#define OL_DEVICE_INFO_SIZE_TEST_NONZERO(TestName, PropName) \ + OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, ASSERT_NE(Size, 0ul)); -TEST_P(olGetDeviceInfoSizeTest, SuccessDriverVersion) { - size_t Size = 0; - ASSERT_SUCCESS( - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_DRIVER_VERSION, &Size)); - ASSERT_NE(Size, 0ul); -} +OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE); +OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, + OL_DEVICE_INFO_PLATFORM); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION); 
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t, + OL_DEVICE_INFO_MAX_WORK_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID); +OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t, + OL_DEVICE_INFO_NUM_COMPUTE_UNITS); +OL_DEVICE_INFO_SIZE_TEST_EQ(SingleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_SINGLE_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(HalfFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_HALF_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(DoubleFPConfig, ol_device_fp_capability_flags_t, + OL_DEVICE_INFO_DOUBLE_FP_CONFIG); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthChar, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthShort, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthInt, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthLong, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthFloat, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthDouble, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE); +OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthHalf, uint32_t, + OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxClockFrequency, uint32_t, + OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY); +OL_DEVICE_INFO_SIZE_TEST_EQ(MemoryClockRate, uint32_t, + OL_DEVICE_INFO_MEMORY_CLOCK_RATE); +OL_DEVICE_INFO_SIZE_TEST_EQ(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS); +OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t, + OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t, + OL_DEVICE_INFO_GLOBAL_MEM_SIZE); -TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSize) { +TEST_P(olGetDeviceInfoSizeTest, 
SuccessMaxWorkGroupSizePerDimension) { size_t Size = 0; - ASSERT_SUCCESS( - olGetDeviceInfoSize(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, &Size)); - ASSERT_EQ(Size, sizeof(uint32_t)); + ASSERT_SUCCESS(olGetDeviceInfoSize( + Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size)); + ASSERT_EQ(Size, sizeof(ol_dimensions_t)); + ASSERT_EQ(Size, sizeof(uint32_t) * 3); } -TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { +TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize( - Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size)); + Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &Size)); ASSERT_EQ(Size, sizeof(ol_dimensions_t)); ASSERT_EQ(Size, sizeof(uint32_t) * 3); } diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp index 908d2dc..b86d15f 100644 --- a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp +++ b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp @@ -13,13 +13,22 @@ using olGetEventInfoTest = OffloadEventTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventInfoTest); -TEST_P(olGetEventInfoTest, SuccessDevice) { +TEST_P(olGetEventInfoTest, SuccessQueue) { ol_queue_handle_t RetrievedQueue; ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_QUEUE, sizeof(ol_queue_handle_t), &RetrievedQueue)); ASSERT_EQ(Queue, RetrievedQueue); } +TEST_P(olGetEventInfoTest, SuccessIsComplete) { + bool Complete = false; + while (!Complete) { + ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_IS_COMPLETE, + sizeof(Complete), &Complete)); + } + ASSERT_EQ(Complete, true); +} + TEST_P(olGetEventInfoTest, InvalidNullHandle) { ol_queue_handle_t RetrievedQueue; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp index d7dee58..36f36c3 100644 --- 
a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp +++ b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp @@ -19,6 +19,12 @@ TEST_P(olGetEventInfoSizeTest, SuccessQueue) { ASSERT_EQ(Size, sizeof(ol_queue_handle_t)); } +TEST_P(olGetEventInfoSizeTest, SuccessIsComplete) { + size_t Size = 0; + ASSERT_SUCCESS(olGetEventInfoSize(Event, OL_EVENT_INFO_IS_COMPLETE, &Size)); + ASSERT_EQ(Size, sizeof(bool)); +} + TEST_P(olGetEventInfoSizeTest, InvalidNullHandle) { size_t Size = 0; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, diff --git a/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp new file mode 100644 index 0000000..17fa383 --- /dev/null +++ b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp @@ -0,0 +1,45 @@ +//===------- Offload API tests - olCalculateOptimalOccupancy --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +using olCalculateOptimalOccupancyTest = OffloadKernelTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest); + +TEST_P(olCalculateOptimalOccupancyTest, Success) { + size_t Size{0}; + ASSERT_SUCCESS_OR_UNSUPPORTED( + olCalculateOptimalOccupancy(Device, Kernel, 0, &Size)); + ASSERT_GT(Size, 0u); +} + +TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) { + size_t Size{0}; + ASSERT_SUCCESS_OR_UNSUPPORTED( + olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size)); + ASSERT_GT(Size, 0u); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullKernel) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olCalculateOptimalOccupancy(Device, nullptr, 0, &Size)); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullDevice) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size)); +} + +TEST_P(olCalculateOptimalOccupancyTest, NullOutput) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr)); +} diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp new file mode 100644 index 0000000..a84ed3d78 --- /dev/null +++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp @@ -0,0 +1,193 @@ +//===------- Offload API tests - olMemFill --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +struct olMemFillTest : OffloadQueueTest { + template <typename PatternTy, PatternTy PatternVal, size_t Size, + bool Block = false> + void test_body() { + ManuallyTriggeredTask Manual; + + // Block/enqueue tests ensure that the test has been enqueued to a queue + // (rather than being done synchronously if the queue happens to be empty) + if constexpr (Block) { + ASSERT_SUCCESS(Manual.enqueue(Queue)); + } + + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + PatternTy Pattern = PatternVal; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + if constexpr (Block) { + ASSERT_SUCCESS(Manual.trigger()); + } + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternTy *AllocPtr = reinterpret_cast<PatternTy *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); + } +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest); + +TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success8NotMultiple4) { + test_body<uint8_t, 0x42, 1023>(); +} +TEST_P(olMemFillTest, Success8Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) { + test_body<uint8_t, 0x42, 1023, true>(); +} + +TEST_P(olMemFillTest, Success16) { test_body<uint8_t, 0x42, 1024>(); } +TEST_P(olMemFillTest, Success16NotMultiple4) { + test_body<uint16_t, 0x4243, 1022>(); +} +TEST_P(olMemFillTest, Success16Enqueue) { + test_body<uint8_t, 0x42, 1024, true>(); +} +TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) { + test_body<uint16_t, 0x4243, 1022, true>(); +} + +TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); } 
+TEST_P(olMemFillTest, Success32Enqueue) { + test_body<uint32_t, 0xDEADBEEF, 1024, true>(); +} + +TEST_P(olMemFillTest, SuccessLarge) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeEnqueue) { + constexpr size_t Size = 1024; + void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + Manual.trigger(); + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeByteAligned) { + constexpr size_t Size = 17 * 64; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct __attribute__((packed)) PatternT { + uint64_t A; + uint64_t B; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, 
UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) { + constexpr size_t Size = 17 * 64; + void *Alloc; + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct __attribute__((packed)) PatternT { + uint64_t A; + uint64_t B; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + Manual.trigger(); + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, InvalidPatternSize) { + constexpr size_t Size = 1025; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint16_t Pattern = 0x4242; + ASSERT_ERROR(OL_ERRC_INVALID_SIZE, + olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + olMemFree(Alloc); +} diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp index 0dc8527..aa9e372 100644 --- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp +++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp @@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) { Queue = nullptr; } +TEST_P(olDestroyQueueTest, SuccessDelayedResolution) { + ManuallyTriggeredTask Manual; + ASSERT_SUCCESS(Manual.enqueue(Queue)); + ASSERT_SUCCESS(olDestroyQueue(Queue)); + Queue = nullptr; + + ASSERT_SUCCESS(Manual.trigger()); +} + TEST_P(olDestroyQueueTest, InvalidNullHandle) { ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr)); } diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp 
b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp new file mode 100644 index 0000000..aa86750 --- /dev/null +++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp @@ -0,0 +1,107 @@ +//===------- Offload API tests - olLaunchHostFunction ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> +#include <thread> + +struct olLaunchHostFunctionTest : OffloadQueueTest {}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionTest); + +struct olLaunchHostFunctionKernelTest : OffloadKernelTest {}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionKernelTest); + +TEST_P(olLaunchHostFunctionTest, Success) { + ASSERT_SUCCESS(olLaunchHostFunction(Queue, [](void *) {}, nullptr)); +} + +TEST_P(olLaunchHostFunctionTest, SuccessSequence) { + uint32_t Buff[16] = {1, 1}; + + for (auto BuffPtr = &Buff[2]; BuffPtr != &Buff[16]; BuffPtr++) { + ASSERT_SUCCESS(olLaunchHostFunction( + Queue, + [](void *BuffPtr) { + uint32_t *AsU32 = reinterpret_cast<uint32_t *>(BuffPtr); + AsU32[0] = AsU32[-1] + AsU32[-2]; + }, + BuffPtr)); + } + + ASSERT_SUCCESS(olSyncQueue(Queue)); + + for (uint32_t i = 2; i < 16; i++) { + ASSERT_EQ(Buff[i], Buff[i - 1] + Buff[i - 2]); + } +} + +TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) { + // Verify that a host kernel can block execution - A host task is created that + // only resolves when Block is set to false. 
+ ol_kernel_launch_size_args_t LaunchArgs; + LaunchArgs.Dimensions = 1; + LaunchArgs.GroupSize = {64, 1, 1}; + LaunchArgs.NumGroups = {1, 1, 1}; + LaunchArgs.DynSharedMemory = 0; + + ol_queue_handle_t Queue; + ASSERT_SUCCESS(olCreateQueue(Device, &Queue)); + + void *Mem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem)); + + uint32_t *Data = (uint32_t *)Mem; + for (uint32_t i = 0; i < 64; i++) { + Data[i] = 0; + } + + volatile bool Block = true; + ASSERT_SUCCESS(olLaunchHostFunction( + Queue, + [](void *Ptr) { + volatile bool *Block = + reinterpret_cast<volatile bool *>(reinterpret_cast<bool *>(Ptr)); + + while (*Block) + std::this_thread::yield(); + }, + const_cast<bool *>(&Block))); + + struct { + void *Mem; + } Args{Mem}; + ASSERT_SUCCESS( + olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs)); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + for (uint32_t i = 0; i < 64; i++) { + ASSERT_EQ(Data[i], 0); + } + + Block = false; + ASSERT_SUCCESS(olSyncQueue(Queue)); + + for (uint32_t i = 0; i < 64; i++) { + ASSERT_EQ(Data[i], i); + } + + ASSERT_SUCCESS(olDestroyQueue(Queue)); + ASSERT_SUCCESS(olMemFree(Mem)); +} + +TEST_P(olLaunchHostFunctionTest, InvalidNullCallback) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olLaunchHostFunction(Queue, nullptr, nullptr)); +} + +TEST_P(olLaunchHostFunctionTest, InvalidNullQueue) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olLaunchHostFunction(nullptr, [](void *) {}, nullptr)); +} |