author    Dominik Adamski <dominik.adamski@amd.com>  2023-11-29 14:25:57 +0100
committer GitHub <noreply@github.com>                2023-11-29 14:25:57 +0100
commit    d4d88b8499b66e0904e5c545613a14d170b3ea62
tree      9a42bd08ac04baeee80b4c82ca6e8d2aa7cfa8bb
parent    3930a0b57a9a558409a7368f52f905029112e3c2
[OpenMP] New OpenMP device RTL functions (#73225)
Add a new implementation of the workshare loop functions. These functions will be used by OpenMPIRBuilder to handle OpenMP workshare loops in target regions.

---------

Co-authored-by: Johannes Doerfert <johannes@jdoerfert.de>
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def   30
-rw-r--r--  openmp/libomptarget/DeviceRTL/src/Workshare.cpp  254
2 files changed, 284 insertions, 0 deletions
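As a rough illustration only (not part of the patch), the sketch below shows how compiler-generated code might drive one of the new entry points for a lowered `omp for` loop. The outlined-body shape, the payload struct, and the trip-count convention are assumptions for illustration; the device-side wrappers added in Workshare.cpp below forward `num_iters + 1` to the chunker, so the sketch passes the trip count minus one.

```cpp
// Hypothetical caller sketch for the new __kmpc_for_static_loop_4 entry point,
// lowering `for (int i = 0; i < n; ++i) a[i] = i;`. Names and the exact
// calling convention are illustrative assumptions, not the patch's codegen.
#include <cstdint>

struct IdentTy; // opaque source-location descriptor used by the runtime

extern "C" void __kmpc_for_static_loop_4(IdentTy *loc,
                                         void (*fn)(int32_t, void *),
                                         void *arg, int32_t num_iters,
                                         int32_t num_threads,
                                         int32_t thread_chunk);

struct LoopArgs { int *a; }; // captured variables packed by the compiler

// Outlined body: receives the normalized induction variable and the payload.
static void loop_body(int32_t iv, void *raw) {
  auto *args = static_cast<LoopArgs *>(raw);
  args->a[iv] = iv;
}

void run_lowered_loop(IdentTy *loc, int *a, int32_t n, int32_t num_threads) {
  LoopArgs args{a};
  // The wrapper in Workshare.cpp adds one to num_iters before calling the
  // chunker, so the trip count is passed here as n - 1 (an assumption about
  // the intended convention at this point in the patch).
  __kmpc_for_static_loop_4(loc, &loop_body, &args, n - 1, num_threads,
                           /*thread_chunk=*/0);
}
```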
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d2..d22d2a8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -650,6 +662,24 @@ __OMP_RTL_ATTRS(__kmpc_cancel, InaccessibleArgOnlyAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
ParamAttrs(ReadOnlyPtrAttrs, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ SExt, SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ZExt, ZExt, ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ZExt, ZExt, ZExt))
__OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
ParamAttrs(AttributeSet(), SExt))
__OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc..b587b85 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
#pragma omp begin declare target device_type(nohost)
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);
@@ -636,4 +639,255 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user-specified chunk sizes. This implicitly picks a block
+  /// chunk size equal to the number of threads in the block and a thread
+  /// chunk size equal to one. In contrast to the chunked version, we can get
+  /// away with a single loop in this case.
+ static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumBlocks, Ty BId, Ty NumThreads,
+ Ty TId, Ty NumIters,
+ bool OneIterationPerThread) {
+ Ty KernelIteration = NumBlocks * NumThreads;
+
+ // Start index in the normalized space.
+ Ty IV = BId * NumThreads + TId;
+ ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might
+    // allow us to simplify this loop to a conditional.
+ if (IV < NumIters) {
+ do {
+
+ // Execute the loop body.
+ LoopBody(IV, Arg);
+
+      // Every thread has executed one block and thread chunk now.
+ IV += KernelIteration;
+
+ if (OneIterationPerThread)
+ return;
+
+ } while (IV < NumIters);
+ }
+ }
+
+ /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user-specified chunk sizes (for at least one of them).
+ static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+ Ty BlockChunk, Ty NumBlocks, Ty BId,
+ Ty ThreadChunk, Ty NumThreads, Ty TId,
+ Ty NumIters,
+ bool OneIterationPerThread) {
+ Ty KernelIteration = NumBlocks * BlockChunk;
+
+ // Start index in the chunked space.
+ Ty IV = BId * BlockChunk + TId;
+ ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might
+    // allow us to simplify this loop to a conditional.
+ do {
+
+ Ty BlockChunkLeft =
+ BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+ Ty ThreadChunkLeft =
+ ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+ while (ThreadChunkLeft--) {
+
+        // Given the blocking, it's hard to keep track of what to execute.
+ if (IV >= NumIters)
+ return;
+
+ // Execute the loop body.
+ LoopBody(IV, Arg);
+
+ if (OneIterationPerThread)
+ return;
+
+ ++IV;
+ }
+
+ IV += KernelIteration;
+
+ } while (IV < NumIters);
+ }
+
+public:
+ /// Worksharing `for`-loop.
+ static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but we don't know if we are in a
+    // parallel region at all, or if the user used a `num_threads` clause on
+    // the parallel and reduced the number of threads compared to the block
+    // size. Since nested parallels are possible too, we need to get the
+    // thread id from the `omp` getter and not from the mapping directly.
+ Ty TId = omp_get_thread_num();
+
+ // There are no blocks involved here.
+ Ty BlockChunk = 0;
+ Ty NumBlocks = 1;
+ Ty BId = 0;
+
+ // If the thread chunk is not specified we pick a default now.
+ if (ThreadChunk == 0)
+ ThreadChunk = 1;
+
+    // If we know we have more threads than iterations, we can indicate that
+    // to avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_threads_oversubscription) {
+ ASSERT(NumThreads >= NumIters, "Broken assumption");
+ OneIterationPerThread = true;
+ }
+
+ if (ThreadChunk != 1)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+ }
+
+  /// Worksharing `distribute`-loop.
+ static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+ Ty NumIters, Ty BlockChunk) {
+ ASSERT(icv::Level == 0, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(BlockChunk >= 0, "Bad block count");
+
+ // There are no threads involved here.
+ Ty ThreadChunk = 0;
+ Ty NumThreads = 1;
+ Ty TId = 0;
+ ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+ // All teams need to participate.
+ Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+ Ty BId = mapping::getBlockIdInKernel();
+
+ // If the block chunk is not specified we pick a default now.
+ if (BlockChunk == 0)
+ BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations, we can indicate that
+    // to avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_teams_oversubscription) {
+ ASSERT(NumBlocks >= NumIters, "Broken assumption");
+ OneIterationPerThread = true;
+ }
+
+ if (BlockChunk != NumThreads)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+
+ ASSERT(icv::Level == 0, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+ }
+
+  /// Worksharing `distribute parallel for`-loop.
+ static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+ void *Arg, Ty NumIters, Ty NumThreads,
+ Ty BlockChunk, Ty ThreadChunk) {
+ ASSERT(icv::Level == 1, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+ ASSERT(NumIters >= 0, "Bad iteration count");
+ ASSERT(BlockChunk >= 0, "Bad block count");
+ ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number of threads
+    // compared to the block size.
+ Ty TId = mapping::getThreadIdInBlock();
+
+ // All teams need to participate.
+ Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+ Ty BId = mapping::getBlockIdInKernel();
+
+ // If the block chunk is not specified we pick a default now.
+ if (BlockChunk == 0)
+ BlockChunk = NumThreads;
+
+ // If the thread chunk is not specified we pick a default now.
+ if (ThreadChunk == 0)
+ ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations,
+    // we can indicate that to avoid an outer loop.
+ bool OneIterationPerThread = false;
+ if (__omp_rtl_assume_teams_oversubscription &
+ __omp_rtl_assume_threads_oversubscription) {
+ OneIterationPerThread = true;
+ ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+ }
+
+ if (BlockChunk != NumThreads || ThreadChunk != 1)
+ NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+ ThreadChunk, NumThreads, TId, NumIters,
+ OneIterationPerThread);
+ else
+ NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+ NumIters, OneIterationPerThread);
+
+ ASSERT(icv::Level == 1, "Bad distribute");
+ ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+ ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+ }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY) \
+ [[gnu::flatten, clang::always_inline]] void \
+ __kmpc_distribute_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY block_chunk, TY thread_chunk) { \
+ ompx::StaticLoopChunker<TY>::DistributeFor( \
+ loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
+ } \
+ [[gnu::flatten, clang::always_inline]] void \
+ __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
+ void *arg, TY num_iters, \
+ TY block_chunk) { \
+ ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1, \
+ block_chunk); \
+ } \
+ [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
+ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+ TY num_threads, TY thread_chunk) { \
+ ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+ thread_chunk); \
+ }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
#pragma omp end declare target
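
For intuition only (not part of the patch): the unchunked loop nest above is effectively a grid-stride loop over the normalized iteration space. The small host-side sketch below replays that assignment for a toy configuration; the block/thread counts and the printing are illustrative assumptions.

```cpp
// Host-side replay of NormalizedLoopNestNoChunk's iteration assignment:
// thread TId of block BId starts at IV = BId * NumThreads + TId and then
// strides by NumBlocks * NumThreads until NumIters is reached.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t NumBlocks = 2, NumThreads = 4, NumIters = 13; // toy sizes
  const int64_t KernelIteration = NumBlocks * NumThreads;

  for (int64_t BId = 0; BId < NumBlocks; ++BId)
    for (int64_t TId = 0; TId < NumThreads; ++TId) {
      std::printf("block %lld thread %lld:", (long long)BId, (long long)TId);
      for (int64_t IV = BId * NumThreads + TId; IV < NumIters;
           IV += KernelIteration)
        std::printf(" %lld", (long long)IV);
      std::printf("\n");
    }
  return 0;
}
```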