diff options
author | Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> | 2019-01-09 18:30:14 +0000 |
---|---|---|
committer | Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> | 2019-01-09 18:30:14 +0000 |
commit | 1653633a1c5af6bb9667997bba0a9bc0fcbb1346 (patch) | |
tree | be4ee2608c529c20a326a02b6d66be58c8def44a /openmp | |
parent | edb54b22d3c00174a013cfd0615fe964e9829330 (diff) | |
download | llvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.zip llvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.tar.gz llvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.tar.bz2 |
[OpenMP][libomptarget] Use shared memory variable for tracking parallel level
Summary: Replace existing infrastructure for tracking parallel level using global memory with a per-team shared memory variable. This minimizes the impact of the overhead of tracking the parallel level for non-nested cases.
Reviewers: ABataev, caomhin
Reviewed By: ABataev
Subscribers: guansong, openmp-commits
Differential Revision: https://reviews.llvm.org/D55773
llvm-svn: 350747
Diffstat (limited to 'openmp')
6 files changed, 21 insertions, 69 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 9abe599..63bf6b4 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode only with uninitialized runtime."); - return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); + return parallelLevel; } int level = 0; omptarget_nvptx_TaskDescr *currTaskDescr = diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu index 35f94ac..0700577 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -27,22 +27,17 @@ __device__ omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> omptarget_nvptx_device_State[MAX_SM]; -__device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext, - OMP_STATE_COUNT> - omptarget_nvptx_device_simpleState[MAX_SM]; - __device__ omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; +__device__ __shared__ uint8_t parallelLevel; + // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; -__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext - *omptarget_nvptx_simpleThreadPrivateContext; - //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 2a3d49c56..7034d02 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -21,10 +21,6 @@ extern __device__ omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> omptarget_nvptx_device_State[MAX_SM]; -extern __device__ omptarget_nvptx_Queue< - omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT> - omptarget_nvptx_device_simpleState[MAX_SM]; - //////////////////////////////////////////////////////////////////////////////// // init entry points //////////////////////////////////////////////////////////////////////////////// @@ -100,14 +96,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Spmd, RuntimeUninitialized); if (GetThreadIdInBlock() == 0) { - int slot = smid() % MAX_SM; - usedSlotIdx = slot; - omptarget_nvptx_simpleThreadPrivateContext = - omptarget_nvptx_device_simpleState[slot].Dequeue(); + parallelLevel = 0; + usedSlotIdx = smid() % MAX_SM; } - // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); - omptarget_nvptx_simpleThreadPrivateContext->Init(); return; } setExecutionParameters(Spmd, RuntimeInitialized); @@ -172,18 +164,12 @@ EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. + if (!RequiresOMPRuntime) + return; + // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); int threadId = GetThreadIdInBlock(); - if (!RequiresOMPRuntime) { - if (threadId == 0) { - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_simpleState[slot].Enqueue( - omptarget_nvptx_simpleThreadPrivateContext); - } - return; - } if (threadId == 0) { // Enqueue omp state object for use by another team. int slot = usedSlotIdx; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index cb6c0b7..d23010e 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -391,39 +391,6 @@ public: INLINE const void *Acquire(const void *buf, size_t size); }; -class omptarget_nvptx_SimpleThreadPrivateContext { - uint16_t par_level[MAX_THREADS_PER_TEAM]; - -public: - INLINE void Init() { - ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), - "Expected SPMD + uninitialized runtime modes."); - par_level[GetThreadIdInBlock()] = 0; - } - INLINE void IncParLevel() { - ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), - "Expected SPMD + uninitialized runtime modes."); - ++par_level[GetThreadIdInBlock()]; - } - INLINE void DecParLevel() { - ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), - "Expected SPMD + uninitialized runtime modes."); - ASSERT0(LT_FUSSY, par_level[GetThreadIdInBlock()] > 0, - "Expected parallel level >0."); - --par_level[GetThreadIdInBlock()]; - } - INLINE bool InL2OrHigherParallelRegion() const { - ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), - "Expected SPMD + uninitialized runtime modes."); - return par_level[GetThreadIdInBlock()] > 0; - } - INLINE uint16_t GetParallelLevel() const { - ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(), - "Expected SPMD + uninitialized runtime modes."); - return par_level[GetThreadIdInBlock()] + 1; - } -}; - //////////////////////////////////////////////////////////////////////////////// // global device envrionment //////////////////////////////////////////////////////////////////////////////// @@ -440,10 +407,9 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; +extern __device__ __shared__ uint8_t parallelLevel; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; -extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext - *omptarget_nvptx_simpleThreadPrivateContext; extern __device__ __shared__ uint32_t execution_param; extern __device__ __shared__ void *ReductionScratchpadPtr; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index c5edd31..8de8f59 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -340,7 +340,11 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - omptarget_nvptx_simpleThreadPrivateContext->IncParLevel(); + __SYNCTHREADS(); + if (GetThreadIdInBlock() == 0) + ++parallelLevel; + __SYNCTHREADS(); + return; } @@ -379,7 +383,10 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - omptarget_nvptx_simpleThreadPrivateContext->DecParLevel(); + __SYNCTHREADS(); + if (GetThreadIdInBlock() == 0) + --parallelLevel; + __SYNCTHREADS(); return; } @@ -401,7 +408,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); + return parallelLevel; } int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h index ece3295..b8f661c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -155,8 +155,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (omptarget_nvptx_simpleThreadPrivateContext - ->InL2OrHigherParallelRegion()) + if (parallelLevel > 0) rc = 0; else rc = GetThreadIdInBlock(); @@ -177,8 +176,7 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (omptarget_nvptx_simpleThreadPrivateContext - ->InL2OrHigherParallelRegion()) + if (parallelLevel > 0) rc = 1; else rc = GetNumberOfThreadsInBlock(); |