aboutsummaryrefslogtreecommitdiff
path: root/openmp
diff options
context:
space:
mode:
authorGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2019-01-09 18:30:14 +0000
committerGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2019-01-09 18:30:14 +0000
commit1653633a1c5af6bb9667997bba0a9bc0fcbb1346 (patch)
treebe4ee2608c529c20a326a02b6d66be58c8def44a /openmp
parentedb54b22d3c00174a013cfd0615fe964e9829330 (diff)
downloadllvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.zip
llvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.tar.gz
llvm-1653633a1c5af6bb9667997bba0a9bc0fcbb1346.tar.bz2
[OpenMP][libomptarget] Use shared memory variable for tracking parallel level
Summary: Replace existing infrastructure for tracking parallel level using global memory with a per-team shared memory variable. This minimizes the impact of the overhead of tracking the parallel level for non-nested cases. Reviewers: ABataev, caomhin Reviewed By: ABataev Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D55773 llvm-svn: 350747
Diffstat (limited to 'openmp')
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu2
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu9
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu24
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h36
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu13
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h6
6 files changed, 21 insertions, 69 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 9abe599..63bf6b4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
- return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ return parallelLevel;
}
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index 35f94ac..0700577 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -27,22 +27,17 @@ __device__
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
-__device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext,
- OMP_STATE_COUNT>
- omptarget_nvptx_device_simpleState[MAX_SM];
-
__device__ omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
+__device__ __shared__ uint8_t parallelLevel;
+
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
- *omptarget_nvptx_simpleThreadPrivateContext;
-
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index 2a3d49c56..7034d02 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -21,10 +21,6 @@ extern __device__
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
-extern __device__ omptarget_nvptx_Queue<
- omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
- omptarget_nvptx_device_simpleState[MAX_SM];
-
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
@@ -100,14 +96,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
- int slot = smid() % MAX_SM;
- usedSlotIdx = slot;
- omptarget_nvptx_simpleThreadPrivateContext =
- omptarget_nvptx_device_simpleState[slot].Dequeue();
+ parallelLevel = 0;
+ usedSlotIdx = smid() % MAX_SM;
}
- // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
- omptarget_nvptx_simpleThreadPrivateContext->Init();
return;
}
setExecutionParameters(Spmd, RuntimeInitialized);
@@ -172,18 +164,12 @@ EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
+ if (!RequiresOMPRuntime)
+ return;
+
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
int threadId = GetThreadIdInBlock();
- if (!RequiresOMPRuntime) {
- if (threadId == 0) {
- // Enqueue omp state object for use by another team.
- int slot = usedSlotIdx;
- omptarget_nvptx_device_simpleState[slot].Enqueue(
- omptarget_nvptx_simpleThreadPrivateContext);
- }
- return;
- }
if (threadId == 0) {
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index cb6c0b7..d23010e 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -391,39 +391,6 @@ public:
INLINE const void *Acquire(const void *buf, size_t size);
};
-class omptarget_nvptx_SimpleThreadPrivateContext {
- uint16_t par_level[MAX_THREADS_PER_TEAM];
-
-public:
- INLINE void Init() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- par_level[GetThreadIdInBlock()] = 0;
- }
- INLINE void IncParLevel() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- ++par_level[GetThreadIdInBlock()];
- }
- INLINE void DecParLevel() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- ASSERT0(LT_FUSSY, par_level[GetThreadIdInBlock()] > 0,
- "Expected parallel level >0.");
- --par_level[GetThreadIdInBlock()];
- }
- INLINE bool InL2OrHigherParallelRegion() const {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- return par_level[GetThreadIdInBlock()] > 0;
- }
- INLINE uint16_t GetParallelLevel() const {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- return par_level[GetThreadIdInBlock()] + 1;
- }
-};
-
////////////////////////////////////////////////////////////////////////////////
// global device envrionment
////////////////////////////////////////////////////////////////////////////////
@@ -440,10 +407,9 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
+extern __device__ __shared__ uint8_t parallelLevel;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
- *omptarget_nvptx_simpleThreadPrivateContext;
extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index c5edd31..8de8f59 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -340,7 +340,11 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- omptarget_nvptx_simpleThreadPrivateContext->IncParLevel();
+ __SYNCTHREADS();
+ if (GetThreadIdInBlock() == 0)
+ ++parallelLevel;
+ __SYNCTHREADS();
+
return;
}
@@ -379,7 +383,10 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- omptarget_nvptx_simpleThreadPrivateContext->DecParLevel();
+ __SYNCTHREADS();
+ if (GetThreadIdInBlock() == 0)
+ --parallelLevel;
+ __SYNCTHREADS();
return;
}
@@ -401,7 +408,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ return parallelLevel;
}
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ece3295..b8f661c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (omptarget_nvptx_simpleThreadPrivateContext
- ->InL2OrHigherParallelRegion())
+ if (parallelLevel > 0)
rc = 0;
else
rc = GetThreadIdInBlock();
@@ -177,8 +176,7 @@ INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (omptarget_nvptx_simpleThreadPrivateContext
- ->InL2OrHigherParallelRegion())
+ if (parallelLevel > 0)
rc = 1;
else
rc = GetNumberOfThreadsInBlock();