aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Joseph Huber <jhuber6@vols.utk.edu> 2021-07-21 12:48:39 -0400
committer Huber, Joseph <huberjn@ornl.gov> 2021-07-21 20:57:28 -0400
commit 7d576392644d44a765336f6ebefd45f5db61231e (patch)
tree 74807a5cf7fdcbf1f37b3278dd0dc28f43283d2e
parent 754eb1c210d865234292df7a983636e2ff49e2c8 (diff)
download llvm-7d576392644d44a765336f6ebefd45f5db61231e.zip
llvm-7d576392644d44a765336f6ebefd45f5db61231e.tar.gz
llvm-7d576392644d44a765336f6ebefd45f5db61231e.tar.bz2
[OpenMP] Add new execution mode for SPMD execution with Generic semantics
Qualified kernels can be transformed from generic-mode to SPMD-mode using an optimization in OpenMPOpt. This patch introduces a new execution mode to indicate kernels that have been transformed from generic-mode to SPMD-mode. These kernels have SPMD-mode execution, but need generic-mode semantics for scheduling the blocks and threads. Without this, far too few blocks will be scheduled for a generic region, as SPMD mode expects the trip count to be divided by the number of threads.
Reviewed By: ggeorgakoudis
Differential Revision: https://reviews.llvm.org/D106460
-rw-r--r-- llvm/lib/Transforms/IPO/OpenMPOpt.cpp 8
-rw-r--r-- llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll 2
-rw-r--r-- llvm/test/Transforms/OpenMP/spmdization.ll 2
-rw-r--r-- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 16
-rw-r--r-- openmp/libomptarget/plugins/cuda/src/rtl.cpp 24
5 files changed, 37 insertions, 15 deletions
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index c2c2129..44aa249 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2886,8 +2886,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
assert(ExecMode->getInitializer() &&
ExecMode->getInitializer()->isOneValue() &&
"Initially non-SPMD kernel has SPMD exec mode!");
- ExecMode->setInitializer(
- ConstantInt::get(ExecMode->getInitializer()->getType(), 0));
+
+ // Set the global exec mode flag to indicate SPMD-Generic mode.
+ constexpr int SPMDGeneric = 2;
+ if (!ExecMode->getInitializer()->isZeroValue())
+ ExecMode->setInitializer(
+ ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
// Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
const int InitIsSPMDArgNo = 1;
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
index 9ff9d68..9144cd3 100644
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64"
;.
; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
-; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 7dedbc8..c19ecbf 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -32,7 +32,7 @@ target triple = "nvptx64"
;.
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"
;.
define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 {
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 2190136..843e2a1 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -127,9 +127,10 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
- SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
- GENERIC, // everything else
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ SPMD_GENERIC, // Generic kernel with SPMD execution
NONE
};
@@ -240,6 +241,7 @@ struct KernelTy {
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
+ // 2 - SPMD mode execution with Generic mode semantics.
int8_t ExecutionMode;
int16_t ConstWGSize;
int32_t device_id;
@@ -1730,7 +1732,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
DP("After loading global for %s ExecMode = %d\n", ExecModeName,
ExecModeVal);
- if (ExecModeVal < 0 || ExecModeVal > 1) {
+ if (ExecModeVal < 0 || ExecModeVal > 2) {
DP("Error wrong exec_mode value specified in HSA code object file: "
"%d\n",
ExecModeVal);
@@ -1965,7 +1967,11 @@ launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
if (ExecutionMode == SPMD) {
// round up to the nearest integer
num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
- } else {
+ } else if (ExecutionMode == GENERIC) {
+ num_groups = loop_tripcount;
+ } else if (ExecutionMode == SPMD_GENERIC) {
+ // This is a generic kernel that was transformed to use SPMD-mode
+ // execution but uses Generic-mode semantics for scheduling.
num_groups = loop_tripcount;
}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 7b04bc9..fff33fe 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -70,9 +70,10 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
- SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
- GENERIC, // everything else
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ SPMD_GENERIC, // Generic kernel with SPMD execution
NONE
};
@@ -83,6 +84,7 @@ struct KernelTy {
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
+ // 2 - SPMD mode execution with Generic mode semantics.
int8_t ExecutionMode;
/// Maximal number of threads per block for this kernel.
@@ -796,7 +798,7 @@ public:
return nullptr;
}
- if (ExecModeVal < 0 || ExecModeVal > 1) {
+ if (ExecModeVal < 0 || ExecModeVal > 2) {
DP("Error wrong exec_mode value specified in cubin file: %d\n",
ExecModeVal);
return nullptr;
@@ -1045,7 +1047,7 @@ public:
// will execute one iteration of the loop. round up to the nearest
// integer
CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
- } else {
+ } else if (KernelInfo->ExecutionMode == GENERIC) {
// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:
@@ -1059,6 +1061,14 @@ public:
// Threads within a team will execute the iterations of the `parallel`
// loop.
CudaBlocksPerGrid = LoopTripCount;
+ } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
+ // If we reach this point, then we are executing a kernel that was
+ // transformed from Generic-mode to SPMD-mode. This kernel has
+ // SPMD-mode execution, but needs its blocks to be scheduled
+ // differently because the current loop trip count only applies to the
+ // `teams distribute` region and will create far too few blocks using
+ // the regular SPMD-mode method.
+ CudaBlocksPerGrid = LoopTripCount;
}
DP("Using %d teams due to loop trip count %" PRIu32
" and number of threads per block %d\n",
@@ -1083,7 +1093,9 @@ public:
? getOffloadEntry(DeviceId, TgtEntryPtr)->name
: "(null)",
CudaBlocksPerGrid, CudaThreadsPerBlock,
- (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
+ (KernelInfo->ExecutionMode != SPMD
+ ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic")
+ : "SPMD"));
CUstream Stream = getStream(DeviceId, AsyncInfo);
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,