author    NAKAMURA Takumi <geek4civic@gmail.com>  2025-01-09 17:16:04 +0900
committer NAKAMURA Takumi <geek4civic@gmail.com>  2025-01-09 17:16:04 +0900
commit    0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (patch)
tree      6a77b463f700e090df586672c26b9fe765fd115b /offload/DeviceRTL/src/Reduction.cpp
parent    ec6892d1c979ce0b84c86918d5cdbb03037b409a (diff)
parent    6d16b1c5c468a79ecf867293023c89ac518ecdda (diff)
Merge branch 'users/chapuni/cov/single/pair' into users/chapuni/cov/single/nextcount-base
Diffstat (limited to 'offload/DeviceRTL/src/Reduction.cpp')
 -rw-r--r-- offload/DeviceRTL/src/Reduction.cpp | 91 ++++++++++++++++----------------
 1 file changed, 46 insertions(+), 45 deletions(-)
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d..d3b4528 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);
}
-#endif
static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
uint32_t NumThreads = omp_get_num_threads();
if (NumThreads == 1)
return 1;
- /*
- * This reduce function handles reduction within a team. It handles
- * parallel regions in both L1 and L2 parallelism levels. It also
- * supports Generic, SPMD, and NoOMP modes.
- *
- * 1. Reduce within a warp.
- * 2. Warp master copies value to warp 0 via shared memory.
- * 3. Warp 0 reduces to a single value.
- * 4. The reduced value is available in the thread that returns 1.
- */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t WarpsNeeded =
- (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- uint32_t WarpId = mapping::getWarpIdInBlock();
-
- // Volta execution model:
- // For the Generic execution mode a parallel region either has 1 thread and
- // beyond that, always a multiple of 32. For the SPMD execution mode we may
- // have any number of threads.
- if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/NumThreads % mapping::getWarpSize(),
- /*LaneId=*/mapping::getThreadIdInBlock() %
- mapping::getWarpSize());
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > mapping::getWarpSize()) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
+ //
+ // This reduce function handles reduction within a team. It handles
+ // parallel regions in both L1 and L2 parallelism levels. It also
+ // supports Generic, SPMD, and NoOMP modes.
+ //
+ // 1. Reduce within a warp.
+ // 2. Warp master copies value to warp 0 via shared memory.
+ // 3. Warp 0 reduces to a single value.
+ // 4. The reduced value is available in the thread that returns 1.
+ //
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
+#if __has_builtin(__nvvm_reflect)
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+ uint32_t WarpsNeeded =
+ (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ uint32_t WarpId = mapping::getWarpIdInBlock();
+
+ // Volta execution model:
+ // For the Generic execution mode a parallel region either has 1 thread and
+ // beyond that, always a multiple of 32. For the SPMD execution mode we may
+ // have any number of threads.
+ if ((NumThreads % mapping::getWarpSize() == 0) ||
+ (WarpId < WarpsNeeded - 1))
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+ gpu_irregular_warp_reduce(
+ reduce_data, shflFct,
+ /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+ /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+ // When we have more than [mapping::getWarpSize()] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+ if (NumThreads > mapping::getWarpSize()) {
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
+ }
+ return BlockThreadId == 0;
}
- return BlockThreadId == 0;
-#else
+#endif
__kmpc_impl_lanemask_t Liveness = mapping::activemask();
if (Liveness == lanes::All) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}
- // Get the OMP thread Id. This is different from BlockThreadId in the case of
- // an L2 parallel region.
+ // Get the OMP thread Id. This is different from BlockThreadId in the case
+ // of an L2 parallel region.
return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
}
uint32_t roundToWarpsize(uint32_t s) {
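
The substantive change above replaces the compile-time #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 guards with a query through __nvvm_reflect("__CUDA_ARCH"), protected by __has_builtin. The NVVMReflect pass folds that call to a constant (e.g. 700 for sm_70) once the DeviceRTL bitcode is finalized for a concrete target, so a single bitcode library can serve every architecture while the untaken branch is still removed entirely. A minimal sketch of the pattern, assuming a Clang/NVPTX toolchain that provides the __nvvm_reflect builtin (the function name pickReductionPath is illustrative, not from the patch):

// Hypothetical sketch, not part of this commit: the same
// compile-once, specialize-late dispatch the hunk above adopts.
__device__ int pickReductionPath() {
#if __has_builtin(__nvvm_reflect)
  // Folded to a per-target constant by the NVVMReflect pass, so
  // the comparison and the dead branch vanish in the final code.
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    return 1; // Volta+ path (independent thread scheduling)
#endif
  return 0; // pre-Volta / generic fallback path
}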
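For context, step 1 of the algorithm in the restored comment block ("Reduce within a warp") is conventionally built on warp shuffles; the DeviceRTL routes this through its ShuffleReductFnTy callback, but the shape of the operation is the same. A minimal sketch, assuming a 32-lane warp and CUDA's __shfl_down_sync intrinsic (warpReduceSum is an illustrative name, not part of this file):

// Illustrative warp-level sum reduction: each iteration halves the
// stride, pulling a partial sum from the lane `offset` positions
// higher; after log2(32) = 5 steps, lane 0 holds the warp-wide sum.
__device__ int warpReduceSum(int val) {
  for (int offset = 32 / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val; // meaningful only in lane 0
}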