diff options
author | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 17:16:04 +0900 |
---|---|---|
committer | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 17:16:04 +0900 |
commit | 0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (patch) | |
tree | 6a77b463f700e090df586672c26b9fe765fd115b /offload/DeviceRTL/src/Reduction.cpp | |
parent | ec6892d1c979ce0b84c86918d5cdbb03037b409a (diff) | |
parent | 6d16b1c5c468a79ecf867293023c89ac518ecdda (diff) | |
download | llvm-users/chapuni/cov/single/nextcount-base.zip llvm-users/chapuni/cov/single/nextcount-base.tar.gz llvm-users/chapuni/cov/single/nextcount-base.tar.bz2 |
Merge branch 'users/chapuni/cov/single/pair' into users/chapuni/cov/single/nextcount-base [users/chapuni/cov/single/nextcount-base]
Diffstat (limited to 'offload/DeviceRTL/src/Reduction.cpp')
-rw-r--r-- | offload/DeviceRTL/src/Reduction.cpp | 91 |
1 file changed, 46 insertions, 45 deletions
```diff
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d..d3b4528 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif
 
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
 
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+  //
+  // This reduce function handles reduction within a team. It handles
+  // parallel regions in both L1 and L2 parallelism levels. It also
+  // supports Generic, SPMD, and NoOMP modes.
+  //
+  // 1. Reduce within a warp.
+  // 2. Warp master copies value to warp 0 via shared memory.
+  // 3. Warp 0 reduces to a single value.
+  // 4. The reduced value is available in the thread that returns 1.
+  //
 
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
   }
-  return BlockThreadId == 0;
-#else
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
```