Merge branch 'users/chapuni/cov/single/pair' into users/chapuni/cov/single/nextcount-base (branch: users/chapuni/cov/single/nextcount-base)

author: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-09 17:16:04 +0900
committer: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-09 17:16:04 +0900
commit: 0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (patch)
tree: 6a77b463f700e090df586672c26b9fe765fd115b /offload/DeviceRTL/src/Reduction.cpp
parent: ec6892d1c979ce0b84c86918d5cdbb03037b409a (diff)
parent: 6d16b1c5c468a79ecf867293023c89ac518ecdda (diff)
download: llvm-users/chapuni/cov/single/nextcount-base.zip llvm-users/chapuni/cov/single/nextcount-base.tar.gz llvm-users/chapuni/cov/single/nextcount-base.tar.bz2
1 files changed, 46 insertions, 45 deletions
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d..d3b4528 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif
 
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
 
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    //
+    // This reduce function handles reduction within a team. It handles
+    // parallel regions in both L1 and L2 parallelism levels. It also
+    // supports Generic, SPMD, and NoOMP modes.
+    //
+    // 1. Reduce within a warp.
+    // 2. Warp master copies value to warp 0 via shared memory.
+    // 3. Warp 0 reduces to a single value.
+    // 4. The reduced value is available in the thread that returns 1.
+    //
 
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
   }
-  return BlockThreadId == 0;
-#else
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
author	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-09 17:16:04 +0900
committer	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-09 17:16:04 +0900
commit	0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (patch)
tree	6a77b463f700e090df586672c26b9fe765fd115b /offload/DeviceRTL/src/Reduction.cpp
parent	ec6892d1c979ce0b84c86918d5cdbb03037b409a (diff)
parent	6d16b1c5c468a79ecf867293023c89ac518ecdda (diff)
download	llvm-users/chapuni/cov/single/nextcount-base.zip llvm-users/chapuni/cov/single/nextcount-base.tar.gz llvm-users/chapuni/cov/single/nextcount-base.tar.bz2