aboutsummaryrefslogtreecommitdiff
path: root/libc/startup
diff options
context:
space:
mode:
authorJoseph Huber <jhuber6@vols.utk.edu>2023-05-01 08:17:39 -0500
committerJoseph Huber <jhuber6@vols.utk.edu>2023-05-04 19:31:41 -0500
commit901266dad313c114e12c181651249e30e5902e26 (patch)
treef58bb03eeae716e0df8ee5416d97e0946bb1b7ad /libc/startup
parent507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff (diff)
downloadllvm-901266dad313c114e12c181651249e30e5902e26.zip
llvm-901266dad313c114e12c181651249e30e5902e26.tar.gz
llvm-901266dad313c114e12c181651249e30e5902e26.tar.bz2
[libc] Change GPU startup and loader to use multiple kernels
The GPU has a different execution model to standard `_start` implementations. On the GPU, all threads are active at the start of a kernel. In order to correctly initialize and call the constructors we want single threaded semantics. Previously, this was done using a makeshift global barrier with atomics. However, it should be easier to simply put the portions of the code that must be single threaded in separate kernels and then call those with only one thread. Generally, mixing global state between kernel launches makes optimizations more difficult, similarly to calling a function outside of the TU, but for testing it is better to be correct. Depends on D149527 D148943 Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D149581
Diffstat (limited to 'libc/startup')
-rw-r--r--libc/startup/gpu/amdgpu/start.cpp76
-rw-r--r--libc/startup/gpu/nvptx/start.cpp78
2 files changed, 47 insertions, 107 deletions
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index b28ad79..d1dfc7b 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -17,8 +17,6 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> count = 0;
-
extern "C" uintptr_t __init_array_start[];
extern "C" uintptr_t __init_array_end[];
extern "C" uintptr_t __fini_array_start[];
@@ -27,10 +25,6 @@ extern "C" uintptr_t __fini_array_end[];
using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);
-static uint64_t get_grid_size() {
- return gpu::get_num_threads() * gpu::get_num_blocks();
-}
-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
@@ -43,56 +37,34 @@ static void call_fini_array_callbacks() {
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}
-void initialize(int argc, char **argv, char **env, void *in, void *out,
- void *buffer) {
- // We need a single GPU thread to perform the initialization of the global
- // constructors and data. We simply mask off all but a single thread and
- // execute.
- count.fetch_add(1, cpp::MemoryOrder::RELAXED);
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // We need to set up the RPC client first in case any of the constructors
- // require it.
- rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer);
-
- // We want the fini array callbacks to be run after other atexit
- // callbacks are run. So, we register them before running the init
- // array callbacks as they can potentially register their own atexit
- // callbacks.
- atexit(&call_fini_array_callbacks);
- call_init_array_callbacks(argc, argv, env);
- }
-
- // We wait until every single thread launched on the GPU has seen the
- // initialization code. This will get very, very slow for high thread counts,
- // but for testing purposes it is unlikely to matter.
- while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
- rpc::sleep_briefly();
- gpu::sync_threads();
-}
-
-void finalize(int retval) {
- // We wait until every single thread launched on the GPU has finished
- // executing and reached the finalize region.
- count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
- while (count.load(cpp::MemoryOrder::RELAXED) != 0)
- rpc::sleep_briefly();
- gpu::sync_threads();
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- __llvm_libc::exit(retval);
- }
-}
-
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
- void *buffer) {
- __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
+_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
+ __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(),
+ &__llvm_libc::lock, in, out, buffer);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks);
+ __llvm_libc::call_init_array_callbacks(argc, argv, env);
+}
+extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+_start(int argc, char **argv, char **envp, int *ret) {
+ // Invoke the 'main' function with every active thread that the user launched
+ // the _start kernel with.
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+}
- __llvm_libc::finalize(*ret);
+extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+_end(int retval) {
+ // Only a single thread should call `exit` here, the rest should gracefully
+ // return from the kernel. This is so only one thread calls the destructors
+ // registered with 'atexit' above.
+ __llvm_libc::exit(retval);
}
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 9ed7559..83453ae 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -17,8 +17,6 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> count = 0;
-
extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
// to manually create them and update the globals in the loader implememtation.
@@ -31,10 +29,6 @@ uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);
-static uint64_t get_grid_size() {
- return gpu::get_num_threads() * gpu::get_num_blocks();
-}
-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
@@ -47,59 +41,33 @@ static void call_fini_array_callbacks() {
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}
-// TODO: Put this in a separate kernel and call it with one thread.
-void initialize(int argc, char **argv, char **env, void *in, void *out,
- void *buffer) {
- // We need a single GPU thread to perform the initialization of the global
- // constructors and data. We simply mask off all but a single thread and
- // execute.
- count.fetch_add(1, cpp::MemoryOrder::RELAXED);
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // We need to set up the RPC client first in case any of the constructors
- // require it.
- rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer);
-
- // We want the fini array callbacks to be run after other atexit
- // callbacks are run. So, we register them before running the init
- // array callbacks as they can potentially register their own atexit
- // callbacks.
- // FIXME: The function pointer escaping this TU causes warnings.
- __llvm_libc::atexit(&call_fini_array_callbacks);
- call_init_array_callbacks(argc, argv, env);
- }
-
- // We wait until every single thread launched on the GPU has seen the
- // initialization code. This will get very, very slow for high thread counts,
- // but for testing purposes it is unlikely to matter.
- while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
- rpc::sleep_briefly();
- gpu::sync_threads();
-}
-
-// TODO: Put this in a separate kernel and call it with one thread.
-void finalize(int retval) {
- // We wait until every single thread launched on the GPU has finished
- // executing and reached the finalize region.
- count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
- while (count.load(cpp::MemoryOrder::RELAXED) != 0)
- rpc::sleep_briefly();
- gpu::sync_threads();
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- __llvm_libc::exit(retval);
- }
-}
-
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
- void *buffer) {
- __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
+_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
+ __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(),
+ &__llvm_libc::lock, in, out, buffer);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks);
+ __llvm_libc::call_init_array_callbacks(argc, argv, env);
+}
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_start(int argc, char **argv, char **envp, int *ret) {
+ // Invoke the 'main' function with every active thread that the user launched
+ // the _start kernel with.
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+}
- __llvm_libc::finalize(*ret);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_end(int retval) {
+ // To finish the execution we invoke all the callbacks registered via 'atexit'
+ // and then exit with the appropriate return value.
+ __llvm_libc::exit(retval);
}