[libc] Support global constructors and destructors on NVPTX

This patch adds the necessary hacks to support global constructors and destructors. This is an incredibly hacky process caused by the primary fact that Nvidia does not provide any binary tools and very little linker support. We first had to emit references to these functions and their priority in D149451. Then we dig them out of the module once it's loaded to manually create the list that the linker should have made for us. This patch also contains a few Nvidia specific hacks, but it passes the test, albeit with a stack size warning from `ptxas` for the callback. But this should be fine given the resource usage of a common test. This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't cause problems with our CUDA buildbot. Depends on D149451 Reviewed By: tra Differential Revision: https://reviews.llvm.org/D149527
author: Joseph Huber <jhuber6@vols.utk.edu> 2023-04-28 09:33:44 -0500
committer: Joseph Huber <jhuber6@vols.utk.edu> 2023-05-04 07:13:00 -0500
commit: 2e1c0ec6297958f73ca5ed35ce47803ea0f48dba (patch)
tree: 5ecac614701cb678d6c7623dd9d32b705c7af286 /libc/startup
parent: f05ce9045af4a40232c08451cb0aef64b0e673b2 (diff)
download: llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.zip
llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.gz
llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.bz2
2 files changed, 72 insertions, 8 deletions
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index b8a9f49..49fa489 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -6,6 +6,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 7b88e30..fe09666 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -8,6 +8,8 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
@@ -15,21 +17,79 @@ namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implememtation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    // FIXME: The function pointer escaping this TU causes warnings.
+    __llvm_libc::atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registred with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
@@ -37,7 +97,9 @@ void init_rpc(void *in, void *out, void *buffer) {
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
author	Joseph Huber <jhuber6@vols.utk.edu>	2023-04-28 09:33:44 -0500
committer	Joseph Huber <jhuber6@vols.utk.edu>	2023-05-04 07:13:00 -0500
commit	2e1c0ec6297958f73ca5ed35ce47803ea0f48dba (patch)
tree	5ecac614701cb678d6c7623dd9d32b705c7af286 /libc/startup
parent	f05ce9045af4a40232c08451cb0aef64b0e673b2 (diff)
download	llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.zip llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.gz llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.bz2