[libc] Add support for global ctors / dtors for AMDGPU

This patch makes the necessary changes to support calling global constructors and destructors on the GPU. The patch in D149340 allows the `lld` linker to create the symbols pointing us to these globals. These should be executed by a single thread, which is more difficult on the GPU because all threads are active. I chose to use an atomic counter to sync every thread on the GPU. This is very slow if you use more than a few thousand threads, but for testing purposes it should be sufficient.

Depends on D149340, D149363

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D149398
author: Joseph Huber <jhuber6@vols.utk.edu> 2023-04-27 18:29:10 -0500
committer: Joseph Huber <jhuber6@vols.utk.edu> 2023-04-29 08:40:20 -0500
commit: 1b823abea74d5a43c4778a252f7d2d3a9a5768c2 (patch)
tree: 9cc4e6609700a566afd23945cd9d3c57caa2259a /libc/startup
parent: a1da7461571cf1763136e22a018a20a271bb70b9 (diff)
download: llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.zip
llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.gz
llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.bz2
2 files changed, 65 insertions, 8 deletions
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index a9f33af..0f1d4ed 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -5,6 +5,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index e8b5029..ab83ea5 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -8,6 +8,8 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
@@ -15,21 +17,72 @@ namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" uintptr_t __init_array_start[];
+extern "C" uintptr_t __init_array_end[];
+extern "C" uintptr_t __fini_array_start[];
+extern "C" uintptr_t __fini_array_end[];
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
@@ -37,7 +90,9 @@ void init_rpc(void *in, void *out, void *buffer) {
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
author	Joseph Huber <jhuber6@vols.utk.edu>	2023-04-27 18:29:10 -0500
committer	Joseph Huber <jhuber6@vols.utk.edu>	2023-04-29 08:40:20 -0500
commit	1b823abea74d5a43c4778a252f7d2d3a9a5768c2 (patch)
tree	9cc4e6609700a566afd23945cd9d3c57caa2259a /libc/startup
parent	a1da7461571cf1763136e22a018a20a271bb70b9 (diff)
download	llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.zip llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.gz llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.bz2