aboutsummaryrefslogtreecommitdiff
path: root/libc/startup
diff options
context:
space:
mode:
authorJoseph Huber <jhuber6@vols.utk.edu>2023-04-27 18:29:10 -0500
committerJoseph Huber <jhuber6@vols.utk.edu>2023-04-29 08:40:20 -0500
commit1b823abea74d5a43c4778a252f7d2d3a9a5768c2 (patch)
tree9cc4e6609700a566afd23945cd9d3c57caa2259a /libc/startup
parenta1da7461571cf1763136e22a018a20a271bb70b9 (diff)
downloadllvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.zip
llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.gz
llvm-1b823abea74d5a43c4778a252f7d2d3a9a5768c2.tar.bz2
[libc] Add support for global ctors / dtors for AMDGPU
This patch makes the necessary changes to support calling global constructors and destructors on the GPU. The patch in D149340 allows the `lld` linker to create the symbols pointing us to these globals. These should be executed by a single thread, which is more difficult on the GPU because all threads are active. I chose to use an atomic counter to sync every thread on the GPU. This is very slow if you use more than a few thousand threads, but for testing purposes it should be sufficient. Depends on D149340 D149363 Reviewed By: sivachandra Differential Revision: https://reviews.llvm.org/D149398
Diffstat (limited to 'libc/startup')
-rw-r--r--libc/startup/gpu/amdgpu/CMakeLists.txt2
-rw-r--r--libc/startup/gpu/amdgpu/start.cpp71
2 files changed, 65 insertions, 8 deletions
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index a9f33af..0f1d4ed 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -5,6 +5,8 @@ add_startup_object(
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
+ libc.src.stdlib.exit
+ libc.src.stdlib.atexit
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index e8b5029..ab83ea5 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -8,6 +8,8 @@
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
@@ -15,21 +17,72 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
-void init_rpc(void *in, void *out, void *buffer) {
- // Only a single thread should update the RPC data.
+extern "C" uintptr_t __init_array_start[];
+extern "C" uintptr_t __init_array_end[];
+extern "C" uintptr_t __fini_array_start[];
+extern "C" uintptr_t __fini_array_end[];
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+ return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+ size_t init_array_size = __init_array_end - __init_array_start;
+ for (size_t i = 0; i < init_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+ size_t fini_array_size = __fini_array_end - __fini_array_start;
+ for (size_t i = 0; i < fini_array_size; ++i)
+ reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+ void *buffer) {
+ // We need a single GPU thread to perform the initialization of the global
+ // constructors and data. We simply mask off all but a single thread and
+ // execute.
+ count.fetch_add(1, cpp::MemoryOrder::RELAXED);
if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
rpc::client.reset(&lock, in, out, buffer);
- init.store(1, cpp::MemoryOrder::RELAXED);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ atexit(&call_fini_array_callbacks);
+ call_init_array_callbacks(argc, argv, env);
}
- // Wait until the previous thread signals that the data has been written.
- while (!init.load(cpp::MemoryOrder::RELAXED))
+ // We wait until every single thread launched on the GPU has seen the
+ // initialization code. This will get very, very slow for high thread counts,
+ // but for testing purposes it is unlikely to matter.
+ while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
rpc::sleep_briefly();
+ gpu::sync_threads();
+}
- // Wait for the threads in the block to converge and fence the write.
+void finalize(int retval) {
+ // We wait until every single thread launched on the GPU has finished
+ // executing and reached the finalize region.
+ count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+ while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+ rpc::sleep_briefly();
gpu::sync_threads();
+ if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+ // Only a single thread should call `exit` here; the rest should gracefully
+ // return from the kernel. This is so only one thread calls the destructors
+ // registered with 'atexit' above.
+ __llvm_libc::exit(retval);
+ }
}
} // namespace __llvm_libc
@@ -37,7 +90,9 @@ void init_rpc(void *in, void *out, void *buffer) {
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
void *buffer) {
- __llvm_libc::init_rpc(in, out, buffer);
+ __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+ __llvm_libc::finalize(*ret);
}