diff options
author | Joseph Huber <jhuber6@vols.utk.edu> | 2023-04-28 09:33:44 -0500 |
---|---|---|
committer | Joseph Huber <jhuber6@vols.utk.edu> | 2023-05-04 07:13:00 -0500 |
commit | 2e1c0ec6297958f73ca5ed35ce47803ea0f48dba (patch) | |
tree | 5ecac614701cb678d6c7623dd9d32b705c7af286 /libc/startup | |
parent | f05ce9045af4a40232c08451cb0aef64b0e673b2 (diff) | |
download | llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.zip llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.gz llvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.bz2 |
[libc] Support global constructors and destructors on NVPTX
This patch adds the necessary hacks to support global constructors and
destructors. This is an incredibly hacky process, caused primarily by the
fact that Nvidia provides no binary tools and very little linker
support. We first had to emit references to these functions and
their priority in D149451. Then we dig them out of the module once it's
loaded to manually create the list that the linker should have made for
us. This patch also contains a few Nvidia-specific hacks, but it passes
the test, albeit with a stack size warning from `ptxas` for the
callback. This should be fine given the resource usage of a common
test.
This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't
cause problems with our CUDA buildbot.
Depends on D149451
Reviewed By: tra
Differential Revision: https://reviews.llvm.org/D149527
Diffstat (limited to 'libc/startup')
-rw-r--r-- | libc/startup/gpu/nvptx/CMakeLists.txt | 2 | ||||
-rw-r--r-- | libc/startup/gpu/nvptx/start.cpp | 78 |
2 files changed, 72 insertions, 8 deletions
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index b8a9f49..49fa489 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -6,6 +6,8 @@ add_startup_object( DEPENDS libc.src.__support.RPC.rpc_client libc.src.__support.GPU.utils + libc.src.stdlib.exit + libc.src.stdlib.atexit COMPILE_OPTIONS -ffreestanding # To avoid compiler warnings about calling the main function. -fno-builtin diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index 7b88e30..fe09666 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -8,6 +8,8 @@ #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" +#include "src/stdlib/atexit.h" +#include "src/stdlib/exit.h" extern "C" int main(int argc, char **argv, char **envp); @@ -15,21 +17,79 @@ namespace __llvm_libc { static cpp::Atomic<uint32_t> lock = 0; -static cpp::Atomic<uint32_t> init = 0; +static cpp::Atomic<uint32_t> count = 0; -void init_rpc(void *in, void *out, void *buffer) { - // Only a single thread should update the RPC data. +extern "C" { +// Nvidia's 'nvlink' linker does not provide these symbols. We instead need +// to manually create them and update the globals in the loader implementation. 
+uintptr_t *__init_array_start [[gnu::visibility("protected")]]; +uintptr_t *__init_array_end [[gnu::visibility("protected")]]; +uintptr_t *__fini_array_start [[gnu::visibility("protected")]]; +uintptr_t *__fini_array_end [[gnu::visibility("protected")]]; +} + +using InitCallback = void(int, char **, char **); +using FiniCallback = void(void); + +static uint64_t get_grid_size() { + return gpu::get_num_threads() * gpu::get_num_blocks(); +} + +static void call_init_array_callbacks(int argc, char **argv, char **env) { + size_t init_array_size = __init_array_end - __init_array_start; + for (size_t i = 0; i < init_array_size; ++i) + reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env); +} + +static void call_fini_array_callbacks() { + size_t fini_array_size = __fini_array_end - __fini_array_start; + for (size_t i = 0; i < fini_array_size; ++i) + reinterpret_cast<FiniCallback *>(__fini_array_start[i])(); +} + +// TODO: Put this in a separate kernel and call it with one thread. +void initialize(int argc, char **argv, char **env, void *in, void *out, + void *buffer) { + // We need a single GPU thread to perform the initialization of the global + // constructors and data. We simply mask off all but a single thread and + // execute. + count.fetch_add(1, cpp::MemoryOrder::RELAXED); if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) { + // We need to set up the RPC client first in case any of the constructors + // require it. rpc::client.reset(&lock, in, out, buffer); - init.store(1, cpp::MemoryOrder::RELAXED); + + // We want the fini array callbacks to be run after other atexit + // callbacks are run. So, we register them before running the init + // array callbacks as they can potentially register their own atexit + // callbacks. + // FIXME: The function pointer escaping this TU causes warnings. 
+ __llvm_libc::atexit(&call_fini_array_callbacks); + call_init_array_callbacks(argc, argv, env); } - // Wait until the previous thread signals that the data has been written. - while (!init.load(cpp::MemoryOrder::RELAXED)) + // We wait until every single thread launched on the GPU has seen the + // initialization code. This will get very, very slow for high thread counts, + // but for testing purposes it is unlikely to matter. + while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size()) rpc::sleep_briefly(); + gpu::sync_threads(); +} - // Wait for the threads in the block to converge and fence the write. +// TODO: Put this in a separate kernel and call it with one thread. +void finalize(int retval) { + // We wait until every single thread launched on the GPU has finished + // executing and reached the finalize region. + count.fetch_sub(1, cpp::MemoryOrder::RELAXED); + while (count.load(cpp::MemoryOrder::RELAXED) != 0) + rpc::sleep_briefly(); gpu::sync_threads(); + if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) { + // Only a single thread should call `exit` here, the rest should gracefully + // return from the kernel. This is so only one thread calls the destructors + // registered with 'atexit' above. + __llvm_libc::exit(retval); + } } } // namespace __llvm_libc @@ -37,7 +97,9 @@ void init_rpc(void *in, void *out, void *buffer) { extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _start(int argc, char **argv, char **envp, int *ret, void *in, void *out, void *buffer) { - __llvm_libc::init_rpc(in, out, buffer); + __llvm_libc::initialize(argc, argv, envp, in, out, buffer); __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED); + + __llvm_libc::finalize(*ret); } |