From 979fb95021c81024c7684e0572b6899cf5a28445 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Wed, 19 Jul 2023 09:25:34 -0500
Subject: Revert "[libc] Treat the locks array as a bitfield"

Summary:
This caused test failures on the gfx90a buildbot. This works on my
gfx1030 and the Nvidia buildbots, so we'll need to investigate what is
going wrong here. For now revert it to get the bots green.

This reverts commit 05abcc579244b68162b847a6780d27b22bd58f74.
---
 libc/src/__support/RPC/rpc.h         | 53 +++++++++++++-----------------------
 libc/startup/gpu/amdgpu/start.cpp    |  2 +-
 libc/startup/gpu/nvptx/start.cpp     |  2 +-
 libc/utils/gpu/server/rpc_server.cpp |  2 +-
 libc/utils/gpu/server/rpc_server.h   |  2 +-
 5 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index a252002..d91a5c8 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -56,8 +56,11 @@ template struct alignas(64) Packet {
   Payload payload;
 };
 
-/// The maximum number of parallel ports that the RPC interface can support.
-constexpr uint64_t MAX_PORT_COUNT = 512;
+// TODO: This should be configured by the server and passed in. The general rule
+// of thumb is that you should have at least as many ports as possible
+// concurrent work items on the GPU to mitigate the lack offorward
+// progress guarantees on the GPU.
+constexpr uint64_t DEFAULT_PORT_COUNT = 64;
 
 /// A common process used to synchronize communication between a client and a
 /// server. The process contains a read-only inbox and a write-only outbox used
@@ -84,8 +87,7 @@ template struct Process {
   cpp::Atomic<uint32_t> *outbox;
   Packet *packet;
 
-  static constexpr uint64_t NUM_BITS_IN_WORD = (sizeof(uint32_t) * 8);
-  cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
+  cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};
 
   /// Initialize the communication channels.
   LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
@@ -156,9 +158,10 @@ template struct Process {
 
   /// Attempt to claim the lock at index. Return true on lock taken.
   /// lane_mask is a bitmap of the threads in the warp that would hold the
-  /// single lock on success, e.g. the result of gpu::get_lane_mask().
-  /// The lock is held when the nth bit of the lock bitfield is set, otherwise
-  /// it is availible.
+  /// single lock on success, e.g. the result of gpu::get_lane_mask()
+  /// The lock is held when the zeroth bit of the uint32_t at lock[index]
+  /// is set, and available when that bit is clear. Bits [1, 32) are zero.
+  /// Or with one is a no-op when the lock is already held.
   [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
                                                   uint64_t index) {
     // On amdgpu, test and set to lock[index] and a sync_lane would suffice
@@ -170,10 +173,11 @@ template struct Process {
     // succeed in taking the lock, as otherwise it will leak. This is handled
     // by making threads which are not in lane_mask or with 0, a no-op.
     uint32_t id = gpu::get_lane_id();
-    bool id_in_lane_mask = lane_mask & (1u << id);
+    bool id_in_lane_mask = lane_mask & (1ul << id);
 
     // All threads in the warp call fetch_or. Possibly at the same time.
-    bool before = set_nth(lock, index, id_in_lane_mask);
+    bool before =
+        lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
     uint64_t packed = gpu::ballot(lane_mask, before);
 
     // If every bit set in lane_mask is also set in packed, every single thread
@@ -208,11 +212,12 @@ template struct Process {
     // Wait for other threads in the warp to finish using the lock
     gpu::sync_lane(lane_mask);
 
-    // Use exactly one thread to clear the associated lock bit. Must restrict
-    // to a single thread to avoid one thread dropping the lock, then an
-    // unrelated warp claiming the lock, then a second thread in this warp
-    // dropping the lock again.
-    clear_nth(lock, index, rpc::is_first_lane(lane_mask));
+    // Use exactly one thread to clear the bit at position 0 in lock[index]
+    // Must restrict to a single thread to avoid one thread dropping the lock,
+    // then an unrelated warp claiming the lock, then a second thread in this
+    // warp dropping the lock again.
+    uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
+    lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
     gpu::sync_lane(lane_mask);
   }
 
@@ -240,26 +245,6 @@ template struct Process {
   LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
     return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
   }
-
-  /// Conditionally set the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
-                                                uint64_t index, bool cond) {
-    const uint32_t slot = index / NUM_BITS_IN_WORD;
-    const uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
-                               cpp::MemoryOrder::RELAXED) &
-           (1u << bit);
-  }
-
-  /// Conditionally clear the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
-                                                  uint64_t index, bool cond) {
-    const uint32_t slot = index / NUM_BITS_IN_WORD;
-    const uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_and(-1u ^ (static_cast<uint32_t>(cond) << bit),
-                                cpp::MemoryOrder::RELAXED) &
-           (1u << bit);
-  }
 };
 
 /// Invokes a function accross every active buffer across the total lane size.
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index b2adb1d..7081ae6 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
+  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
                                  rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index cd44239..90f0049 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
+  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
                                  rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 52c202e..da9f506 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -23,7 +23,7 @@ using namespace __llvm_libc;
 static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
               "Buffer size mismatch");
 
-static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
+static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
               "Incorrect maximum port count");
 
 // The client needs to support different lane sizes for the SIMT model. Because
diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
index 3c9ba1d..556f4e9 100644
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 /// The maxium number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
+const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;
 
 /// The symbol name associated with the client for use with the LLVM C library
 /// implementation.
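For readers comparing the two schemes: the code removed above packs one lock per bit, so 512 ports fit in sixteen 32-bit words, while the restored code keeps one cpp::Atomic<uint32_t> per port and only uses bit 0 of each word. The sketch below is a minimal host-side illustration of the packed set_nth/clear_nth helpers from the reverted commit, not the libc implementation: std::atomic stands in for cpp::Atomic, the warp-level lane_mask/ballot handling that try_lock performs on the GPU is omitted, and lock_bits plus the main driver are illustrative names that do not exist in the libc sources.

// Host-side sketch of the reverted bitfield locking scheme (assumptions above).
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint64_t MAX_PORT_COUNT = 512;
constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;

// 512 ports packed into 16 words instead of one lock word per port.
static std::atomic<uint32_t> lock_bits[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {};

// Conditionally set the n-th bit; returns whether it was already set.
// A caller passing cond == false ORs with zero, a no-op, which is how the
// patch neutralizes threads that are not part of lane_mask.
bool set_nth(std::atomic<uint32_t> *bits, uint64_t index, bool cond) {
  const uint32_t slot = index / NUM_BITS_IN_WORD;
  const uint32_t bit = index % NUM_BITS_IN_WORD;
  return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
                             std::memory_order_relaxed) &
         (1u << bit);
}

// Conditionally clear the n-th bit; ANDing with all-ones is the no-op case.
bool clear_nth(std::atomic<uint32_t> *bits, uint64_t index, bool cond) {
  const uint32_t slot = index / NUM_BITS_IN_WORD;
  const uint32_t bit = index % NUM_BITS_IN_WORD;
  return bits[slot].fetch_and(~(static_cast<uint32_t>(cond) << bit),
                              std::memory_order_relaxed) &
         (1u << bit);
}

int main() {
  // Port 37 maps to word 37 / 32 = 1, bit 37 % 32 = 5 in the packed layout.
  assert(!set_nth(lock_bits, 37, true));  // lock was free and is now held
  assert(set_nth(lock_bits, 37, true));   // a second attempt sees it held
  assert(clear_nth(lock_bits, 37, true)); // release reports the old state
  std::printf("bitfield lock round-trip ok\n");
  return 0;
}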
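A brief note on the design choice being undone, inferred from the hunks above rather than stated in the commit message: the packed layout keeps the lock array small (16 words for 512 ports) but makes neighboring ports contend on the same atomic word, while the restored one-word-per-port layout spends 64 words on 64 ports yet keeps release a single fetch_and with a mask of ~1 issued from the first lane, as in the unlock hunk.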