author     Joseph Huber <jhuber6@vols.utk.edu>    2023-07-18 10:22:01 -0500
committer  Joseph Huber <jhuber6@vols.utk.edu>    2023-07-18 11:34:21 -0500
commit     05abcc579244b68162b847a6780d27b22bd58f74 (patch)
tree       112385134836ef333baeacf4dc1a5e563d8c486f /libc
parent     3ad4f92f83b7a6a79957c9413dfa3a2667c236a6 (diff)
[libc] Treat the locks array as a bitfield
Currently we keep an internal buffer of device memory that is used to indicate
ownership of a port. Since we only ever use a single bit of each word, we can
simply turn this array into a bitfield. I did this manually rather than having
a separate type, as we need very special handling of the masks used to
interact with the locks.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D155511
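The core of the change is packing one lock bit per port into an array of
32-bit atomic words. Below is a minimal standalone sketch of the technique,
mirroring the set_nth/clear_nth helpers in the patch; it substitutes
std::atomic for the libc-internal cpp::Atomic wrapper, so it is an
illustration rather than the exact implementation:

    #include <atomic>
    #include <cstdint>

    constexpr uint64_t MAX_PORT_COUNT = 512;
    constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8; // 32

    // One bit per port: 512 ports / 32 bits per word = 16 atomic words,
    // versus the 512 words the old one-word-per-port array would need.
    std::atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {};

    // Conditionally set the n-th bit; returns nonzero if it was already set.
    // When cond is false the fetch_or ORs in zero, making the call a no-op,
    // which is how threads outside the lane mask are handled.
    uint32_t set_nth(std::atomic<uint32_t> *bits, uint64_t index, bool cond) {
      uint32_t slot = index / NUM_BITS_IN_WORD;
      uint32_t bit = index % NUM_BITS_IN_WORD;
      return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
                                 std::memory_order_relaxed) &
             (1u << bit);
    }

    // Conditionally clear the n-th bit; returns nonzero if it was set before.
    // A false cond ANDs with all-ones, again a no-op.
    uint32_t clear_nth(std::atomic<uint32_t> *bits, uint64_t index, bool cond) {
      uint32_t slot = index / NUM_BITS_IN_WORD;
      uint32_t bit = index % NUM_BITS_IN_WORD;
      return bits[slot].fetch_and(~(static_cast<uint32_t>(cond) << bit),
                                  std::memory_order_relaxed) &
             (1u << bit);
    }

The conditional-on-cond form is the "very special handling" the message
refers to: every thread in the warp can execute the same atomic operation
unconditionally, with threads that should not participate contributing an
identity mask.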
Diffstat (limited to 'libc')
-rw-r--r--  libc/src/__support/RPC/rpc.h          53
-rw-r--r--  libc/startup/gpu/amdgpu/start.cpp      2
-rw-r--r--  libc/startup/gpu/nvptx/start.cpp       2
-rw-r--r--  libc/utils/gpu/server/rpc_server.cpp   2
-rw-r--r--  libc/utils/gpu/server/rpc_server.h     2
5 files changed, 38 insertions, 23 deletions
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 9c39cd2..2886e06 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -56,11 +56,8 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
Payload<lane_size> payload;
};
-// TODO: This should be configured by the server and passed in. The general rule
-// of thumb is that you should have at least as many ports as possible
-// concurrent work items on the GPU to mitigate the lack of forward
-// progress guarantees on the GPU.
-constexpr uint64_t DEFAULT_PORT_COUNT = 64;
+/// The maximum number of parallel ports that the RPC interface can support.
+constexpr uint64_t MAX_PORT_COUNT = 512;
/// A common process used to synchronize communication between a client and a
/// server. The process contains a read-only inbox and a write-only outbox used
@@ -87,7 +84,8 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> *outbox;
Packet *packet;
- cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};
+ static constexpr uint64_t NUM_BITS_IN_WORD = (sizeof(uint32_t) * 8);
+ cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
/// Initialize the communication channels.
LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
@@ -158,10 +156,9 @@ template <bool Invert, typename Packet> struct Process {
/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
- /// single lock on success, e.g. the result of gpu::get_lane_mask()
- /// The lock is held when the zeroth bit of the uint32_t at lock[index]
- /// is set, and available when that bit is clear. Bits [1, 32) are zero.
- /// Or with one is a no-op when the lock is already held.
+ /// single lock on success, e.g. the result of gpu::get_lane_mask().
+ /// The lock is held when the n-th bit of the lock bitfield is set,
+ /// otherwise it is available.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
@@ -173,11 +170,10 @@ template <bool Invert, typename Packet> struct Process {
// succeed in taking the lock, as otherwise it will leak. This is handled
// by making threads which are not in lane_mask or with 0, a no-op.
uint32_t id = gpu::get_lane_id();
- bool id_in_lane_mask = lane_mask & (1ul << id);
+ bool id_in_lane_mask = lane_mask & (1u << id);
// All threads in the warp call fetch_or. Possibly at the same time.
- bool before =
- lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
+ bool before = set_nth(lock, index, id_in_lane_mask);
uint64_t packed = gpu::ballot(lane_mask, before);
// If every bit set in lane_mask is also set in packed, every single thread
@@ -212,12 +208,11 @@ template <bool Invert, typename Packet> struct Process {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);
- // Use exactly one thread to clear the bit at position 0 in lock[index]
- // Must restrict to a single thread to avoid one thread dropping the lock,
- // then an unrelated warp claiming the lock, then a second thread in this
- // warp dropping the lock again.
- uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
- lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
+ // Use exactly one thread to clear the associated lock bit. Must restrict
+ // to a single thread to avoid one thread dropping the lock, then an
+ // unrelated warp claiming the lock, then a second thread in this warp
+ // dropping the lock again.
+ clear_nth(lock, index, rpc::is_first_lane(lane_mask));
gpu::sync_lane(lane_mask);
}
@@ -245,6 +240,26 @@ template <bool Invert, typename Packet> struct Process {
LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
}
+
+ /// Conditionally set the n-th bit in the atomic bitfield.
+ LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
+ uint64_t index, bool cond) {
+ const uint32_t slot = index / NUM_BITS_IN_WORD;
+ const uint32_t bit = index % NUM_BITS_IN_WORD;
+ return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
+ cpp::MemoryOrder::RELAXED) &
+ (1u << bit);
+ }
+
+ /// Conditionally clear the n-th bit in the atomic bitfield.
+ LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
+ uint64_t index, bool cond) {
+ const uint32_t slot = index / NUM_BITS_IN_WORD;
+ const uint32_t bit = index % NUM_BITS_IN_WORD;
+ return bits[slot].fetch_and(-1u ^ (static_cast<uint32_t>(cond) << bit),
+ cpp::MemoryOrder::RELAXED) &
+ (1u << bit);
+ }
};
/// Invokes a function across every active buffer across the total lane size.
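The warp-wide ownership test that try_lock performs on the ballot result can
be summarized outside the diff. This is a host-side sketch under the
assumption, suggested by the truncated comment above, that the warp fails to
take the lock exactly when every lane in lane_mask observed the bit as
already set; the function name warp_took_lock is hypothetical:

    #include <cstdint>

    // `packed` stands in for gpu::ballot(lane_mask, before): bit i is set
    // when lane i saw the lock bit already held before its fetch_or.
    bool warp_took_lock(uint64_t lane_mask, uint64_t packed) {
      // If every participating lane saw the bit set (packed == lane_mask),
      // the lock was held elsewhere and the whole warp fails. Otherwise at
      // least one lane's fetch_or flipped the bit from 0 to 1, so the warp
      // as a whole now owns the lock.
      return packed != lane_mask;
    }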
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 7081ae6..b2adb1d 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
- __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
+ __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);
// We want the fini array callbacks to be run after other atexit
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 90f0049..cd44239 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
- __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
+ __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);
// We want the fini array callbacks to be run after other atexit
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index da9f506..52c202e 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -23,7 +23,7 @@ using namespace __llvm_libc;
static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
"Buffer size mismatch");
-static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
+static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
"Incorrect maximum port count");
// The client needs to support different lane sizes for the SIMT model. Because
diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
index 556f4e9..3c9ba1d 100644
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -18,7 +18,7 @@ extern "C" {
#endif
/// The maximum number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;
+const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
/// The symbol name associated with the client for use with the LLVM C library
/// implementation.