Diffstat (limited to 'libc')
-rw-r--r--  libc/src/__support/FPUtil/CMakeLists.txt  |   2
-rw-r--r--  libc/src/__support/FPUtil/bfloat16.h      |  25
-rw-r--r--  libc/src/__support/FPUtil/rounding_mode.h |  67
-rw-r--r--  libc/src/__support/GPU/allocator.cpp      | 138
-rw-r--r--  libc/src/__support/RPC/rpc_server.h       |   6
5 files changed, 152 insertions, 86 deletions
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index f157d90..3024ac6 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -16,6 +16,7 @@ add_header_library(
     rounding_mode.h
   DEPENDS
     libc.hdr.fenv_macros
+    libc.src.__support.CPP.type_traits
     libc.src.__support.macros.attributes
     libc.src.__support.macros.properties.architectures
     libc.src.__support.macros.sanitizer
@@ -274,6 +275,7 @@ add_header_library(
     bfloat16.h
  DEPENDS
     .cast
+    .comparison_operations
     .dyadic_float
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.type_traits
diff --git a/libc/src/__support/FPUtil/bfloat16.h b/libc/src/__support/FPUtil/bfloat16.h
index bc0b8b2..84a0dca 100644
--- a/libc/src/__support/FPUtil/bfloat16.h
+++ b/libc/src/__support/FPUtil/bfloat16.h
@@ -12,6 +12,7 @@
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/comparison_operations.h"
 #include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/types.h"
@@ -57,6 +58,30 @@ struct BFloat16 {
     uint32_t x_bits = static_cast<uint32_t>(bits) << 16U;
     return cpp::bit_cast<float>(x_bits);
   }
+
+  LIBC_INLINE bool operator==(BFloat16 other) const {
+    return fputil::equals(*this, other);
+  }
+
+  LIBC_INLINE bool operator!=(BFloat16 other) const {
+    return !fputil::equals(*this, other);
+  }
+
+  LIBC_INLINE bool operator<(BFloat16 other) const {
+    return fputil::less_than(*this, other);
+  }
+
+  LIBC_INLINE bool operator<=(BFloat16 other) const {
+    return fputil::less_than_or_equals(*this, other);
+  }
+
+  LIBC_INLINE bool operator>(BFloat16 other) const {
+    return fputil::greater_than(*this, other);
+  }
+
+  LIBC_INLINE bool operator>=(BFloat16 other) const {
+    return fputil::greater_than_or_equals(*this, other);
+  }
 }; // struct BFloat16
 
 } // namespace fputil
diff --git a/libc/src/__support/FPUtil/rounding_mode.h b/libc/src/__support/FPUtil/rounding_mode.h
index bc66d09..4ee0a0b 100644
--- a/libc/src/__support/FPUtil/rounding_mode.h
+++ b/libc/src/__support/FPUtil/rounding_mode.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_ROUNDING_MODE_H
 
 #include "hdr/fenv_macros.h"
+#include "src/__support/CPP/type_traits.h" // is_constant_evaluated
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
 
@@ -20,18 +21,26 @@ namespace fputil {
 // Using the following observation:
 // 1.0f + 2^-25 = 1.0f for FE_TONEAREST, FE_DOWNWARD, FE_TOWARDZERO
 //              = 0x1.000002f for FE_UPWARD.
-LIBC_INLINE bool fenv_is_round_up() {
-  volatile float x = 0x1.0p-25f;
-  return (1.0f + x != 1.0f);
+LIBC_INLINE static constexpr bool fenv_is_round_up() {
+  if (cpp::is_constant_evaluated()) {
+    return false;
+  } else {
+    volatile float x = 0x1.0p-25f;
+    return (1.0f + x != 1.0f);
+  }
 }
 
 // Quick free-standing test whether fegetround() == FE_DOWNWARD.
 // Using the following observation:
 // -1.0f - 2^-25 = -1.0f for FE_TONEAREST, FE_UPWARD, FE_TOWARDZERO
 //               = -0x1.000002f for FE_DOWNWARD.
-LIBC_INLINE bool fenv_is_round_down() {
-  volatile float x = 0x1.0p-25f;
-  return (-1.0f - x != -1.0f);
+LIBC_INLINE static constexpr bool fenv_is_round_down() {
+  if (cpp::is_constant_evaluated()) {
+    return false;
+  } else {
+    volatile float x = 0x1.0p-25f;
+    return (-1.0f - x != -1.0f);
+  }
 }
 
 // Quick free-standing test whether fegetround() == FE_TONEAREST.
@@ -40,10 +49,14 @@ LIBC_INLINE bool fenv_is_round_down() {
 //                = 0x1.100002p0f for FE_UPWARD,
 //   1.5f - 2^-24 = 1.5f for FE_TONEAREST, FE_UPWARD
 //                = 0x1.0ffffep-1f for FE_DOWNWARD, FE_TOWARDZERO
-LIBC_INLINE bool fenv_is_round_to_nearest() {
-  static volatile float x = 0x1.0p-24f;
-  float y = x;
-  return (1.5f + y == 1.5f - y);
+LIBC_INLINE static constexpr bool fenv_is_round_to_nearest() {
+  if (cpp::is_constant_evaluated()) {
+    return true;
+  } else {
+    volatile float x = 0x1.0p-24f;
+    float y = 1.5f + x;
+    return (y == 1.5f - x);
+  }
 }
 
 // Quick free-standing test whether fegetround() == FE_TOWARDZERO.
@@ -56,23 +69,31 @@ LIBC_INLINE bool fenv_is_round_to_nearest() {
 // (0x1.000002p0f + 2^-24) + (-1.0f - 2^-24) = 2^-23 for FE_TOWARDZERO
 //                                           = 2^-22 for FE_TONEAREST, FE_UPWARD
 //                                           = 0 for FE_DOWNWARD
-LIBC_INLINE bool fenv_is_round_to_zero() {
-  static volatile float x = 0x1.0p-24f;
-  float y = x;
-  return ((0x1.000002p0f + y) + (-1.0f - y) == 0x1.0p-23f);
+LIBC_INLINE static constexpr bool fenv_is_round_to_zero() {
+  if (cpp::is_constant_evaluated()) {
+    return false;
+  } else {
+    volatile float x = 0x1.0p-24f;
+    volatile float y = 0x1.000002p0f + x;
+    return (y + (-1.0f - x) == 0x1.0p-23f);
+  }
 }
 
 // Quick free standing get rounding mode based on the above observations.
-LIBC_INLINE int quick_get_round() {
-  static volatile float x = 0x1.0p-24f;
-  float y = x;
-  float z = (0x1.000002p0f + y) + (-1.0f - y);
+LIBC_INLINE static constexpr int quick_get_round() {
+  if (cpp::is_constant_evaluated()) {
+    return FE_TONEAREST;
+  } else {
+    volatile float x = 0x1.0p-24f;
+    volatile float y = 0x1.000002p0f + x;
+    float z = y + (-1.0f - x);
 
-  if (z == 0.0f)
-    return FE_DOWNWARD;
-  if (z == 0x1.0p-23f)
-    return FE_TOWARDZERO;
-  return (2.0f + y == 2.0f) ? FE_TONEAREST : FE_UPWARD;
+    if (z == 0.0f)
+      return FE_DOWNWARD;
+    if (z == 0x1.0p-23f)
+      return FE_TOWARDZERO;
+    return (2.0f + x == 2.0f) ? FE_TONEAREST : FE_UPWARD;
+  }
 }
 
 } // namespace fputil
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7923fbb..866aea7 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
+// The number of times to attempt claiming an in-progress slab allocation.
+constexpr static uint32_t MAX_TRIES = 128;
+
 // A sentinel used to indicate an invalid but non-null pointer value.
 constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
-// The number of times we will try starting on a single index before skipping
-// past it.
-constexpr static uint32_t MAX_TRIES = 512;
-
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
   return state * 0x9e3779bb;
 }
 
-// Final stage of murmurhash used to get a unique index for the global array
-static inline uint32_t hash(uint32_t x) {
-  x ^= x >> 16;
-  x *= 0x85ebca6b;
-  x ^= x >> 13;
-  x *= 0xc2b2ae35;
-  x ^= x >> 16;
-  return x;
-}
-
 // Rounds the input value to the closest permitted chunk size. Here we accept
 // the sum of the closest three powers of two. For a 2MiB slab size this is 48
 // different chunk sizes. This gives us average internal fragmentation of 87.5%.
-static inline uint32_t get_chunk_size(uint32_t x) {
+static inline constexpr uint32_t get_chunk_size(uint32_t x) {
   uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
   uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
 
@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
 }
 
+// Converts a chunk size into an index suitable for a statically sized array.
+static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+  if (x <= MIN_SIZE)
+    return 0;
+  uint32_t y = x >> 4;
+  if (x < MIN_SIZE << 2)
+    return cpp::popcount(y);
+  return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
+}
+
 // Rounds to the nearest power of two.
 template <uint32_t N, typename T>
 static inline constexpr T round_up(const T x) {
@@ -143,6 +142,12 @@ static inline constexpr bool is_pow2(uint64_t x) {
   return x && (x & (x - 1)) == 0;
 }
 
+// Where this chunk size should start looking in the global array.
+static inline constexpr uint32_t start_index(uint32_t chunk_index) {
+  return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
+         impl::get_chunk_id(SLAB_SIZE / 2);
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -251,12 +256,18 @@ struct Slab {
     // The uniform mask represents which lanes contain a uniform target pointer.
    // We attempt to place these next to each other.
     void *result = nullptr;
+    uint32_t after = ~0u;
+    uint32_t old_index = 0;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
       if (result)
         continue;
 
-      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
+      // We try using any known empty bits from the previous attempt first.
+      uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
+                                    ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
+                                                 cpp::countr_zero(~after)
+                                           : impl::xorshift32(state));
 
       uint32_t id = impl::lane_count(uniform & mask);
       uint32_t index = (start + id) % usable_bits(chunk_size);
@@ -266,8 +277,9 @@ struct Slab {
       // Get the mask of bits destined for the same slot and coalesce it.
       uint64_t match = uniform & gpu::match_any(mask, slot);
       uint32_t length = cpp::popcount(match);
-      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
-                         << bit;
+      uint32_t bitmask = gpu::shuffle(
+          mask, cpp::countr_zero(match),
+          static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
 
       uint32_t before = 0;
       if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
@@ -278,6 +290,9 @@ struct Slab {
         result = ptr_from_index(index, chunk_size);
       else
         sleep_briefly();
+
+      after = before | bitmask;
+      old_index = index;
     }
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
@@ -451,66 +466,65 @@ public:
 // The global array used to search for a valid slab to allocate from.
 static GuardPtr slots[ARRAY_SIZE] = {};
 
+// Keep a cache of the last successful slot for each chunk size. Initialize it
+// to an even spread of the total size. Must be updated if the chunking scheme
+// changes.
+#define S(X) (impl::start_index(X))
+static cpp::Atomic<uint32_t> indices[] = {
+    S(16),     S(32),     S(48),     S(64),     S(96),     S(112),    S(128),
+    S(192),    S(224),    S(256),    S(384),    S(448),    S(512),    S(768),
+    S(896),    S(1024),   S(1536),   S(1792),   S(2048),   S(3072),   S(3584),
+    S(4096),   S(6144),   S(7168),   S(8192),   S(12288),  S(14336),  S(16384),
+    S(24576),  S(28672),  S(32768),  S(49152),  S(57344),  S(65536),  S(98304),
+    S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
+    S(524288), S(786432), S(917504), S(1048576)};
+#undef S
+
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {
-  // We start at a hashed value to spread out different chunk sizes.
-  uint32_t start = impl::hash(chunk_size);
-  uint64_t lane_mask = gpu::get_lane_mask();
-  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
-
-  Slab *result = nullptr;
-  uint32_t nudge = 0;
-  for (uint64_t mask = lane_mask; mask;
-       mask = gpu::ballot(lane_mask, !result), ++nudge) {
-    uint32_t index = cpp::numeric_limits<uint32_t>::max();
-    for (uint32_t offset = nudge / MAX_TRIES;
-         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
-         offset += cpp::popcount(uniform & lane_mask)) {
-      uint32_t candidate =
-          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
-      uint64_t available =
-          gpu::ballot(lane_mask, slots[candidate].use_count() <
-                                     Slab::available_chunks(chunk_size));
-      uint32_t new_index = gpu::shuffle(
-          lane_mask, cpp::countr_zero(available & uniform), candidate);
-
-      // Each uniform group will use the first empty slot they find.
-      if ((index == cpp::numeric_limits<uint32_t>::max() &&
-           (available & uniform)))
-        index = new_index;
-
-      // Guaruntees that this loop will eventuall exit if there is no space.
-      if (offset >= ARRAY_SIZE) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-        index = 0;
-      }
-    }
+  // We start at the index of the last successful allocation for this kind.
+  uint32_t chunk_id = impl::get_chunk_id(chunk_size);
+  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
+  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+
+  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+    uint32_t index =
+        !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
 
-    // Try to claim a slot for the found slot.
-    if (!result) {
+    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+      uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;
-      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
+
+      // If there is a slab allocation in progress we retry a few times.
+      for (uint32_t retries = 0;
+           retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+        uint64_t lane_mask = gpu::get_lane_mask();
+        slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+                                     chunk_size, index);
+        sleep_briefly();
+      }
+
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
-      // of out-of-memory we return a sentinel value.
+      // of out-of-memory we receive a sentinel value and return a failure.
       if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
-        result = slab;
+        if (index != start)
+          indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+        return slab;
       } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
-        if (slab->get_chunk_size() != chunk_size)
-          start = index + 1;
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
-      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-      } else {
-        sleep_briefly();
+      } else if (!slab && reserved == SENTINEL) {
+        return nullptr;
       }
     }
   }
-  return result;
+  return nullptr;
 }
 
 // Release the lock associated with a given slab.
diff --git a/libc/src/__support/RPC/rpc_server.h b/libc/src/__support/RPC/rpc_server.h
index dc3d803..beea4dd 100644
--- a/libc/src/__support/RPC/rpc_server.h
+++ b/libc/src/__support/RPC/rpc_server.h
@@ -15,13 +15,17 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_SERVER_H
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_SERVER_H
 
+#include "src/__support/macros/properties/compiler.h"
+
 // Workaround for missing __has_builtin in < GCC 10.
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
 
 // Workaround for missing __builtin_is_constant_evaluated in < GCC 10.
-#ifndef __builtin_is_constant_evaluated
+// Also this builtin is defined for GCC 9.
+#if !(__has_builtin(__builtin_is_constant_evaluated) ||                       \
+      (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 900)))
 #define __builtin_is_constant_evaluated(x) 0
 #endif
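Reviewer note on the rounding_mode.h hunks: the helpers keep the same volatile-float probes as before, but gain an is_constant_evaluated() branch so they can be called from constexpr code, where no floating-point environment exists and the default FE_TONEAREST is reported. Below is a minimal standalone sketch of that probe, not the libc header itself; it assumes a C++20 hosted toolchain and uses std::is_constant_evaluated() and <cfenv> in place of the internal cpp:: and hdr/ wrappers.

#include <cfenv>       // FE_* macros and std::fesetround
#include <cstdio>
#include <type_traits> // std::is_constant_evaluated

// Same observation as the patch: (0x1.000002p0f + 2^-24) + (-1.0f - 2^-24)
// evaluates to 0 (FE_DOWNWARD), 2^-23 (FE_TOWARDZERO) or 2^-22 (FE_TONEAREST
// and FE_UPWARD); the volatile reads stop the compiler from folding the probe.
constexpr int probe_round() {
  if (std::is_constant_evaluated())
    return FE_TONEAREST; // no FP environment to query at compile time
  volatile float x = 0x1.0p-24f;
  volatile float y = 0x1.000002p0f + x;
  float z = y + (-1.0f - x);
  if (z == 0.0f)
    return FE_DOWNWARD;
  if (z == 0x1.0p-23f)
    return FE_TOWARDZERO;
  return (2.0f + x == 2.0f) ? FE_TONEAREST : FE_UPWARD;
}

// The volatile locals are never reached during constant evaluation, so the
// function remains usable in constant expressions.
static_assert(probe_round() == FE_TONEAREST, "compile-time result is fixed");

int main() {
  std::fesetround(FE_UPWARD);
  std::printf("FE_UPWARD detected: %d\n", probe_round() == FE_UPWARD);
}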
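Reviewer note on the allocator.cpp change: dropping the murmurhash probe works because get_chunk_id() assigns every permitted chunk size a small dense index, which is what the new indices[] cache and the start_index() spread are built on. The sketch below restates that mapping outside the allocator, using C++20 <bit> (std::popcount / std::countl_zero stand in for the cpp:: wrappers) and a placeholder ARRAY_SIZE value, so it is illustrative rather than the real allocator constants.

#include <bit>
#include <cstdint>

constexpr uint32_t MIN_SIZE = 16;               // mirrors allocator.cpp
constexpr uint32_t BITS_IN_WORD = 32;
constexpr uint32_t SLAB_SIZE = 2 * 1024 * 1024; // 2 MiB slabs per the comment
constexpr uint32_t ARRAY_SIZE = 128;            // placeholder, not the real value

// Dense index over the permitted chunk sizes: each size is a sum of at most
// three powers of two, so there are three permitted sizes per power-of-two
// interval from 64 upwards.
constexpr uint32_t get_chunk_id(uint32_t x) {
  if (x <= MIN_SIZE)
    return 0;
  uint32_t y = x >> 4;
  if (x < MIN_SIZE << 2)
    return std::popcount(y);
  return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
}

// Spread the cached starting slots evenly across the global slot array.
constexpr uint32_t start_index(uint32_t chunk_size) {
  return (ARRAY_SIZE * get_chunk_id(chunk_size)) / get_chunk_id(SLAB_SIZE / 2);
}

// The smallest permitted sizes map to consecutive ids...
static_assert(get_chunk_id(16) == 0 && get_chunk_id(32) == 1 &&
              get_chunk_id(48) == 2 && get_chunk_id(64) == 3 &&
              get_chunk_id(96) == 4 && get_chunk_id(112) == 5 &&
              get_chunk_id(128) == 6);
// ...and the largest cached size (half a slab, 1 MiB) maps to id 45, which is
// why the indices[] table in the patch has 46 entries.
static_assert(get_chunk_id(SLAB_SIZE / 2) == 45);
static_assert(start_index(16) == 0 && start_index(1024) < start_index(2048));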
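Reviewer note on the rpc_server.h hunk: the guard now asks __has_builtin first and falls back to a compiler-version check, since GCC 9 already ships __builtin_is_constant_evaluated but __has_builtin only arrived in GCC 10. A compressed sketch of that detection pattern follows; LIBC_COMPILER_IS_GCC and LIBC_COMPILER_GCC_VER are the macros the patch pulls in from the compiler.h properties header, and the final function is just a hypothetical call site.

// Not the libc header itself, only the shape of the guard it now uses.
#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_* macros

// Pre-10 GCC lacks __has_builtin entirely, so give it a harmless fallback.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

// Stub the builtin out only when neither the feature test nor the version
// check can prove it exists.
#if !(__has_builtin(__builtin_is_constant_evaluated) ||                       \
      (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 900)))
#define __builtin_is_constant_evaluated(x) 0
#endif

// Call sites can use the builtin unconditionally; with the stub in place the
// expression simply expands to 0 and the runtime branch is always taken.
constexpr bool in_constant_context() { return __builtin_is_constant_evaluated(); }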