54 files changed, 1297 insertions, 2 deletions
diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt
index 683c746..af9f687 100644
--- a/libc/config/baremetal/aarch64/entrypoints.txt
+++ b/libc/config/baremetal/aarch64/entrypoints.txt
@@ -757,6 +757,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -765,6 +771,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index f8ecc2e..ce8d7c0 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -760,6 +760,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -768,6 +774,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 679bfb3..7eeec24 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -760,6 +760,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -768,6 +774,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt
index 72b0265..de4b4df 100644
--- a/libc/config/darwin/aarch64/entrypoints.txt
+++ b/libc/config/darwin/aarch64/entrypoints.txt
@@ -590,6 +590,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -598,6 +604,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index b5ab1ee..f668e8a 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -233,6 +233,12 @@ set(TARGET_LIBM_ENTRYPOINTS
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
diff --git a/libc/config/gpu/amdgpu/entrypoints.txt b/libc/config/gpu/amdgpu/entrypoints.txt
index 77a13a6..f7b8b2f 100644
--- a/libc/config/gpu/amdgpu/entrypoints.txt
+++ b/libc/config/gpu/amdgpu/entrypoints.txt
@@ -616,6 +616,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
diff --git a/libc/config/gpu/nvptx/entrypoints.txt b/libc/config/gpu/nvptx/entrypoints.txt
index 61c9c71..23afb40 100644
--- a/libc/config/gpu/nvptx/entrypoints.txt
+++ b/libc/config/gpu/nvptx/entrypoints.txt
@@ -617,6 +617,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index fbdf8fb..62a3ae9 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -844,6 +844,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -852,6 +858,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LLVM_LIBC_FULL_BUILD)
   list(APPEND TARGET_LIBC_ENTRYPOINTS
     # assert.h entrypoints
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index e3f5fee..eedf184 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -460,6 +460,12 @@ set(TARGET_LIBM_ENTRYPOINTS
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index ba67ddd..cd56979 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -863,6 +863,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -871,6 +877,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 066dc21..2b0cf33 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -895,6 +895,12 @@ endif()
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
@@ -903,6 +909,14 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.truncbf16
 )
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C++23 mixed bfloat16 and _Float128 entrypoints
+    libc.src.math.bf16addf128
+    libc.src.math.bf16subf128
+  )
+endif()
+
 if(LIBC_COMPILER_HAS_FIXED_POINT)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # stdfix.h _Fract and _Accum entrypoints
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 994078c..1b1db5e 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -306,6 +306,12 @@ set(TARGET_LIBM_ENTRYPOINTS
 
 list(APPEND TARGET_LIBM_ENTRYPOINTS
   # bfloat16 entrypoints
+  libc.src.math.bf16add
+  libc.src.math.bf16addf
+  libc.src.math.bf16addl
+  libc.src.math.bf16sub
+  libc.src.math.bf16subf
+  libc.src.math.bf16subl
   libc.src.math.ceilbf16
   libc.src.math.fabsbf16
   libc.src.math.floorbf16
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index f8fdfeb..72a7879 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -9,6 +9,12 @@ add_header_library(
     utils.h
 )
 
+add_header_library(
+  fixedstack
+  HDRS
+    fixedstack.h
+)
+
 add_object_library(
   allocator
   SRCS
@@ -23,4 +29,5 @@ add_object_library(
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.new
     .utils
+    .fixedstack
 )
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 250bebd..534a309 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -20,6 +20,7 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/new.h"
+#include "src/__support/GPU/fixedstack.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/threads/sleep.h"
@@ -39,6 +40,9 @@ constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 // The number of times to attempt claiming an in-progress slab allocation.
 constexpr static uint32_t MAX_TRIES = 1024;
 
+// The number of previously allocated slabs we will keep in memory.
+constexpr static uint32_t CACHED_SLABS = 8;
+
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -185,20 +189,35 @@ struct Slab {
   struct alignas(MIN_SIZE) Header {
     uint32_t chunk_size;
     uint32_t global_index;
+    uint32_t cached_chunk_size;
   };
 
   // Initialize the slab with its chunk size and index in the global table for
   // use when freeing.
   Slab(uint32_t chunk_size, uint32_t global_index) {
     Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = cpp::numeric_limits<uint32_t>::max();
     header->chunk_size = chunk_size;
     header->global_index = global_index;
   }
 
+  // Reset the memory with a new index and chunk size, not thread safe.
+  Slab *reset(uint32_t chunk_size, uint32_t global_index) {
+    Header *header = reinterpret_cast<Header *>(memory);
+    header->cached_chunk_size = header->chunk_size; // Remember the old size.
+    header->chunk_size = chunk_size;
+    header->global_index = global_index;
+    return this;
+  }
+
   // Set the necessary bitfield bytes to zero in parallel using many lanes. This
   // must be called before the bitfield can be accessed safely, memory is not
   // guaranteed to be zero initialized in the current implementation.
   void initialize(uint64_t uniform) {
+    // If this is a re-used slab the memory is already set to zero.
+    if (get_cached_chunk_size() <= get_chunk_size())
+      return;
+
     uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                     sizeof(uint32_t);
     impl::uniform_memset(get_bitfield(), 0, size, uniform);
@@ -236,6 +255,11 @@ struct Slab {
     return reinterpret_cast<const Header *>(memory)->chunk_size;
   }
 
+  // Get the chunk size used before the last reset (max value if never reset).
+  uint32_t get_cached_chunk_size() const {
+    return reinterpret_cast<const Header *>(memory)->cached_chunk_size;
+  }
+
   // Get the location in the memory where we will store the global index.
   uint32_t get_global_index() const {
     return reinterpret_cast<const Header *>(memory)->global_index;
@@ -337,6 +361,9 @@ struct Slab {
   uint8_t memory[SLAB_SIZE];
 };
 
+// A global cache of previously allocated slabs for efficient reuse.
+static FixedStack<Slab *, CACHED_SLABS> slab_cache;
+
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
 struct GuardPtr {
@@ -408,6 +435,11 @@ private:
             reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
             cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint32_t>::max();
+
+      Slab *cached = nullptr;
+      if (slab_cache.pop(cached))
+        return cached->reset(cpp::forward<Args>(args)...);
+
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
@@ -475,8 +507,10 @@ public:
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
       Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~Slab();
-      impl::rpc_free(p);
+      if (!slab_cache.push(p)) {
+        p->~Slab();
+        impl::rpc_free(p);
+      }
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
     }
diff --git a/libc/src/__support/GPU/fixedstack.h b/libc/src/__support/GPU/fixedstack.h
new file mode 100644
index 0000000..6ceaa2f
--- /dev/null
+++ b/libc/src/__support/GPU/fixedstack.h
@@ -0,0 +1,111 @@
+//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+#define LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
+
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/threads/sleep.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+// A lock-free fixed size stack backed by an underlying array of data. It
+// supports push and pop operations in a completely lock-free manner.
+template <typename T, uint32_t CAPACITY> struct alignas(16) FixedStack {
+  // The index is stored in 20 bits, so CAPACITY must be less than 2^20.
+  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
+
+  // The heads of the free and used stacks. Each is a 20-bit index combined
+  // with a 44-bit ABA tag; both are updated in a single atomic operation.
+  uint64_t free;
+  uint64_t used;
+
+  // Each stack is a linked list of indices into the underlying data array.
+  uint32_t next[CAPACITY];
+  T data[CAPACITY];
+
+  // Get the 20-bit index into the underlying array from the head.
+  LIBC_INLINE static constexpr uint32_t get_node(uint64_t head) {
+    return static_cast<uint32_t>(head & 0xfffff);
+  }
+
+  // Build a new head from `node` and the ABA tag of `orig` incremented by one.
+  LIBC_INLINE static constexpr uint64_t make_head(uint64_t orig,
+                                                  uint32_t node) {
+    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
+  }
+
+  // Pops the top node of the given stack by making the head point to the next
+  // node, retrying the compare-and-swap if another thread changed the head.
+  // Returns the popped index, or CAPACITY if the stack is empty.
+  LIBC_INLINE uint32_t pop_impl(cpp::AtomicRef<uint64_t> head) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+
+    for (;;) {
+      if (get_node(orig) == CAPACITY)
+        return CAPACITY; // Sentinel: the stack is empty.
+
+      uint32_t node =
+          cpp::AtomicRef(next[get_node(orig)]).load(cpp::MemoryOrder::RELAXED);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::ACQUIRE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(orig);
+  }
+
+  // Pushes `node` onto the given stack by linking it to the current top and
+  // swinging the head to it, retrying the compare-and-swap if another thread
+  // changed the head. Returns the index read back from the head afterwards.
+  LIBC_INLINE uint32_t push_impl(cpp::AtomicRef<uint64_t> head, uint32_t node) {
+    uint64_t orig = head.load(cpp::MemoryOrder::RELAXED);
+    for (;;) {
+      next[node] = get_node(orig);
+      if (head.compare_exchange_strong(orig, make_head(orig, node),
+                                       cpp::MemoryOrder::RELEASE,
+                                       cpp::MemoryOrder::RELAXED))
+        break;
+    }
+    return get_node(head.load(cpp::MemoryOrder::RELAXED));
+  }
+
+public:
+  // Initialize the free stack to be full and the used stack to be empty. We use
+  // the capacity of the stack as a sentinel value.
+  LIBC_INLINE constexpr FixedStack() : free(0), used(CAPACITY), data{} {
+    for (uint32_t i = 0; i < CAPACITY; ++i)
+      next[i] = i + 1;
+  }
+
+  LIBC_INLINE bool push(const T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(free));
+    if (node == CAPACITY)
+      return false; // No free node is left: the stack is full.
+
+    data[node] = val;
+    push_impl(cpp::AtomicRef(used), node);
+    return true;
+  }
+
+  LIBC_INLINE bool pop(T &val) {
+    uint32_t node = pop_impl(cpp::AtomicRef(used));
+    if (node == CAPACITY)
+      return false; // No used node is present: the stack is empty.
+
+    val = data[node];
+    push_impl(cpp::AtomicRef(free), node);
+    return true;
+  }
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_GPU_FIXEDSTACK_H
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index c3840d3..660c3681 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -563,3 +563,13 @@ add_math_entrypoint_object(ufromfpxf)
 add_math_entrypoint_object(ufromfpxl)
 add_math_entrypoint_object(ufromfpxf16)
 add_math_entrypoint_object(ufromfpxf128)
+
+add_math_entrypoint_object(bf16add)
+add_math_entrypoint_object(bf16addf)
+add_math_entrypoint_object(bf16addl)
+add_math_entrypoint_object(bf16addf128)
+
+add_math_entrypoint_object(bf16sub)
+add_math_entrypoint_object(bf16subf)
+add_math_entrypoint_object(bf16subl)
+add_math_entrypoint_object(bf16subf128)
diff --git a/libc/src/math/bf16add.h b/libc/src/math/bf16add.h
new file mode 100644
index 0000000..a29970e
--- /dev/null
+++ b/libc/src/math/bf16add.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16add -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16ADD_H
+#define LLVM_LIBC_SRC_MATH_BF16ADD_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16add(double x, double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16ADD_H
diff --git a/libc/src/math/bf16addf.h b/libc/src/math/bf16addf.h
new file mode 100644
index 0000000..80a5e2a
--- /dev/null
+++ b/libc/src/math/bf16addf.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16addf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16ADDF_H
+#define LLVM_LIBC_SRC_MATH_BF16ADDF_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16addf(float x, float y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16ADDF_H
diff --git a/libc/src/math/bf16addf128.h b/libc/src/math/bf16addf128.h
new file mode 100644
index 0000000..3c2f3a1
--- /dev/null
+++ b/libc/src/math/bf16addf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16addf128 -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16ADDF128_H
+#define LLVM_LIBC_SRC_MATH_BF16ADDF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16addf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16ADDF128_H
diff --git a/libc/src/math/bf16addl.h b/libc/src/math/bf16addl.h
new file mode 100644
index 0000000..a9e7d68
--- /dev/null
+++ b/libc/src/math/bf16addl.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16addl ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16ADDL_H
+#define LLVM_LIBC_SRC_MATH_BF16ADDL_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16addl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16ADDL_H
diff --git a/libc/src/math/bf16sub.h b/libc/src/math/bf16sub.h
new file mode 100644
index 0000000..8108e914
--- /dev/null
+++ b/libc/src/math/bf16sub.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16sub -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16SUB_H
+#define LLVM_LIBC_SRC_MATH_BF16SUB_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16sub(double x, double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16SUB_H
diff --git a/libc/src/math/bf16subf.h b/libc/src/math/bf16subf.h
new file mode 100644
index 0000000..1bd79bf
--- /dev/null
+++ b/libc/src/math/bf16subf.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16subf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16SUBF_H
+#define LLVM_LIBC_SRC_MATH_BF16SUBF_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16subf(float x, float y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16SUBF_H
diff --git a/libc/src/math/bf16subf128.h b/libc/src/math/bf16subf128.h
new file mode 100644
index 0000000..19590e8
--- /dev/null
+++ b/libc/src/math/bf16subf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16subf128 -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16SUBF128_H
+#define LLVM_LIBC_SRC_MATH_BF16SUBF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16subf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16SUBF128_H
diff --git a/libc/src/math/bf16subl.h b/libc/src/math/bf16subl.h
new file mode 100644
index 0000000..13b2093
--- /dev/null
+++ b/libc/src/math/bf16subl.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16subl ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16SUBL_H
+#define LLVM_LIBC_SRC_MATH_BF16SUBL_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 bf16subl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16SUBL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 8936066..5aeacc8 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4911,3 +4911,116 @@ add_header_library(
     libc.src.__support.math.expf16_utils
     libc.src.__support.math.exp10_float16_constants
 )
+
+add_entrypoint_object(
+  bf16add
+  SRCS
+    bf16add.cpp
+  HDRS
+    ../bf16add.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16addf
+  SRCS
+    bf16addf.cpp
+  HDRS
+    ../bf16addf.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16addl
+  SRCS
+    bf16addl.cpp
+  HDRS
+    ../bf16addl.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16addf128
+  SRCS
+    bf16addf128.cpp
+  HDRS
+    ../bf16addf128.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+
+add_entrypoint_object(
+  bf16sub
+  SRCS
+    bf16sub.cpp
+  HDRS
+    ../bf16sub.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16subf
+  SRCS
+    bf16subf.cpp
+  HDRS
+    ../bf16subf.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16subl
+  SRCS
+    bf16subl.cpp
+  HDRS
+    ../bf16subl.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16subf128
+  SRCS
+    bf16subf128.cpp
+  HDRS
+    ../bf16subf128.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
diff --git a/libc/src/math/generic/bf16add.cpp b/libc/src/math/generic/bf16add.cpp
new file mode 100644
index 0000000..257596a
--- /dev/null
+++ b/libc/src/math/generic/bf16add.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16add function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16add.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(bfloat16, bf16add, (double x, double y)) {
+  return fputil::generic::add<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16addf.cpp b/libc/src/math/generic/bf16addf.cpp
new file mode 100644
index 0000000..65e6cbf
--- /dev/null
+++ b/libc/src/math/generic/bf16addf.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16addf function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16addf.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(bfloat16, bf16addf, (float x, float y)) {
+  return fputil::generic::add<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16addf128.cpp b/libc/src/math/generic/bf16addf128.cpp
new file mode 100644
index 0000000..03f70af
--- /dev/null
+++ b/libc/src/math/generic/bf16addf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16addf128 function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16addf128.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(bfloat16, bf16addf128, (float128 x, float128 y)) {
+  return fputil::generic::add<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16addl.cpp b/libc/src/math/generic/bf16addl.cpp
new file mode 100644
index 0000000..c212195
--- /dev/null
+++ b/libc/src/math/generic/bf16addl.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16addl function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16addl.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Returns the bfloat16 produced by fputil::generic::add for two long doubles.
+LLVM_LIBC_FUNCTION(bfloat16, bf16addl, (long double x, long double y)) {
+  return fputil::generic::add<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16sub.cpp b/libc/src/math/generic/bf16sub.cpp
new file mode 100644
index 0000000..65eb209
--- /dev/null
+++ b/libc/src/math/generic/bf16sub.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16sub function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16sub.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Returns the bfloat16 produced by fputil::generic::sub for two doubles.
+LLVM_LIBC_FUNCTION(bfloat16, bf16sub, (double x, double y)) {
+  return fputil::generic::sub<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16subf.cpp b/libc/src/math/generic/bf16subf.cpp
new file mode 100644
index 0000000..6bba4be
--- /dev/null
+++ b/libc/src/math/generic/bf16subf.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16subf function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16subf.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Returns the bfloat16 produced by fputil::generic::sub for two floats.
+LLVM_LIBC_FUNCTION(bfloat16, bf16subf, (float x, float y)) {
+  return fputil::generic::sub<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16subf128.cpp b/libc/src/math/generic/bf16subf128.cpp
new file mode 100644
index 0000000..e5fe107
--- /dev/null
+++ b/libc/src/math/generic/bf16subf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16subf128 function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16subf128.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Returns the bfloat16 produced by fputil::generic::sub for two float128s.
+LLVM_LIBC_FUNCTION(bfloat16, bf16subf128, (float128 x, float128 y)) {
+  return fputil::generic::sub<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16subl.cpp b/libc/src/math/generic/bf16subl.cpp
new file mode 100644
index 0000000..d3a970c
--- /dev/null
+++ b/libc/src/math/generic/bf16subl.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16subl function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16subl.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Returns the bfloat16 produced by fputil::generic::sub for two long doubles.
+LLVM_LIBC_FUNCTION(bfloat16, bf16subl, (long double x, long double y)) {
+  return fputil::generic::sub<bfloat16>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index e066830..1fb175b 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -27,3 +27,16 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  fixedstack_test
+  SUITE
+    libc-support-gpu-tests
+  SRCS
+    fixedstack_test.cpp
+  DEPENDS
+    libc.src.__support.GPU.fixedstack
+  LOADER_ARGS
+    --threads 32
+    --blocks 16 # 32 threads x 16 blocks = 512, the total the test asserts.
+)
diff --git a/libc/test/integration/src/__support/GPU/fixedstack_test.cpp b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
new file mode 100644
index 0000000..fde51df
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/fixedstack_test.cpp
@@ -0,0 +1,44 @@
+//===-- Integration test for the lock-free stack --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/GPU/fixedstack.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+static FixedStack<uint32_t, 2048> global_stack;
+// Test body called from TEST_MAIN: exercises push/pop on global_stack.
+void run() {
+  // We need enough space in the stack as threads in flight can temporarily
+  // consume memory before they finish committing it back to the stack.
+  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);
+
+  uint32_t val = UINT32_MAX; // Defined even if a pop() below fails.
+  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_TRUE(global_stack.push(UINT32_MAX));
+    EXPECT_TRUE(global_stack.pop(val));
+    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+  }
+
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.pop(val));
+  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+
+  // Push one more default value; the loop retries until push() returns true.
+  while (!global_stack.push(UINT32_MAX))
+    ;
+}
+// Entry point of the integration test: calls run() and then returns 0.
+TEST_MAIN(int argc, char **argv, char **envp) {
+  run();
+
+  return 0;
+}
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 43cde0d..a74f9fe 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -2972,6 +2972,118 @@ add_fp_unittest(
     libc.src.__support.macros.properties.types
 )
 
+add_fp_unittest(
+  bf16add_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16add_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.bf16add
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16addf_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16addf_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.bf16addf
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16addl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16addl_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.bf16addl
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16addf128_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16addf128_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.src.math.bf16addf128
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16sub_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16sub_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.math.bf16sub
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16subf_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16subf_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.math.bf16subf
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16subl_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16subl_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.math.bf16subl
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16subf128_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16subf128_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.src.math.bf16subf128
+    libc.src.__support.FPUtil.bfloat16
+)
+
 add_subdirectory(generic)
 add_subdirectory(smoke)
 
diff --git a/libc/test/src/math/bf16add_test.cpp b/libc/test/src/math/bf16add_test.cpp
new file mode 100644
index 0000000..9e9c594
--- /dev/null
+++ b/libc/test/src/math/bf16add_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16add ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16add.h"
+
+LIST_ADD_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16add)
diff --git a/libc/test/src/math/bf16addf128_test.cpp b/libc/test/src/math/bf16addf128_test.cpp
new file mode 100644
index 0000000..46f7ad3
--- /dev/null
+++ b/libc/test/src/math/bf16addf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addf128.h"
+
+LIST_ADD_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16addf128)
diff --git a/libc/test/src/math/bf16addf_test.cpp b/libc/test/src/math/bf16addf_test.cpp
new file mode 100644
index 0000000..06d56cf
--- /dev/null
+++ b/libc/test/src/math/bf16addf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addf.h"
+
+LIST_ADD_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16addf)
diff --git a/libc/test/src/math/bf16addl_test.cpp b/libc/test/src/math/bf16addl_test.cpp
new file mode 100644
index 0000000..bf54827
--- /dev/null
+++ b/libc/test/src/math/bf16addl_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addl --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addl.h"
+
+LIST_ADD_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16addl)
diff --git a/libc/test/src/math/bf16sub_test.cpp b/libc/test/src/math/bf16sub_test.cpp
new file mode 100644
index 0000000..4a793dc
--- /dev/null
+++ b/libc/test/src/math/bf16sub_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16sub ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16sub.h"
+
+LIST_SUB_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16sub)
diff --git a/libc/test/src/math/bf16subf128_test.cpp b/libc/test/src/math/bf16subf128_test.cpp
new file mode 100644
index 0000000..25d6711
--- /dev/null
+++ b/libc/test/src/math/bf16subf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subf128.h"
+
+LIST_SUB_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16subf128)
diff --git a/libc/test/src/math/bf16subf_test.cpp b/libc/test/src/math/bf16subf_test.cpp
new file mode 100644
index 0000000..e8c7440
--- /dev/null
+++ b/libc/test/src/math/bf16subf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subf.h"
+
+LIST_SUB_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16subf)
diff --git a/libc/test/src/math/bf16subl_test.cpp b/libc/test/src/math/bf16subl_test.cpp
new file mode 100644
index 0000000..2997369
--- /dev/null
+++ b/libc/test/src/math/bf16subl_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subl --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subl.h"
+
+LIST_SUB_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16subl)
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 5f497c6..dc1850a 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -5465,3 +5465,131 @@ add_fp_unittest(
     libc.src.__support.macros.properties.os
     libc.src.__support.macros.properties.types
 )
+
+add_fp_unittest(
+  bf16add_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16add_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16add
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16addf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16addf_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16addf
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16addl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16addl_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16addl
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16addf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16addf128_test.cpp
+  HDRS
+    AddTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16addf128
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16sub_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16sub_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16sub
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16subf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16subf_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16subf
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16subl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16subl_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16subl
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
+
+add_fp_unittest(
+  bf16subf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16subf128_test.cpp
+  HDRS
+    SubTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16subf128
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.properties.os
+)
diff --git a/libc/test/src/math/smoke/bf16add_test.cpp b/libc/test/src/math/smoke/bf16add_test.cpp
new file mode 100644
index 0000000..9e9c594
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16add_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16add ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16add.h"
+
+LIST_ADD_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16add)
diff --git a/libc/test/src/math/smoke/bf16addf128_test.cpp b/libc/test/src/math/smoke/bf16addf128_test.cpp
new file mode 100644
index 0000000..46f7ad3
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16addf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addf128.h"
+
+LIST_ADD_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16addf128)
diff --git a/libc/test/src/math/smoke/bf16addf_test.cpp b/libc/test/src/math/smoke/bf16addf_test.cpp
new file mode 100644
index 0000000..06d56cf
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16addf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addf.h"
+
+LIST_ADD_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16addf)
diff --git a/libc/test/src/math/smoke/bf16addl_test.cpp b/libc/test/src/math/smoke/bf16addl_test.cpp
new file mode 100644
index 0000000..bf54827
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16addl_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16addl --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16addl.h"
+
+LIST_ADD_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16addl)
diff --git a/libc/test/src/math/smoke/bf16sub_test.cpp b/libc/test/src/math/smoke/bf16sub_test.cpp
new file mode 100644
index 0000000..4a793dc
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16sub_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16sub ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16sub.h"
+
+LIST_SUB_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16sub)
diff --git a/libc/test/src/math/smoke/bf16subf128_test.cpp b/libc/test/src/math/smoke/bf16subf128_test.cpp
new file mode 100644
index 0000000..25d6711
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16subf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subf128.h"
+
+LIST_SUB_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16subf128)
diff --git a/libc/test/src/math/smoke/bf16subf_test.cpp b/libc/test/src/math/smoke/bf16subf_test.cpp
new file mode 100644
index 0000000..e8c7440
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16subf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subf.h"
+
+LIST_SUB_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16subf)
diff --git a/libc/test/src/math/smoke/bf16subl_test.cpp b/libc/test/src/math/smoke/bf16subl_test.cpp
new file mode 100644
index 0000000..2997369
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16subl_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16subl --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16subl.h"
+
+LIST_SUB_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16subl)
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index ae12a83..57e818c 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -411,6 +411,21 @@ template void explain_binary_operation_one_output_error(
 #endif
 template void explain_binary_operation_one_output_error(
     Operation, const BinaryInput<bfloat16> &, bfloat16, double, RoundingMode);
+template void
+explain_binary_operation_one_output_error(Operation, const BinaryInput<float> &,
+                                          bfloat16, double, RoundingMode);
+template void explain_binary_operation_one_output_error(
+    Operation, const BinaryInput<double> &, bfloat16, double, RoundingMode);
+template void
+explain_binary_operation_one_output_error(Operation,
+                                          const BinaryInput<long double> &,
+                                          bfloat16, double, RoundingMode);
+#if defined(LIBC_TYPES_HAS_FLOAT128) &&                                        \
+    defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+template void explain_binary_operation_one_output_error(
+    Operation, const BinaryInput<float128> &, bfloat16, double, RoundingMode);
+#endif // defined(LIBC_TYPES_HAS_FLOAT128) &&
+       // defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
 
 template <typename InputType, typename OutputType>
 void explain_ternary_operation_one_output_error(
@@ -648,6 +663,26 @@ template bool compare_binary_operation_one_output(Operation,
                                                   const BinaryInput<bfloat16> &,
                                                   bfloat16, double,
                                                   RoundingMode);
+
+template bool compare_binary_operation_one_output(Operation,
+                                                  const BinaryInput<float> &,
+                                                  bfloat16, double,
+                                                  RoundingMode);
+template bool compare_binary_operation_one_output(Operation,
+                                                  const BinaryInput<double> &,
+                                                  bfloat16, double,
+                                                  RoundingMode);
+template bool
+compare_binary_operation_one_output(Operation, const BinaryInput<long double> &,
+                                    bfloat16, double, RoundingMode);
+#if defined(LIBC_TYPES_HAS_FLOAT128) &&                                        \
+    defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+template bool compare_binary_operation_one_output(Operation,
+                                                  const BinaryInput<float128> &,
+                                                  bfloat16, double,
+                                                  RoundingMode);
+#endif // defined(LIBC_TYPES_HAS_FLOAT128) &&
+       // defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
 template <typename InputType, typename OutputType>
 bool compare_ternary_operation_one_output(Operation op,
                                           const TernaryInput<InputType> &input,