53 files changed, 1730 insertions, 776 deletions
diff --git a/libc/src/__support/FPUtil/cast.h b/libc/src/__support/FPUtil/cast.h
index e6fad1b..e999ece 100644
--- a/libc/src/__support/FPUtil/cast.h
+++ b/libc/src/__support/FPUtil/cast.h
@@ -66,9 +66,9 @@ cast(InType x) {
         cpp::max(OutFPBits::FRACTION_LEN, InFPBits::FRACTION_LEN);
     DyadicFloat<cpp::bit_ceil(MAX_FRACTION_LEN)> xd(x);
     return xd.template as<OutType, /*ShouldSignalExceptions=*/true>();
+  } else {
+    return static_cast<OutType>(x);
   }
-
-  return static_cast<OutType>(x);
 }
 
 } // namespace LIBC_NAMESPACE::fputil
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 2b78c4d..bd0a55c 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -266,38 +266,31 @@ struct Slab {
 
   // Randomly walks the bitfield until it finds a free bit. Allocations attempt
   // to put lanes right next to each other for better caching and convergence.
-  void *allocate(uint64_t lane_mask, uint64_t uniform) {
+  void *allocate(uint64_t uniform, uint32_t reserved) {
     uint32_t chunk_size = get_chunk_size();
     uint32_t state = impl::entropy();
 
-    // The uniform mask represents which lanes contain a uniform target pointer.
-    // We attempt to place these next to each other.
-    void *result = nullptr;
-    uint32_t after = ~0u;
-    uint32_t old_index = 0;
-    for (uint64_t mask = lane_mask; mask;
-         mask = gpu::ballot(lane_mask, !result)) {
-      if (result)
-        continue;
-
-      // We try using any known empty bits from the previous attempt first.
-      uint32_t start = gpu::shuffle(
-          mask, cpp::countr_zero(uniform & mask),
-          ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
-                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+    // Try to find the empty bit in the bitfield to finish the allocation. We
+    // start at the number of allocations as this is guaranteed to be available
+    // until the user starts freeing memory.
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t start = gpu::shuffle(
+        lane_mask, cpp::countr_zero(uniform & lane_mask), reserved);
+    for (;;) {
+      uint64_t lane_mask = gpu::get_lane_mask();
 
       // Each lane tries to claim one bit in a single contiguous mask.
-      uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
+      uint32_t id = impl::lane_count(uniform & lane_mask, gpu::get_lane_id());
       uint32_t index = (start + id) % usable_bits(chunk_size);
       uint32_t slot = index / BITS_IN_WORD;
       uint32_t bit = index % BITS_IN_WORD;
 
       // Get the mask of bits destined for the same slot and coalesce it.
       uint32_t leader = impl::get_leader_id(
-          uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+          uniform & gpu::ballot(lane_mask, !id || index % BITS_IN_WORD == 0),
           gpu::get_lane_id());
-      uint32_t length = cpp::popcount(uniform & mask) -
-                        impl::lane_count(uniform & mask, leader);
+      uint32_t length = cpp::popcount(uniform & lane_mask) -
+                        impl::lane_count(uniform & lane_mask, leader);
       uint32_t bitmask =
           static_cast<uint32_t>(
               (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
@@ -307,18 +300,23 @@ struct Slab {
       if (gpu::get_lane_id() == leader)
         before = cpp::AtomicRef(get_bitfield()[slot])
                      .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
-      before = gpu::shuffle(mask, leader, before);
-      if (~before & (1 << bit))
-        result = ptr_from_index(index, chunk_size);
-      else
-        sleep_briefly();
+      before = gpu::shuffle(lane_mask, leader, before);
+      if (~before & (1 << bit)) {
+        cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+        return ptr_from_index(index, chunk_size);
+      }
 
-      after = before | bitmask;
-      old_index = index;
+      // If the previous operation found an empty bit we move there, otherwise
+      // we generate a new random index to start at.
+      uint32_t after = before | bitmask;
+      start = gpu::shuffle(
+          gpu::get_lane_mask(),
+          cpp::countr_zero(uniform & gpu::get_lane_mask()),
+          ~after ? __builtin_align_down(index, BITS_IN_WORD) +
+                       cpp::countr_zero(~after)
+                 : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
+      sleep_briefly();
     }
-
-    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    return result;
   }
 
   // Deallocates memory by resetting its corresponding bit in the bitfield.
@@ -460,11 +458,13 @@ public:
       result->initialize(uniform);
       if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
         finalize(result, cpp::popcount(uniform), count);
+      count =
+          gpu::shuffle(gpu::get_lane_mask(), cpp::countr_zero(uniform), count);
     }
 
     if (!impl::is_sentinel(count))
       count = count - cpp::popcount(uniform) +
-              impl::lane_count(uniform, gpu::get_lane_id()) + 1;
+              impl::lane_count(uniform, gpu::get_lane_id());
 
     return result;
   }
@@ -505,7 +505,8 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S
 
 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
+                       uint32_t &reserved) {
   // We start at the index of the last successful allocation for this kind.
   uint32_t chunk_id = impl::get_chunk_id(chunk_size);
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
@@ -518,7 +519,6 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
     if (!offset ||
         slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
-      uint32_t reserved = 0;
 
       Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
@@ -536,13 +536,13 @@ static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
       // of out-of-memory we receive a sentinel value and return a failure.
-      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
+      if (slab && reserved < Slab::available_chunks(chunk_size) &&
           slab->get_chunk_size() == chunk_size) {
         if (index != start)
           indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
         uniform = uniform & gpu::get_lane_mask();
         return slab;
-      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
+      } else if (slab && (reserved >= Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
@@ -578,12 +578,12 @@ void *allocate(uint64_t size) {
   // Try to find a slab for the rounded up chunk size and allocate from it.
   uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
-  Slab *slab = find_slab(chunk_size, uniform);
-  if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
+  uint32_t reserved = 0;
+  Slab *slab = find_slab(chunk_size, uniform, reserved);
+  if (!slab)
     return nullptr;
 
-  uint64_t lane_mask = gpu::get_lane_mask();
-  void *ptr = slab->allocate(lane_mask, uniform);
+  void *ptr = slab->allocate(uniform, reserved);
   return ptr;
 }
 
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 13f46a1..bbb07b6 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -155,6 +155,95 @@ add_header_library(
 )
 
 add_header_library(
+  asinhf16
+  HDRS
+    asinhf16.h
+  DEPENDS
+    .acoshf_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.FPUtil.sqrt
+    libc.src.__support.macros.config
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
+  atan_utils
+  HDRS
+    atan_utils.h
+  DEPENDS
+    libc.src.__support.integer_literals
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.dyadic_float
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
+  atan
+  HDRS
+    atan.h
+  DEPENDS
+    .atan_utils
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
+  atan2
+  HDRS
+    atan2.h
+  DEPENDS
+    .atan_utils
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
+  atanf
+  HDRS
+    atanf.h
+  DEPENDS
+    .inv_trigf_utils
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
+  atanf16
+  HDRS
+    atanf16.h
+  DEPENDS
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.sqrt
+    libc.src.__support.macros.optimization
+)
+
+add_header_library(
   asinf
   HDRS
     asinf.h
diff --git a/libc/src/__support/math/asinhf16.h b/libc/src/__support/math/asinhf16.h
new file mode 100644
index 0000000..3c5171e
--- /dev/null
+++ b/libc/src/__support/math/asinhf16.h
@@ -0,0 +1,121 @@
+//===-- Implementation header for asinhf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float16 asinhf16(float16 x) {
+  // Computes asinh(x) for a float16 input; intermediates are evaluated in float.
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr size_t N_EXCEPTS = 8;
+
+  constexpr fputil::ExceptValues<float16, N_EXCEPTS> ASINHF16_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+
+      // x = 0x1.da4p-2, asinhf16(x) = 0x1.ca8p-2 (RZ)
+      {0x3769, 0x372a, 1, 0, 1},
+      // x = 0x1.d6cp-1, asinhf16(x) = 0x1.a58p-1 (RZ)
+      {0x3b5b, 0x3a96, 1, 0, 0},
+      // x = 0x1.c7cp+3, asinhf16(x) = 0x1.accp+1 (RZ)
+      {0x4b1f, 0x42b3, 1, 0, 0},
+      // x = 0x1.26cp+4, asinhf16(x) = 0x1.cd8p+1 (RZ)
+      {0x4c9b, 0x4336, 1, 0, 1},
+      // x = -0x1.da4p-2, asinhf16(x) = -0x1.ca8p-2 (RZ)
+      {0xb769, 0xb72a, 0, 1, 1},
+      // x = -0x1.d6cp-1, asinhf16(x) = -0x1.a58p-1 (RZ)
+      {0xbb5b, 0xba96, 0, 1, 0},
+      // x = -0x1.c7cp+3, asinhf16(x) = -0x1.accp+1 (RZ)
+      {0xcb1f, 0xc2b3, 0, 1, 0},
+      // x = -0x1.26cp+4, asinhf16(x) = -0x1.cd8p+1 (RZ)
+      {0xcc9b, 0xc336, 0, 1, 1},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  using namespace acoshf_internal;
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff; // Bit pattern of |x| (sign bit cleared).
+
+  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
+    if (xbits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+
+    return x; // +/-inf and quiet NaN inputs are returned unchanged.
+  }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  // Handle exceptional values
+  if (auto r = ASINHF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  float xf = x;
+  const float SIGN[2] = {1.0f, -1.0f};
+  float x_sign = SIGN[x_u >> 15]; // -1.0f if the sign bit is set.
+
+  // |x| <= 0.25
+  if (LIBC_UNLIKELY(x_abs <= 0x3400)) {
+    // when |x| < 0x1.718p-5, asinhf16(x) = x. Adjust by 1 ULP for certain
+    // rounding types.
+    if (LIBC_UNLIKELY(x_abs < 0x29c6)) {
+      int rounding = fputil::quick_get_round();
+      if ((rounding == FE_UPWARD || rounding == FE_TOWARDZERO) && xf < 0)
+        return fputil::cast<float16>(xf + 0x1p-24f);
+      if ((rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) && xf > 0)
+        return fputil::cast<float16>(xf - 0x1p-24f);
+      return fputil::cast<float16>(xf);
+    }
+
+    float x_sq = xf * xf;
+    // Generated by Sollya with:
+    // > P = fpminimax(asinh(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 2^-2]);
+    // The last coefficient 0x1.bd114ep-6f has been changed to 0x1.bd114ep-5f
+    // for better accuracy.
+    float p = fputil::polyeval(x_sq, 1.0f, -0x1.555552p-3f, 0x1.332f6ap-4f,
+                               -0x1.6c53dep-5f, 0x1.bd114ep-5f);
+
+    return fputil::cast<float16>(xf * p);
+  }
+
+  // General case: asinh(x) = sign(x) * ln(|x| + sqrt(x^2 + 1))
+  float sqrt_term = fputil::sqrt<float>(fputil::multiply_add(xf, xf, 1.0f));
+  return fputil::cast<float16>(
+      x_sign * log_eval(fputil::multiply_add(xf, x_sign, sqrt_term)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF16_H
diff --git a/libc/src/__support/math/atan.h b/libc/src/__support/math/atan.h
new file mode 100644
index 0000000..62190b0
--- /dev/null
+++ b/libc/src/__support/math/atan.h
@@ -0,0 +1,189 @@
+//===-- Implementation header for atan --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATAN_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATAN_H
+
+#include "atan_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+// To compute atan(x), we divided it into the following cases:
+// * |x| < 2^-26:
+//      Since |x| > atan(|x|) > |x| - |x|^3/3, and |x|^3/3 < ulp(x)/2, we simply
+//      return atan(x) = x - sign(x) * epsilon.
+// * 2^-26 <= |x| < 1:
+//      We perform range reduction mod 2^-6 = 1/64 as follow:
+//      Let k = 2^(-6) * round(|x| * 2^6), then
+//        atan(x) = sign(x) * atan(|x|)
+//                = sign(x) * (atan(k) + atan((|x| - k) / (1 + |x|*k))).
+//      We store atan(k) in a look up table, and perform intermediate steps in
+//      double-double.
+// * 1 <= |x| < 2^53:
+//      First we perform the transformation y = 1/|x|:
+//        atan(x) = sign(x) * (pi/2 - atan(1/|x|))
+//                = sign(x) * (pi/2 - atan(y)).
+//      Then we compute atan(y) using range reduction mod 2^-6 = 1/64 as the
+//      previous case:
+//      Let k = 2^(-6) * round(y * 2^6), then
+//        atan(y) = atan(k) + atan((y - k) / (1 + y*k))
+//                = atan(k) + atan((1/|x| - k) / (1 + k/|x|))
+//                = atan(k) + atan((1 - k*|x|) / (|x| + k)).
+// * |x| >= 2^53:
+//      Using the reciprocal transformation:
+//        atan(x) = sign(x) * (pi/2 - atan(1/|x|)).
+//      We have that:
+//        atan(1/|x|) <= 1/|x| <= 2^-53,
+//      which is smaller than ulp(pi/2) / 2.
+//      So we can return:
+//        atan(x) = sign(x) * (pi/2 - epsilon)
+
+LIBC_INLINE static constexpr double atan(double x) {
+  // Computes atan(x) for double x, following the case analysis above.
+  using namespace atan_internal;
+  using FPBits = fputil::FPBits<double>;
+
+  constexpr double IS_NEG[2] = {1.0, -1.0}; // Sign factor, indexed by x_sign.
+  constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
+                                      0x1.921fb54442d18p0};
+  constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
+                                       -0x1.921fb54442d18p0};
+
+  FPBits xbits(x);
+  bool x_sign = xbits.is_neg();
+  xbits = xbits.abs();
+  uint64_t x_abs = xbits.uintval();
+  int x_exp =
+      static_cast<int>(x_abs >> FPBits::FRACTION_LEN) - FPBits::EXP_BIAS;
+
+  // |x| < 1.
+  if (x_exp < 0) {
+    if (LIBC_UNLIKELY(x_exp < -26)) {
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return x;
+#else
+      if (x == 0.0)
+        return x;
+      // |x| < 2^-26
+      return fputil::multiply_add(-0x1.0p-54, x, x);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+    }
+
+    double x_d = xbits.get_val();
+    // k = 2^-6 * round(2^6 * |x|)
+    double k = fputil::nearest_integer(0x1.0p6 * x_d);
+    unsigned idx = static_cast<unsigned>(k);
+    k *= 0x1.0p-6;
+
+    // numerator = |x| - k
+    DoubleDouble num, den;
+    num.lo = 0.0;
+    num.hi = x_d - k;
+
+    // denominator = 1 + k * |x|
+    den.hi = fputil::multiply_add(x_d, k, 1.0);
+    DoubleDouble prod = fputil::exact_mult(x_d, k);
+    // Using Dekker's 2SUM algorithm to compute the lower part.
+    den.lo = ((1.0 - den.hi) + prod.hi) + prod.lo;
+
+    // x_r = (|x| - k) / (1 + k * |x|)
+    DoubleDouble x_r = fputil::div(num, den);
+
+    // Approximating atan(x_r) using Taylor polynomial.
+    DoubleDouble p = atan_eval(x_r);
+
+    // atan(x) = sign(x) * (atan(k) + atan(x_r))
+    //         = sign(x) * (atan(k) + atan( (|x| - k) / (1 + k * |x|) ))
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+    return IS_NEG[x_sign] * (ATAN_I[idx].hi + (p.hi + (p.lo + ATAN_I[idx].lo)));
+#else
+
+    DoubleDouble c0 = fputil::exact_add(ATAN_I[idx].hi, p.hi);
+    double c1 = c0.lo + (ATAN_I[idx].lo + p.lo);
+    double r = IS_NEG[x_sign] * (c0.hi + c1);
+
+    return r;
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  }
+
+  // |x| >= 2^53 or x is NaN.
+  if (LIBC_UNLIKELY(x_exp >= 53)) {
+    // x is nan
+    if (xbits.is_nan()) {
+      if (xbits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+      return x;
+    }
+    // |x| >= 2^53
+    // atan(x) ~ sign(x) * pi/2.
+    if (x_exp >= 53) // Always true here: the NaN cases returned above.
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return IS_NEG[x_sign] * PI_OVER_2.hi;
+#else
+      return fputil::multiply_add(IS_NEG[x_sign], PI_OVER_2.hi,
+                                  IS_NEG[x_sign] * PI_OVER_2.lo);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  }
+
+  double x_d = xbits.get_val();
+  double y = 1.0 / x_d;
+
+  // k = 2^-6 * round(2^6 / |x|)
+  double k = fputil::nearest_integer(0x1.0p6 * y);
+  unsigned idx = static_cast<unsigned>(k);
+  k *= 0x1.0p-6;
+
+  // denominator = |x| + k
+  DoubleDouble den = fputil::exact_add(x_d, k);
+  // numerator = 1 - k * |x|
+  DoubleDouble num;
+  num.hi = fputil::multiply_add(-x_d, k, 1.0);
+  DoubleDouble prod = fputil::exact_mult(x_d, k);
+  // Using Dekker's 2SUM algorithm to compute the lower part.
+  num.lo = ((1.0 - num.hi) - prod.hi) - prod.lo;
+
+  // x_r = (1/|x| - k) / (1 + k/|x|)
+  //     = (1 - k * |x|) / (|x| + k)
+  DoubleDouble x_r = fputil::div(num, den);
+
+  // Approximating atan(x_r) using Taylor polynomial.
+  DoubleDouble p = atan_eval(x_r);
+
+  // atan(x) = sign(x) * (pi/2 - atan(1/|x|))
+  //         = sign(x) * (pi/2 - atan(k) - atan(x_r))
+  //         = (-sign(x)) * (-pi/2 + atan(k) + atan((1 - k*|x|)/(|x| + k)))
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  double lo_part = p.lo + ATAN_I[idx].lo + MPI_OVER_2.lo;
+  return IS_NEG[!x_sign] * (MPI_OVER_2.hi + ATAN_I[idx].hi + (p.hi + lo_part));
+#else
+  DoubleDouble c0 = fputil::exact_add(MPI_OVER_2.hi, ATAN_I[idx].hi);
+  DoubleDouble c1 = fputil::exact_add(c0.hi, p.hi);
+  double c2 = c1.lo + (c0.lo + p.lo) + (ATAN_I[idx].lo + MPI_OVER_2.lo);
+
+  double r = IS_NEG[!x_sign] * (c1.hi + c2);
+
+  return r;
+#endif
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ATAN_H
diff --git a/libc/src/__support/math/atan2.h b/libc/src/__support/math/atan2.h
new file mode 100644
index 0000000..90ed926
--- /dev/null
+++ b/libc/src/__support/math/atan2.h
@@ -0,0 +1,209 @@
+//===-- Implementation header for atan2 -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATAN2_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATAN2_H
+
+#include "atan_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+// There are several range reduction steps we can take for atan2(y, x) as
+// follow:
+
+// * Range reduction 1: signedness
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// formed by the Ox axis and the vector (x, y) on the Oxy-plane.
+// In particular, we have that:
+//   atan2(y, x) = atan( y/x )         if x >= 0 and y >= 0 (I-quadrant)
+//               = pi + atan( y/x )    if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( y/x )   if x < 0 and y < 0   (III-quadrant)
+//               = atan( y/x )         if x >= 0 and y < 0  (IV-quadrant)
+// Since atan function is odd, we can use the formula:
+//   atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+//   atan2(y, x) = atan( |y|/|x| )         if x >= 0 and y >= 0 (I-quadrant)
+//               = pi - atan( |y|/|x| )    if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( |y|/|x| )   if x < 0 and y < 0   (III-quadrant)
+//               = -atan( |y|/|x| )        if x >= 0 and y < 0  (IV-quadrant)
+// Which can be simplified to:
+//   atan2(y, x) = sign(y) * atan( |y|/|x| )             if x >= 0
+//               = sign(y) * (pi - atan( |y|/|x| ))      if x < 0
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+//   atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follow:
+//   atan2(y, x) = sign(y) * atan( |y|/|x|)            if 0 <= |y| <= x
+//               = sign(y) * (pi/2 - atan( |x|/|y| ))  if 0 <= x < |y|
+//               = sign(y) * (pi - atan( |y|/|x| ))    if 0 <= |y| <= -x
+//               = sign(y) * (pi/2 + atan( |x|/|y| ))  if 0 <= -x < |y|
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+//   atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree.  To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+//   | n/d - idx/64 | <= 1/128.
+// In particular,
+//   idx := round(2^6 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+//   atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
+// For the accurate pass, we use the addition formula:
+//   atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
+//                                = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
+// And for the fast pass, we use degree-9 Taylor polynomial to compute the RHS:
+//   atan(u) ~ P(u) = u - u^3/3 + u^5/5 - u^7/7 + u^9/9
+// with absolute errors bounded by:
+//   |atan(u) - P(u)| < |u|^11 / 11 < 2^-80
+// and relative errors bounded by:
+//   |(atan(u) - P(u)) / P(u)| < u^10 / 11 < 2^-73.
+
+LIBC_INLINE static constexpr double atan2(double y, double x) {
+  using namespace atan_internal;
+  using FPBits = fputil::FPBits<double>;
+
+  constexpr double IS_NEG[2] = {1.0, -1.0};
+  constexpr DoubleDouble ZERO = {0.0, 0.0};
+  constexpr DoubleDouble MZERO = {-0.0, -0.0};
+  constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p+1};
+  constexpr DoubleDouble MPI = {-0x1.1a62633145c07p-53, -0x1.921fb54442d18p+1};
+  constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
+                                      0x1.921fb54442d18p0};
+  constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
+                                       -0x1.921fb54442d18p0};
+  constexpr DoubleDouble PI_OVER_4 = {0x1.1a62633145c07p-55,
+                                      0x1.921fb54442d18p-1};
+  constexpr DoubleDouble THREE_PI_OVER_4 = {0x1.a79394c9e8a0ap-54,
+                                            0x1.2d97c7f3321d2p+1};
+  // Adjustment for constant term:
+  //   CONST_ADJ[x_sign][y_sign][recip]
+  constexpr DoubleDouble CONST_ADJ[2][2][2] = {
+      {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+      {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
+
+  FPBits x_bits(x), y_bits(y);
+  bool x_sign = x_bits.sign().is_neg();
+  bool y_sign = y_bits.sign().is_neg();
+  x_bits = x_bits.abs();
+  y_bits = y_bits.abs();
+  uint64_t x_abs = x_bits.uintval();
+  uint64_t y_abs = y_bits.uintval();
+  bool recip = x_abs < y_abs; // True when |x| < |y|, i.e. num = |x|.
+  uint64_t min_abs = recip ? x_abs : y_abs;
+  uint64_t max_abs = !recip ? x_abs : y_abs;
+  unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+  unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+  double num = FPBits(min_abs).get_val();
+  double den = FPBits(max_abs).get_val();
+
+  // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
+  // overflow, or close to underflow.
+  if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) {
+    if (x_bits.is_nan() || y_bits.is_nan()) {
+      if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
+        fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    unsigned x_except = x == 0.0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+    unsigned y_except = y == 0.0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+    // Exceptional cases:
+    //   EXCEPTS[y_except][x_except][x_is_neg]
+    // with x_except & y_except:
+    //   0: zero
+    //   1: finite, non-zero
+    //   2: infinity
+    constexpr DoubleDouble EXCEPTS[3][3][2] = {
+        {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2},
+         {PI_OVER_2, PI_OVER_2},
+         {PI_OVER_4, THREE_PI_OVER_4}},
+    };
+
+    if ((x_except != 1) || (y_except != 1)) {
+      DoubleDouble r = EXCEPTS[y_except][x_except][x_sign];
+      return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+    }
+    bool scale_up = min_exp < 128U;
+    bool scale_down = max_exp > 0x7ffU - 128U;
+    // The smaller input is tiny or denormal: scale the numerator up by 2^64,
+    // and the denominator as well unless it is close to overflow.
+    if (scale_up) {
+      num *= 0x1.0p64;
+      if (!scale_down)
+        den *= 0x1.0p64;
+    } else if (scale_down) {
+      den *= 0x1.0p-64;
+      if (!scale_up) // Always true in this else-branch.
+        num *= 0x1.0p-64;
+    }
+
+    min_abs = FPBits(num).uintval();
+    max_abs = FPBits(den).uintval();
+    min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+    max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+  }
+
+  double final_sign = IS_NEG[(x_sign != y_sign) != recip];
+  DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip];
+  unsigned exp_diff = max_exp - min_exp;
+  // We have the following bound for normalized n and d:
+  //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
+  if (LIBC_UNLIKELY(exp_diff > 54)) {
+    return fputil::multiply_add(final_sign, const_term.hi,
+                                final_sign * (const_term.lo + num / den));
+  }
+
+  double k = fputil::nearest_integer(64.0 * num / den);
+  unsigned idx = static_cast<unsigned>(k);
+  // k = idx / 64
+  k *= 0x1.0p-6;
+
+  // Range reduction:
+  // atan(n/d) - atan(k) = atan((n/d - k) / (1 + (n/d) * k))
+  //                     = atan((n - d * k) / (d + n * k))
+  DoubleDouble num_k = fputil::exact_mult(num, k);
+  DoubleDouble den_k = fputil::exact_mult(den, k);
+
+  // num_dd = n - d * k
+  DoubleDouble num_dd = fputil::exact_add(num - den_k.hi, -den_k.lo);
+  // den_dd = d + n * k
+  DoubleDouble den_dd = fputil::exact_add(den, num_k.hi);
+  den_dd.lo += num_k.lo;
+
+  // q = (n - d * k) / (d + n * k)
+  DoubleDouble q = fputil::div(num_dd, den_dd);
+  // p ~ atan(q)
+  DoubleDouble p = atan_eval(q);
+
+  DoubleDouble r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+  r.hi *= final_sign;
+  r.lo *= final_sign;
+
+  return r.hi + r.lo;
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ATAN2_H
diff --git a/libc/src/math/generic/atan_utils.h b/libc/src/__support/math/atan_utils.h
index 24c7271..9e8d7d6 100644
--- a/libc/src/math/generic/atan_utils.h
+++ b/libc/src/__support/math/atan_utils.h
@@ -18,7 +18,7 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-namespace {
+namespace atan_internal {
 
 using DoubleDouble = fputil::DoubleDouble;
 using Float128 = fputil::DyadicFloat<128>;
@@ -29,7 +29,7 @@ using Float128 = fputil::DyadicFloat<128>;
 //     b = round(atan(i/64) - a, D, RN);
 //     print("{", b, ",", a, "},");
 //   };
-constexpr DoubleDouble ATAN_I[65] = {
+static constexpr DoubleDouble ATAN_I[65] = {
     {0.0, 0.0},
     {-0x1.220c39d4dff5p-61, 0x1.fff555bbb729bp-7},
     {-0x1.5ec431444912cp-60, 0x1.ffd55bba97625p-6},
@@ -110,7 +110,8 @@ constexpr DoubleDouble ATAN_I[65] = {
 //        + x_lo * (1 - x_hi^2 + x_hi^4)
 // Since p.lo is ~ x^3/3, the relative error from rounding is bounded by:
 //   |(atan(x) - P(x))/atan(x)| < ulp(x^2) <= 2^(-14-52) = 2^-66.
-[[maybe_unused]] DoubleDouble atan_eval(const DoubleDouble &x) {
+[[maybe_unused]] LIBC_INLINE static DoubleDouble
+atan_eval(const DoubleDouble &x) {
   DoubleDouble p;
   p.hi = x.hi;
   double x_hi_sq = x.hi * x.hi;
@@ -142,7 +143,7 @@ constexpr DoubleDouble ATAN_I[65] = {
 //     b = 2^ll + a;
 //     print("{Sign::POS, ", 2^(ll - 128), ",", b, "},");
 // };
-constexpr Float128 ATAN_I_F128[65] = {
+static constexpr Float128 ATAN_I_F128[65] = {
     {Sign::POS, 0, 0_u128},
     {Sign::POS, -134, 0xfffaaadd'db94d5bb'e78c5640'15f76048_u128},
     {Sign::POS, -133, 0xffeaaddd'4bb12542'779d776d'da8c6214_u128},
@@ -215,7 +216,7 @@ constexpr Float128 ATAN_I_F128[65] = {
 //                 [0, 2^-7]);
 // > dirtyinfnorm(atan(x) - P, [0, 2^-7]);
 // 0x1.26016ad97f323875760f869684c0898d7b7bb8bep-122
-constexpr Float128 ATAN_POLY_F128[] = {
+static constexpr Float128 ATAN_POLY_F128[] = {
     {Sign::NEG, -129, 0xaaaaaaaa'aaaaaaaa'aaaaaaa6'003c5d1d_u128},
     {Sign::POS, -130, 0xcccccccc'cccccccc'cca00232'8776b063_u128},
     {Sign::NEG, -130, 0x92492492'49249201'27f5268a'cb24aec0_u128},
@@ -225,7 +226,8 @@ constexpr Float128 ATAN_POLY_F128[] = {
 };
 
 // Approximate atan for |x| <= 2^-7.
-[[maybe_unused]] Float128 atan_eval(const Float128 &x) {
+[[maybe_unused]] LIBC_INLINE static constexpr Float128
+atan_eval(const Float128 &x) {
   Float128 x_sq = fputil::quick_mul(x, x);
   Float128 x3 = fputil::quick_mul(x, x_sq);
   Float128 p = fputil::polyeval(x_sq, ATAN_POLY_F128[0], ATAN_POLY_F128[1],
@@ -234,7 +236,7 @@ constexpr Float128 ATAN_POLY_F128[] = {
   return fputil::multiply_add(x3, p, x);
 }
 
-} // anonymous namespace
+} // namespace atan_internal
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/__support/math/atanf.h b/libc/src/__support/math/atanf.h
new file mode 100644
index 0000000..92799dc
--- /dev/null
+++ b/libc/src/__support/math/atanf.h
@@ -0,0 +1,129 @@
+//===-- Implementation header for atanf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANF_H
+
+#include "inv_trigf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float atanf(float x) {
+  using namespace inv_trigf_utils_internal;
+  using FPBits = typename fputil::FPBits<float>;
+
+  constexpr double FINAL_SIGN[2] = {1.0, -1.0};
+  constexpr double SIGNED_PI_OVER_2[2] = {0x1.921fb54442d18p0,
+                                          -0x1.921fb54442d18p0};
+
+  FPBits x_bits(x);
+  Sign sign = x_bits.sign();
+  x_bits.set_sign(Sign::POS); // x_bits now holds |x|
+  uint32_t x_abs = x_bits.uintval();
+
+  // x is inf or nan, |x| <= 2^-4 or |x| >= 16.
+  if (LIBC_UNLIKELY(x_abs <= 0x3d80'0000U || x_abs >= 0x4180'0000U)) {
+    double x_d = static_cast<double>(x);
+    double const_term = 0.0;
+    if (LIBC_UNLIKELY(x_abs >= 0x4180'0000)) {
+      // atan(+-Inf) = +-pi/2.
+      if (x_bits.is_inf()) {
+        volatile double sign_pi_over_2 = SIGNED_PI_OVER_2[sign.is_neg()];
+        return static_cast<float>(sign_pi_over_2);
+      }
+      if (x_bits.is_nan())
+        return x;
+      // |x| >= 16: atan(x) = sign(x) * pi/2 + atan(-1/x)
+      x_d = -1.0 / x_d;
+      const_term = SIGNED_PI_OVER_2[sign.is_neg()];
+    }
+    // From here on |x_d| <= 1/16: x_d is either x or -1/x.
+    if (LIBC_UNLIKELY(x_bits.is_zero()))
+      return x;
+    // |x| < 2^-12: return x - x * 2^-25.
+    if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
+      return fputil::multiply_add(x, -0x1.0p-25f, x);
+#else
+      double x_d = static_cast<double>(x);
+      return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+    }
+    // Use Taylor polynomial:
+    //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
+    constexpr double ATAN_TAYLOR[6] = {
+        0x1.0000000000000p+0,  -0x1.5555555555555p-2, 0x1.999999999999ap-3,
+        -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4,  -0x1.745d1745d1746p-4,
+    };
+    double x2 = x_d * x_d;
+    double x4 = x2 * x2;
+    double c0 = fputil::multiply_add(x2, ATAN_TAYLOR[1], ATAN_TAYLOR[0]);
+    double c1 = fputil::multiply_add(x2, ATAN_TAYLOR[3], ATAN_TAYLOR[2]);
+    double c2 = fputil::multiply_add(x2, ATAN_TAYLOR[5], ATAN_TAYLOR[4]);
+    double p = fputil::polyeval(x4, c0, c1, c2);
+    double r = fputil::multiply_add(x_d, p, const_term);
+    return static_cast<float>(r);
+  }
+
+  // Range reduction steps:
+  // 1)  atan(x) = sign(x) * atan(|x|)
+  // 2)  If |x| > 1, atan(|x|) = pi/2 - atan(1/|x|)
+  // 3)  For 1/16 < x <= 1, we find k such that: |x - k/16| <= 1/32.
+  // 4)  Then we use polynomial approximation:
+  //   atan(x) ~ atan(k/16) + (x - k/16) * Q(x - k/16)
+  //           = P(x - k/16)
+  double x_d = 0, const_term = 0, final_sign = 0;
+  int idx = 0;
+
+  if (x_abs > 0x3f80'0000U) {
+    // |x| > 1, we need to invert x, so we will perform range reduction in
+    // double precision.
+    x_d = 1.0 / static_cast<double>(x_bits.get_val());
+    double k_d = fputil::nearest_integer(x_d * 0x1.0p4);
+    x_d = fputil::multiply_add(k_d, -0x1.0p-4, x_d);
+    idx = static_cast<int>(k_d);
+    final_sign = FINAL_SIGN[sign.is_pos()]; // -1 for x > 0, +1 for x < 0
+    // Adjust constant term of the polynomial by +- pi/2.
+    const_term = fputil::multiply_add(final_sign, ATAN_COEFFS[idx][0],
+                                      SIGNED_PI_OVER_2[sign.is_neg()]);
+  } else {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3d8d'6b23U)) { // |x| = 0x1.1ad646p-4
+      return sign.is_pos() ? fputil::round_result_slightly_down(0x1.1a6386p-4f)
+                           : fputil::round_result_slightly_up(-0x1.1a6386p-4f);
+    }
+    // Perform range reduction in single precision.
+    float x_f = x_bits.get_val();
+    float k_f = fputil::nearest_integer(x_f * 0x1.0p4f);
+    x_f = fputil::multiply_add(k_f, -0x1.0p-4f, x_f);
+    x_d = static_cast<double>(x_f);
+    idx = static_cast<int>(k_f);
+    final_sign = FINAL_SIGN[sign.is_neg()]; // +1 for x > 0, -1 for x < 0
+    const_term = final_sign * ATAN_COEFFS[idx][0];
+  }
+
+  double p = atan_eval(x_d, idx);
+  double r = fputil::multiply_add(final_sign * x_d, p, const_term);
+
+  return static_cast<float>(r);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ATANF_H
diff --git a/libc/src/__support/math/atanf16.h b/libc/src/__support/math/atanf16.h
new file mode 100644
index 0000000..f75d145
--- /dev/null
+++ b/libc/src/__support/math/atanf16.h
@@ -0,0 +1,119 @@
+//===-- Implementation header for atanf16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float16 atanf16(float16 x) {
+  // Generated by Sollya using the following command:
+  // > round(pi/2, SG, RN);
+  constexpr float PI_2 = 0x1.921fb6p0;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr size_t N_EXCEPTS = 6;
+
+  constexpr fputil::ExceptValues<float16, N_EXCEPTS> ATANF16_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      {0x2745, 0x2744, 1, 0, 1},
+      {0x3099, 0x3090, 1, 0, 1},
+      {0x3c6c, 0x3aae, 1, 0, 1},
+      {0x466e, 0x3daa, 1, 0, 1},
+      {0x48ae, 0x3ddb, 1, 0, 0},
+      {0x5619, 0x3e3d, 1, 0, 1},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+  bool x_sign = x_u >> 15; // true when the sign bit of x is set
+  float sign = (x_sign ? -1.0 : 1.0);
+
+  // x is +/-inf or NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+    if (xbits.is_nan()) {
+      if (xbits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+      return x;
+    }
+
+    // atanf16(+/-inf) = +/-pi/2
+    return fputil::cast<float16>(sign * PI_2);
+  }
+
+  float xf = x;
+  float xsq = xf * xf;
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  // Handle exceptional values (table is keyed on |x|; sign passed separately)
+  if (auto r = ATANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
+      LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif
+
+  // |x| <= 1 (0x3c00 is 1.0 in float16)
+  if (x_abs <= 0x3c00) {
+    // atanf16(+/-0) = +/-0
+    if (LIBC_UNLIKELY(x_abs == 0))
+      return x;
+
+    // Degree-14 even minimax polynomial of atan(x)/x generated by Sollya with:
+    // > P = fpminimax(atan(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|SG...|],
+    // [0, 1]);
+    float result = fputil::polyeval(
+        xsq, 0x1.fffffcp-1f, -0x1.55519ep-2f, 0x1.98f6a8p-3f, -0x1.1f0a92p-3f,
+        0x1.95b654p-4f, -0x1.e65492p-5f, 0x1.8c0c36p-6f, -0x1.32316ep-8f);
+    return fputil::cast<float16>(xf * result);
+  }
+
+  // If |x| > 1
+  // y = atan(x) = sign(x) * atan(|x|)
+  // atan(|x|) = pi/2 - atan(1/|x|)
+  // Recall, 1/|x| < 1
+  float x_inv_sq = 1.0f / xsq;
+  float x_inv = fputil::sqrt<float>(x_inv_sq); // 1/|x|
+
+  // Degree-14 even minimax polynomial of atan(x)/x generated by Sollya with:
+  // > P = fpminimax(atan(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|SG...|],
+  // [0, 1]);
+  float interm =
+      fputil::polyeval(x_inv_sq, 0x1.fffffcp-1f, -0x1.55519ep-2f,
+                       0x1.98f6a8p-3f, -0x1.1f0a92p-3f, 0x1.95b654p-4f,
+                       -0x1.e65492p-5f, 0x1.8c0c36p-6f, -0x1.32316ep-8f);
+
+  return fputil::cast<float16>(sign *
+                               fputil::multiply_add(x_inv, -interm, PI_2));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ATANF16_H
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index cbb7886..14aaad2 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -124,3 +124,14 @@ add_object_library(
     libc.src.__support.threads.linux.raw_mutex
     libc.src.__support.CPP.mutex
 )
+
+add_object_library(
+  barrier
+  HDRS
+    barrier.h
+  SRCS
+    barrier.cpp
+  DEPENDS
+    libc.src.__support.threads.CndVar
+    libc.src.__support.threads.mutex
+)
diff --git a/libc/src/__support/threads/linux/barrier.cpp b/libc/src/__support/threads/linux/barrier.cpp
new file mode 100644
index 0000000..cf7207b5
--- /dev/null
+++ b/libc/src/__support/threads/linux/barrier.cpp
@@ -0,0 +1,85 @@
+//===-- Implementation of Barrier class -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/threads/linux/barrier.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/threads/CndVar.h"
+#include "src/__support/threads/mutex.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int Barrier::init(Barrier *b,
+                  [[maybe_unused]] const pthread_barrierattr_t *attr,
+                  unsigned count) {
+  LIBC_ASSERT(attr == nullptr); // TODO: implement barrierattr support
+  if (count == 0)
+    return EINVAL; // a barrier with zero participants is rejected
+
+  b->expected = count;
+  b->waiting = 0;
+  b->blocking = true; // start in the "collecting arrivals" phase
+
+  int err;
+  err = CndVar::init(&b->entering);
+  if (err != 0)
+    return err;
+
+  err = CndVar::init(&b->exiting);
+  if (err != 0)
+    return err; // NOTE(review): b->entering is not destroyed on this path
+
+  auto mutex_err = Mutex::init(&b->m, false, false, false, false);
+  if (mutex_err != MutexError::NONE)
+    return EAGAIN; // NOTE(review): both CndVars stay initialized here
+
+  return 0;
+}
+
+int Barrier::wait() {
+  m.lock();
+
+  // If the barrier is still releasing the previous round, wait for it to drain.
+  while (!blocking)
+    entering.wait(&m);
+  waiting++;
+
+  if (waiting < expected) {
+    // Not the last arrival: block until the last one clears `blocking`.
+    while (blocking)
+      exiting.wait(&m);
+  } else {
+    // This is the last thread to call wait(), so let's wake everyone up.
+    blocking = false;
+    exiting.broadcast();
+  }
+  waiting--;
+
+  if (waiting == 0) {
+    // All threads have left the barrier: re-arm it and release the threads
+    // that were held back from entering the next round.
+    blocking = true;
+    entering.broadcast();
+    m.unlock();
+
+    // POSIX dictates that the barrier should return a special value to just one
+    // thread, so we arbitrarily choose the last thread to leave.
+    return PTHREAD_BARRIER_SERIAL_THREAD;
+  }
+  m.unlock();
+
+  return 0;
+}
+
+int Barrier::destroy(Barrier *b) {
+  CndVar::destroy(&b->entering);
+  CndVar::destroy(&b->exiting);
+  Mutex::destroy(&b->m);
+  return 0; // always reports success; no check for threads still in wait()
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/threads/linux/barrier.h b/libc/src/__support/threads/linux/barrier.h
new file mode 100644
index 0000000..f0655bf
--- /dev/null
+++ b/libc/src/__support/threads/linux/barrier.h
@@ -0,0 +1,50 @@
+//===-- A platform independent abstraction layer for barriers --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_BARRIER_H
+#define LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_BARRIER_H
+
+#include "hdr/pthread_macros.h"
+#include "include/llvm-libc-types/pthread_barrier_t.h"
+#include "include/llvm-libc-types/pthread_barrierattr_t.h"
+#include "src/__support/threads/CndVar.h"
+#include "src/__support/threads/mutex.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// NOTE: if the size of this class changes, you must ensure that the size of
+// pthread_barrier_t (found in include/llvm-libc-types/pthread_barrier_t.h) is
+// the same size
+class Barrier {
+private:
+  unsigned expected; // number of wait() calls needed to release a round
+  unsigned waiting;  // threads currently counted inside wait()
+  bool blocking;     // true while collecting arrivals, false while draining
+  CndVar entering;   // signalled when a drained barrier is re-armed
+  CndVar exiting;    // signalled when the last expected thread arrives
+  Mutex m;           // held by wait() while it touches the fields above
+
+public:
+  static int init(Barrier *b, const pthread_barrierattr_t *attr,
+                  unsigned count);
+  static int destroy(Barrier *b);
+  int wait();
+};
+
+static_assert(
+    sizeof(Barrier) == sizeof(pthread_barrier_t),
+    "The public pthread_barrier_t type cannot accommodate the internal "
+    "barrier type.");
+
+static_assert(alignof(Barrier) == alignof(pthread_barrier_t),
+              "The public pthread_barrier_t type has a different alignment "
+              "than the internal barrier type.");
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_BARRIER_H
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index e363ad3..aed1d53 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -60,14 +60,31 @@ add_object_library(
   SRCS
     mbrtowc.cpp
   DEPENDS
-  libc.hdr.errno_macros
-  libc.hdr.types.wchar_t
-  libc.hdr.types.size_t
-  libc.src.__support.common
-  libc.src.__support.error_or
-  libc.src.__support.macros.config
-  .character_converter
-  .mbstate
+    libc.hdr.errno_macros
+    libc.hdr.types.wchar_t
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.macros.config
+    .character_converter
+    .mbstate
+)
+
+add_header_library(
+  mbsnrtowcs
+  HDRS
+    mbsnrtowcs.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.types.wchar_t
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.macros.config
+    libc.src.__support.macros.null_check
+    .character_converter
+    .mbstate
+    .string_converter
 )
 
 add_header_library(
diff --git a/libc/src/__support/wchar/mbsnrtowcs.h b/libc/src/__support/wchar/mbsnrtowcs.h
new file mode 100644
index 0000000..54e3152
--- /dev/null
+++ b/libc/src/__support/wchar/mbsnrtowcs.h
@@ -0,0 +1,66 @@
+//===-- Implementation for mbsnrtowcs function ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSNRTOWCS_H
+#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSNRTOWCS_H
+
+#include "hdr/errno_macros.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/string_converter.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+LIBC_INLINE static ErrorOr<size_t> mbsnrtowcs(wchar_t *__restrict dst,
+                                              const char **__restrict src,
+                                              size_t nmc, size_t len,
+                                              mbstate *__restrict ps) {
+  LIBC_CRASH_ON_NULLPTR(src);
+  // Reject an invalid mbstate with EINVAL before converting anything.
+  CharacterConverter char_conv(ps);
+  if (!char_conv.isValidState())
+    return Error(EINVAL);
+
+  StringConverter<char8_t> str_conv(reinterpret_cast<const char8_t *>(*src), ps,
+                                    len, nmc);
+  size_t dst_idx = 0;
+  ErrorOr<char32_t> converted = str_conv.popUTF32();
+  while (converted.has_value()) {
+    if (dst != nullptr) // a null dst means "count only": nothing is stored
+      dst[dst_idx] = converted.value();
+    // The null terminator is stored but not counted in the return value.
+    if (converted.value() == L'\0') {
+      if (dst != nullptr)
+        *src = nullptr; // the terminator was reached
+      return dst_idx;
+    }
+    dst_idx++;
+    converted = str_conv.popUTF32();
+  }
+
+  if (converted.error() == -1) { // -1: hit the len/nmc conversion limit
+    if (dst != nullptr)
+      *src += str_conv.getSourceIndex();
+    return dst_idx;
+  }
+
+  return Error(converted.error());
+}
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBSNRTOWCS_H
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 455ad34..0522e0e 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -189,6 +189,7 @@ add_math_entrypoint_object(fabsf)
 add_math_entrypoint_object(fabsl)
 add_math_entrypoint_object(fabsf16)
 add_math_entrypoint_object(fabsf128)
+add_math_entrypoint_object(fabsbf16)
 
 add_math_entrypoint_object(fadd)
 add_math_entrypoint_object(faddl)
diff --git a/libc/src/math/fabsbf16.h b/libc/src/math/fabsbf16.h
new file mode 100644
index 0000000..4993668
--- /dev/null
+++ b/libc/src/math/fabsbf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for fabsbf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FABSBF16_H
+#define LLVM_LIBC_SRC_MATH_FABSBF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+bfloat16 fabsbf16(bfloat16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FABSBF16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index f91feacb..6bcb1e2 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -698,6 +698,19 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  fabsbf16
+  SRCS
+    fabsbf16.cpp
+  HDRS
+    ../fabsbf16.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
   fadd
   SRCS
     fadd.cpp
@@ -3899,18 +3912,7 @@ add_entrypoint_object(
   HDRS
     ../asinhf16.h
   DEPENDS
-    .explogxf
-    libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.cast
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.FPUtil.sqrt
-    libc.src.__support.macros.optimization
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.asinhf16
 )
 
 add_entrypoint_object(
@@ -4018,19 +4020,6 @@ add_entrypoint_object(
     libc.src.errno.errno
 )
 
-add_header_library(
-  atan_utils
-  HDRS
-    atan_utils.h
-  DEPENDS
-    libc.src.__support.integer_literals
-    libc.src.__support.FPUtil.double_double
-    libc.src.__support.FPUtil.dyadic_float
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.macros.optimization
-)
-
 add_entrypoint_object(
   atanf
   SRCS
@@ -4038,14 +4027,7 @@ add_entrypoint_object(
   HDRS
     ../atanf.h
   DEPENDS
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.macros.optimization
-    libc.src.__support.math.inv_trigf_utils
+    libc.src.__support.math.atanf
 )
 
 add_entrypoint_object(
@@ -4055,17 +4037,7 @@ add_entrypoint_object(
   HDRS
     ../atanf16.h
   DEPENDS
-    libc.hdr.errno_macros
-    libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.cast
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.sqrt
-    libc.src.__support.macros.optimization
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.atanf16
 )
 
 add_entrypoint_object(
@@ -4077,13 +4049,7 @@ add_entrypoint_object(
   COMPILE_OPTIONS
     -O3
   DEPENDS
-    .atan_utils
-    libc.src.__support.FPUtil.double_double
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.macros.optimization
+    libc.src.__support.math.atan
 )
 
 add_entrypoint_object(
@@ -4113,13 +4079,7 @@ add_entrypoint_object(
   HDRS
     ../atan2.h
   DEPENDS
-    .atan_utils
-    libc.src.__support.FPUtil.double_double
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.macros.optimization
+    libc.src.__support.math.atan2
 )
 
 add_entrypoint_object(
@@ -4129,7 +4089,7 @@ add_entrypoint_object(
   HDRS
     ../atan2l.h
   DEPENDS
-    .atan2
+    libc.src.__support.math.atan2
 )
 
 add_entrypoint_object(
@@ -4139,7 +4099,7 @@ add_entrypoint_object(
   HDRS
     ../atan2f128.h
   DEPENDS
-    .atan_utils
+    libc.src.__support.math.atan_utils
     libc.src.__support.integer_literals
     libc.src.__support.uint128
     libc.src.__support.FPUtil.dyadic_float
diff --git a/libc/src/math/generic/asinhf16.cpp b/libc/src/math/generic/asinhf16.cpp
index 0a0b471..d517e63 100644
--- a/libc/src/math/generic/asinhf16.cpp
+++ b/libc/src/math/generic/asinhf16.cpp
@@ -7,102 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/asinhf16.h"
-#include "explogxf.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/FPUtil/sqrt.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
+#include "src/__support/math/asinhf16.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-static constexpr size_t N_EXCEPTS = 8;
-
-static constexpr fputil::ExceptValues<float16, N_EXCEPTS> ASINHF16_EXCEPTS{{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-
-    // x = 0x1.da4p-2, asinhf16(x) = 0x1.ca8p-2 (RZ)
-    {0x3769, 0x372a, 1, 0, 1},
-    // x = 0x1.d6cp-1, asinhf16(x) = 0x1.a58p-1 (RZ)
-    {0x3b5b, 0x3a96, 1, 0, 0},
-    // x = 0x1.c7cp+3, asinhf16(x) = 0x1.accp+1 (RZ)
-    {0x4b1f, 0x42b3, 1, 0, 0},
-    // x = 0x1.26cp+4, asinhf16(x) = 0x1.cd8p+1 (RZ)
-    {0x4c9b, 0x4336, 1, 0, 1},
-    // x = -0x1.da4p-2, asinhf16(x) = -0x1.ca8p-2 (RZ)
-    {0xb769, 0xb72a, 0, 1, 1},
-    // x = -0x1.d6cp-1, asinhf16(x) = -0x1.a58p-1 (RZ)
-    {0xbb5b, 0xba96, 0, 1, 0},
-    // x = -0x1.c7cp+3, asinhf16(x) = -0x1.accp+1 (RZ)
-    {0xcb1f, 0xc2b3, 0, 1, 0},
-    // x = -0x1.26cp+4, asinhf16(x) = -0x1.cd8p+1 (RZ)
-    {0xcc9b, 0xc336, 0, 1, 1},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, asinhf16, (float16 x)) {
-  using namespace acoshf_internal;
-  using FPBits = fputil::FPBits<float16>;
-  FPBits xbits(x);
-
-  uint16_t x_u = xbits.uintval();
-  uint16_t x_abs = x_u & 0x7fff;
-
-  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
-    if (xbits.is_signaling_nan()) {
-      fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::quiet_nan().get_val();
-    }
-
-    return x;
-  }
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  // Handle exceptional values
-  if (auto r = ASINHF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  float xf = x;
-  const float SIGN[2] = {1.0f, -1.0f};
-  float x_sign = SIGN[x_u >> 15];
-
-  // |x| <= 0.25
-  if (LIBC_UNLIKELY(x_abs <= 0x3400)) {
-    // when |x| < 0x1.718p-5, asinhf16(x) = x. Adjust by 1 ULP for certain
-    // rounding types.
-    if (LIBC_UNLIKELY(x_abs < 0x29c6)) {
-      int rounding = fputil::quick_get_round();
-      if ((rounding == FE_UPWARD || rounding == FE_TOWARDZERO) && xf < 0)
-        return fputil::cast<float16>(xf + 0x1p-24f);
-      if ((rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) && xf > 0)
-        return fputil::cast<float16>(xf - 0x1p-24f);
-      return fputil::cast<float16>(xf);
-    }
-
-    float x_sq = xf * xf;
-    // Generated by Sollya with:
-    // > P = fpminimax(asinh(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 2^-2]);
-    // The last coefficient 0x1.bd114ep-6f has been changed to 0x1.bd114ep-5f
-    // for better accuracy.
-    float p = fputil::polyeval(x_sq, 1.0f, -0x1.555552p-3f, 0x1.332f6ap-4f,
-                               -0x1.6c53dep-5f, 0x1.bd114ep-5f);
-
-    return fputil::cast<float16>(xf * p);
-  }
-
-  // General case: asinh(x) = ln(x + sqrt(x^2 + 1))
-  float sqrt_term = fputil::sqrt<float>(fputil::multiply_add(xf, xf, 1.0f));
-  return fputil::cast<float16>(
-      x_sign * log_eval(fputil::multiply_add(xf, x_sign, sqrt_term)));
-}
+LLVM_LIBC_FUNCTION(float16, asinhf16, (float16 x)) { return math::asinhf16(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/atan.cpp b/libc/src/math/generic/atan.cpp
index cbca605..93bf2e1 100644
--- a/libc/src/math/generic/atan.cpp
+++ b/libc/src/math/generic/atan.cpp
@@ -7,173 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atan.h"
-#include "atan_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/double_double.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/atan.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-// To compute atan(x), we divided it into the following cases:
-// * |x| < 2^-26:
-//      Since |x| > atan(|x|) > |x| - |x|^3/3, and |x|^3/3 < ulp(x)/2, we simply
-//      return atan(x) = x - sign(x) * epsilon.
-// * 2^-26 <= |x| < 1:
-//      We perform range reduction mod 2^-6 = 1/64 as follow:
-//      Let k = 2^(-6) * round(|x| * 2^6), then
-//        atan(x) = sign(x) * atan(|x|)
-//                = sign(x) * (atan(k) + atan((|x| - k) / (1 + |x|*k)).
-//      We store atan(k) in a look up table, and perform intermediate steps in
-//      double-double.
-// * 1 < |x| < 2^53:
-//      First we perform the transformation y = 1/|x|:
-//        atan(x) = sign(x) * (pi/2 - atan(1/|x|))
-//                = sign(x) * (pi/2 - atan(y)).
-//      Then we compute atan(y) using range reduction mod 2^-6 = 1/64 as the
-//      previous case:
-//      Let k = 2^(-6) * round(y * 2^6), then
-//        atan(y) = atan(k) + atan((y - k) / (1 + y*k))
-//                = atan(k) + atan((1/|x| - k) / (1 + k/|x|)
-//                = atan(k) + atan((1 - k*|x|) / (|x| + k)).
-// * |x| >= 2^53:
-//      Using the reciprocal transformation:
-//        atan(x) = sign(x) * (pi/2 - atan(1/|x|)).
-//      We have that:
-//        atan(1/|x|) <= 1/|x| <= 2^-53,
-//      which is smaller than ulp(pi/2) / 2.
-//      So we can return:
-//        atan(x) = sign(x) * (pi/2 - epsilon)
-
-LLVM_LIBC_FUNCTION(double, atan, (double x)) {
-  using FPBits = fputil::FPBits<double>;
-
-  constexpr double IS_NEG[2] = {1.0, -1.0};
-  constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
-                                      0x1.921fb54442d18p0};
-  constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
-                                       -0x1.921fb54442d18p0};
-
-  FPBits xbits(x);
-  bool x_sign = xbits.is_neg();
-  xbits = xbits.abs();
-  uint64_t x_abs = xbits.uintval();
-  int x_exp =
-      static_cast<int>(x_abs >> FPBits::FRACTION_LEN) - FPBits::EXP_BIAS;
-
-  // |x| < 1.
-  if (x_exp < 0) {
-    if (LIBC_UNLIKELY(x_exp < -26)) {
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-      return x;
-#else
-      if (x == 0.0)
-        return x;
-      // |x| < 2^-26
-      return fputil::multiply_add(-0x1.0p-54, x, x);
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-    }
-
-    double x_d = xbits.get_val();
-    // k = 2^-6 * round(2^6 * |x|)
-    double k = fputil::nearest_integer(0x1.0p6 * x_d);
-    unsigned idx = static_cast<unsigned>(k);
-    k *= 0x1.0p-6;
-
-    // numerator = |x| - k
-    DoubleDouble num, den;
-    num.lo = 0.0;
-    num.hi = x_d - k;
-
-    // denominator = 1 - k * |x|
-    den.hi = fputil::multiply_add(x_d, k, 1.0);
-    DoubleDouble prod = fputil::exact_mult(x_d, k);
-    // Using Dekker's 2SUM algorithm to compute the lower part.
-    den.lo = ((1.0 - den.hi) + prod.hi) + prod.lo;
-
-    // x_r = (|x| - k) / (1 + k * |x|)
-    DoubleDouble x_r = fputil::div(num, den);
-
-    // Approximating atan(x_r) using Taylor polynomial.
-    DoubleDouble p = atan_eval(x_r);
-
-    // atan(x) = sign(x) * (atan(k) + atan(x_r))
-    //         = sign(x) * (atan(k) + atan( (|x| - k) / (1 + k * |x|) ))
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-    return IS_NEG[x_sign] * (ATAN_I[idx].hi + (p.hi + (p.lo + ATAN_I[idx].lo)));
-#else
-
-    DoubleDouble c0 = fputil::exact_add(ATAN_I[idx].hi, p.hi);
-    double c1 = c0.lo + (ATAN_I[idx].lo + p.lo);
-    double r = IS_NEG[x_sign] * (c0.hi + c1);
-
-    return r;
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  }
-
-  // |x| >= 2^53 or x is NaN.
-  if (LIBC_UNLIKELY(x_exp >= 53)) {
-    // x is nan
-    if (xbits.is_nan()) {
-      if (xbits.is_signaling_nan()) {
-        fputil::raise_except_if_required(FE_INVALID);
-        return FPBits::quiet_nan().get_val();
-      }
-      return x;
-    }
-    // |x| >= 2^53
-    // atan(x) ~ sign(x) * pi/2.
-    if (x_exp >= 53)
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-      return IS_NEG[x_sign] * PI_OVER_2.hi;
-#else
-      return fputil::multiply_add(IS_NEG[x_sign], PI_OVER_2.hi,
-                                  IS_NEG[x_sign] * PI_OVER_2.lo);
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  }
-
-  double x_d = xbits.get_val();
-  double y = 1.0 / x_d;
-
-  // k = 2^-6 * round(2^6 / |x|)
-  double k = fputil::nearest_integer(0x1.0p6 * y);
-  unsigned idx = static_cast<unsigned>(k);
-  k *= 0x1.0p-6;
-
-  // denominator = |x| + k
-  DoubleDouble den = fputil::exact_add(x_d, k);
-  // numerator = 1 - k * |x|
-  DoubleDouble num;
-  num.hi = fputil::multiply_add(-x_d, k, 1.0);
-  DoubleDouble prod = fputil::exact_mult(x_d, k);
-  // Using Dekker's 2SUM algorithm to compute the lower part.
-  num.lo = ((1.0 - num.hi) - prod.hi) - prod.lo;
-
-  // x_r = (1/|x| - k) / (1 - k/|x|)
-  //     = (1 - k * |x|) / (|x| - k)
-  DoubleDouble x_r = fputil::div(num, den);
-
-  // Approximating atan(x_r) using Taylor polynomial.
-  DoubleDouble p = atan_eval(x_r);
-
-  // atan(x) = sign(x) * (pi/2 - atan(1/|x|))
-  //         = sign(x) * (pi/2 - atan(k) - atan(x_r))
-  //         = (-sign(x)) * (-pi/2 + atan(k) + atan((1 - k*|x|)/(|x| - k)))
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  double lo_part = p.lo + ATAN_I[idx].lo + MPI_OVER_2.lo;
-  return IS_NEG[!x_sign] * (MPI_OVER_2.hi + ATAN_I[idx].hi + (p.hi + lo_part));
-#else
-  DoubleDouble c0 = fputil::exact_add(MPI_OVER_2.hi, ATAN_I[idx].hi);
-  DoubleDouble c1 = fputil::exact_add(c0.hi, p.hi);
-  double c2 = c1.lo + (c0.lo + p.lo) + (ATAN_I[idx].lo + MPI_OVER_2.lo);
-
-  double r = IS_NEG[!x_sign] * (c1.hi + c2);
-
-  return r;
-#endif
-}
+LLVM_LIBC_FUNCTION(double, atan, (double x)) { return math::atan(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/atan2.cpp b/libc/src/math/generic/atan2.cpp
index aa770de..4aaa63d 100644
--- a/libc/src/math/generic/atan2.cpp
+++ b/libc/src/math/generic/atan2.cpp
@@ -7,194 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atan2.h"
-#include "atan_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/double_double.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/atan2.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-// There are several range reduction steps we can take for atan2(y, x) as
-// follow:
-
-// * Range reduction 1: signness
-// atan2(y, x) will return a number between -PI and PI representing the angle
-// forming by the 0x axis and the vector (x, y) on the 0xy-plane.
-// In particular, we have that:
-//   atan2(y, x) = atan( y/x )         if x >= 0 and y >= 0 (I-quadrant)
-//               = pi + atan( y/x )    if x < 0 and y >= 0  (II-quadrant)
-//               = -pi + atan( y/x )   if x < 0 and y < 0   (III-quadrant)
-//               = atan( y/x )         if x >= 0 and y < 0  (IV-quadrant)
-// Since atan function is odd, we can use the formula:
-//   atan(-u) = -atan(u)
-// to adjust the above conditions a bit further:
-//   atan2(y, x) = atan( |y|/|x| )         if x >= 0 and y >= 0 (I-quadrant)
-//               = pi - atan( |y|/|x| )    if x < 0 and y >= 0  (II-quadrant)
-//               = -pi + atan( |y|/|x| )   if x < 0 and y < 0   (III-quadrant)
-//               = -atan( |y|/|x| )        if x >= 0 and y < 0  (IV-quadrant)
-// Which can be simplified to:
-//   atan2(y, x) = sign(y) * atan( |y|/|x| )             if x >= 0
-//               = sign(y) * (pi - atan( |y|/|x| ))      if x < 0
-
-// * Range reduction 2: reciprocal
-// Now that the argument inside atan is positive, we can use the formula:
-//   atan(1/x) = pi/2 - atan(x)
-// to make the argument inside atan <= 1 as follow:
-//   atan2(y, x) = sign(y) * atan( |y|/|x|)            if 0 <= |y| <= x
-//               = sign(y) * (pi/2 - atan( |x|/|y| )   if 0 <= x < |y|
-//               = sign(y) * (pi - atan( |y|/|x| ))    if 0 <= |y| <= -x
-//               = sign(y) * (pi/2 + atan( |x|/|y| ))  if 0 <= -x < |y|
-
-// * Range reduction 3: look up table.
-// After the previous two range reduction steps, we reduce the problem to
-// compute atan(u) with 0 <= u <= 1, or to be precise:
-//   atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
-// An accurate polynomial approximation for the whole [0, 1] input range will
-// require a very large degree.  To make it more efficient, we reduce the input
-// range further by finding an integer idx such that:
-//   | n/d - idx/64 | <= 1/128.
-// In particular,
-//   idx := round(2^6 * n/d)
-// Then for the fast pass, we find a polynomial approximation for:
-//   atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
-// For the accurate pass, we use the addition formula:
-//   atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
-//                                = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
-// And for the fast pass, we use degree-9 Taylor polynomial to compute the RHS:
-//   atan(u) ~ P(u) = u - u^3/3 + u^5/5 - u^7/7 + u^9/9
-// with absolute errors bounded by:
-//   |atan(u) - P(u)| < |u|^11 / 11 < 2^-80
-// and relative errors bounded by:
-//   |(atan(u) - P(u)) / P(u)| < u^10 / 11 < 2^-73.
-
 LLVM_LIBC_FUNCTION(double, atan2, (double y, double x)) {
-  using FPBits = fputil::FPBits<double>;
-
-  constexpr double IS_NEG[2] = {1.0, -1.0};
-  constexpr DoubleDouble ZERO = {0.0, 0.0};
-  constexpr DoubleDouble MZERO = {-0.0, -0.0};
-  constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p+1};
-  constexpr DoubleDouble MPI = {-0x1.1a62633145c07p-53, -0x1.921fb54442d18p+1};
-  constexpr DoubleDouble PI_OVER_2 = {0x1.1a62633145c07p-54,
-                                      0x1.921fb54442d18p0};
-  constexpr DoubleDouble MPI_OVER_2 = {-0x1.1a62633145c07p-54,
-                                       -0x1.921fb54442d18p0};
-  constexpr DoubleDouble PI_OVER_4 = {0x1.1a62633145c07p-55,
-                                      0x1.921fb54442d18p-1};
-  constexpr DoubleDouble THREE_PI_OVER_4 = {0x1.a79394c9e8a0ap-54,
-                                            0x1.2d97c7f3321d2p+1};
-  // Adjustment for constant term:
-  //   CONST_ADJ[x_sign][y_sign][recip]
-  constexpr DoubleDouble CONST_ADJ[2][2][2] = {
-      {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
-      {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
-
-  FPBits x_bits(x), y_bits(y);
-  bool x_sign = x_bits.sign().is_neg();
-  bool y_sign = y_bits.sign().is_neg();
-  x_bits = x_bits.abs();
-  y_bits = y_bits.abs();
-  uint64_t x_abs = x_bits.uintval();
-  uint64_t y_abs = y_bits.uintval();
-  bool recip = x_abs < y_abs;
-  uint64_t min_abs = recip ? x_abs : y_abs;
-  uint64_t max_abs = !recip ? x_abs : y_abs;
-  unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
-  unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
-
-  double num = FPBits(min_abs).get_val();
-  double den = FPBits(max_abs).get_val();
-
-  // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
-  // overflow, or close to underflow.
-  if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) {
-    if (x_bits.is_nan() || y_bits.is_nan()) {
-      if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
-        fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::quiet_nan().get_val();
-    }
-    unsigned x_except = x == 0.0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
-    unsigned y_except = y == 0.0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
-
-    // Exceptional cases:
-    //   EXCEPT[y_except][x_except][x_is_neg]
-    // with x_except & y_except:
-    //   0: zero
-    //   1: finite, non-zero
-    //   2: infinity
-    constexpr DoubleDouble EXCEPTS[3][3][2] = {
-        {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
-        {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
-        {{PI_OVER_2, PI_OVER_2},
-         {PI_OVER_2, PI_OVER_2},
-         {PI_OVER_4, THREE_PI_OVER_4}},
-    };
-
-    if ((x_except != 1) || (y_except != 1)) {
-      DoubleDouble r = EXCEPTS[y_except][x_except][x_sign];
-      return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
-    }
-    bool scale_up = min_exp < 128U;
-    bool scale_down = max_exp > 0x7ffU - 128U;
-    // At least one input is denormal, multiply both numerator and denominator
-    // by some large enough power of 2 to normalize denormal inputs.
-    if (scale_up) {
-      num *= 0x1.0p64;
-      if (!scale_down)
-        den *= 0x1.0p64;
-    } else if (scale_down) {
-      den *= 0x1.0p-64;
-      if (!scale_up)
-        num *= 0x1.0p-64;
-    }
-
-    min_abs = FPBits(num).uintval();
-    max_abs = FPBits(den).uintval();
-    min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
-    max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
-  }
-
-  double final_sign = IS_NEG[(x_sign != y_sign) != recip];
-  DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip];
-  unsigned exp_diff = max_exp - min_exp;
-  // We have the following bound for normalized n and d:
-  //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
-  if (LIBC_UNLIKELY(exp_diff > 54)) {
-    return fputil::multiply_add(final_sign, const_term.hi,
-                                final_sign * (const_term.lo + num / den));
-  }
-
-  double k = fputil::nearest_integer(64.0 * num / den);
-  unsigned idx = static_cast<unsigned>(k);
-  // k = idx / 64
-  k *= 0x1.0p-6;
-
-  // Range reduction:
-  // atan(n/d) - atan(k/64) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
-  //                        = atan((n - d * k/64)) / (d + n * k/64))
-  DoubleDouble num_k = fputil::exact_mult(num, k);
-  DoubleDouble den_k = fputil::exact_mult(den, k);
-
-  // num_dd = n - d * k
-  DoubleDouble num_dd = fputil::exact_add(num - den_k.hi, -den_k.lo);
-  // den_dd = d + n * k
-  DoubleDouble den_dd = fputil::exact_add(den, num_k.hi);
-  den_dd.lo += num_k.lo;
-
-  // q = (n - d * k) / (d + n * k)
-  DoubleDouble q = fputil::div(num_dd, den_dd);
-  // p ~ atan(q)
-  DoubleDouble p = atan_eval(q);
-
-  DoubleDouble r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
-  r.hi *= final_sign;
-  r.lo *= final_sign;
-
-  return r.hi + r.lo;
+  return math::atan2(y, x);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/atan2f128.cpp b/libc/src/math/generic/atan2f128.cpp
index a3aba0b..8838d94 100644
--- a/libc/src/math/generic/atan2f128.cpp
+++ b/libc/src/math/generic/atan2f128.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atan2f128.h"
-#include "atan_utils.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/FPUtil/multiply_add.h"
@@ -16,6 +15,7 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 #include "src/__support/macros/properties/types.h"
+#include "src/__support/math/atan_utils.h"
 #include "src/__support/uint128.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -103,6 +103,7 @@ static constexpr Float128 CONST_ADJ[2][2][2] = {
 //   |(atan(u) - P(u)) / P(u)| < 2^-114.
 
 LLVM_LIBC_FUNCTION(float128, atan2f128, (float128 y, float128 x)) {
+  using namespace atan_internal;
   using FPBits = fputil::FPBits<float128>;
   using Float128 = fputil::DyadicFloat<128>;
 
diff --git a/libc/src/math/generic/atan2l.cpp b/libc/src/math/generic/atan2l.cpp
index 47a2e985..a7824c6 100644
--- a/libc/src/math/generic/atan2l.cpp
+++ b/libc/src/math/generic/atan2l.cpp
@@ -9,7 +9,7 @@
 #include "src/math/atan2l.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/properties/types.h"
-#include "src/math/atan2.h"
+#include "src/__support/math/atan2.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(long double, atan2l, (long double y, long double x)) {
 #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
   return static_cast<long double>(
-      atan2(static_cast<double>(y), static_cast<double>(x)));
+      math::atan2(static_cast<double>(y), static_cast<double>(x)));
 #else
 #error "Extended precision is not yet supported"
 #endif
diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index 22f962e..acd32f0 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -7,116 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atanf.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/math/inv_trigf_utils.h"
+#include "src/__support/math/atanf.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
-  using namespace inv_trigf_utils_internal;
-  using FPBits = typename fputil::FPBits<float>;
-
-  constexpr double FINAL_SIGN[2] = {1.0, -1.0};
-  constexpr double SIGNED_PI_OVER_2[2] = {0x1.921fb54442d18p0,
-                                          -0x1.921fb54442d18p0};
-
-  FPBits x_bits(x);
-  Sign sign = x_bits.sign();
-  x_bits.set_sign(Sign::POS);
-  uint32_t x_abs = x_bits.uintval();
-
-  // x is inf or nan, |x| < 2^-4 or |x|= > 16.
-  if (LIBC_UNLIKELY(x_abs <= 0x3d80'0000U || x_abs >= 0x4180'0000U)) {
-    double x_d = static_cast<double>(x);
-    double const_term = 0.0;
-    if (LIBC_UNLIKELY(x_abs >= 0x4180'0000)) {
-      // atan(+-Inf) = +-pi/2.
-      if (x_bits.is_inf()) {
-        volatile double sign_pi_over_2 = SIGNED_PI_OVER_2[sign.is_neg()];
-        return static_cast<float>(sign_pi_over_2);
-      }
-      if (x_bits.is_nan())
-        return x;
-      // x >= 16
-      x_d = -1.0 / x_d;
-      const_term = SIGNED_PI_OVER_2[sign.is_neg()];
-    }
-    // 0 <= x < 1/16;
-    if (LIBC_UNLIKELY(x_bits.is_zero()))
-      return x;
-    // x <= 2^-12;
-    if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
-      return fputil::multiply_add(x, -0x1.0p-25f, x);
-#else
-      double x_d = static_cast<double>(x);
-      return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
-#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
-    }
-    // Use Taylor polynomial:
-    //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
-    constexpr double ATAN_TAYLOR[6] = {
-        0x1.0000000000000p+0,  -0x1.5555555555555p-2, 0x1.999999999999ap-3,
-        -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4,  -0x1.745d1745d1746p-4,
-    };
-    double x2 = x_d * x_d;
-    double x4 = x2 * x2;
-    double c0 = fputil::multiply_add(x2, ATAN_TAYLOR[1], ATAN_TAYLOR[0]);
-    double c1 = fputil::multiply_add(x2, ATAN_TAYLOR[3], ATAN_TAYLOR[2]);
-    double c2 = fputil::multiply_add(x2, ATAN_TAYLOR[5], ATAN_TAYLOR[4]);
-    double p = fputil::polyeval(x4, c0, c1, c2);
-    double r = fputil::multiply_add(x_d, p, const_term);
-    return static_cast<float>(r);
-  }
-
-  // Range reduction steps:
-  // 1)  atan(x) = sign(x) * atan(|x|)
-  // 2)  If |x| > 1, atan(|x|) = pi/2 - atan(1/|x|)
-  // 3)  For 1/16 < x <= 1, we find k such that: |x - k/16| <= 1/32.
-  // 4)  Then we use polynomial approximation:
-  //   atan(x) ~ atan((k/16) + (x - (k/16)) * Q(x - k/16)
-  //           = P(x - k/16)
-  double x_d, const_term, final_sign;
-  int idx;
-
-  if (x_abs > 0x3f80'0000U) {
-    // |x| > 1, we need to invert x, so we will perform range reduction in
-    // double precision.
-    x_d = 1.0 / static_cast<double>(x_bits.get_val());
-    double k_d = fputil::nearest_integer(x_d * 0x1.0p4);
-    x_d = fputil::multiply_add(k_d, -0x1.0p-4, x_d);
-    idx = static_cast<int>(k_d);
-    final_sign = FINAL_SIGN[sign.is_pos()];
-    // Adjust constant term of the polynomial by +- pi/2.
-    const_term = fputil::multiply_add(final_sign, ATAN_COEFFS[idx][0],
-                                      SIGNED_PI_OVER_2[sign.is_neg()]);
-  } else {
-    // Exceptional value:
-    if (LIBC_UNLIKELY(x_abs == 0x3d8d'6b23U)) { // |x| = 0x1.1ad646p-4
-      return sign.is_pos() ? fputil::round_result_slightly_down(0x1.1a6386p-4f)
-                           : fputil::round_result_slightly_up(-0x1.1a6386p-4f);
-    }
-    // Perform range reduction in single precision.
-    float x_f = x_bits.get_val();
-    float k_f = fputil::nearest_integer(x_f * 0x1.0p4f);
-    x_f = fputil::multiply_add(k_f, -0x1.0p-4f, x_f);
-    x_d = static_cast<double>(x_f);
-    idx = static_cast<int>(k_f);
-    final_sign = FINAL_SIGN[sign.is_neg()];
-    const_term = final_sign * ATAN_COEFFS[idx][0];
-  }
-
-  double p = atan_eval(x_d, idx);
-  double r = fputil::multiply_add(final_sign * x_d, p, const_term);
-
-  return static_cast<float>(r);
-}
+LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return math::atanf(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/atanf16.cpp b/libc/src/math/generic/atanf16.cpp
index 9b6ec65..7191c42 100644
--- a/libc/src/math/generic/atanf16.cpp
+++ b/libc/src/math/generic/atanf16.cpp
@@ -7,101 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atanf16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/sqrt.h"
-#include "src/__support/macros/optimization.h"
+#include "src/__support/math/atanf16.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-// Generated by Solly using the following command:
-// > round(pi/2, SG, RN);
-static constexpr float PI_2 = 0x1.921fb6p0;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-static constexpr size_t N_EXCEPTS = 6;
-
-static constexpr fputil::ExceptValues<float16, N_EXCEPTS> ATANF16_EXCEPTS{{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-    {0x2745, 0x2744, 1, 0, 1},
-    {0x3099, 0x3090, 1, 0, 1},
-    {0x3c6c, 0x3aae, 1, 0, 1},
-    {0x466e, 0x3daa, 1, 0, 1},
-    {0x48ae, 0x3ddb, 1, 0, 0},
-    {0x5619, 0x3e3d, 1, 0, 1},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, atanf16, (float16 x)) {
-  using FPBits = fputil::FPBits<float16>;
-  FPBits xbits(x);
-
-  uint16_t x_u = xbits.uintval();
-  uint16_t x_abs = x_u & 0x7fff;
-  bool x_sign = x_u >> 15;
-  float sign = (x_sign ? -1.0 : 1.0);
-
-  // |x| >= +/-inf
-  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
-    if (xbits.is_nan()) {
-      if (xbits.is_signaling_nan()) {
-        fputil::raise_except_if_required(FE_INVALID);
-        return FPBits::quiet_nan().get_val();
-      }
-      return x;
-    }
-
-    // atanf16(+/-inf) = +/-pi/2
-    return fputil::cast<float16>(sign * PI_2);
-  }
-
-  float xf = x;
-  float xsq = xf * xf;
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  // Handle exceptional values
-  if (auto r = ATANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
-      LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif
-
-  // |x| <= 0x1p0, |x| <= 1
-  if (x_abs <= 0x3c00) {
-    // atanf16(+/-0) = +/-0
-    if (LIBC_UNLIKELY(x_abs == 0))
-      return x;
-
-    // Degree-14 minimax odd polynomial of atan(x) generated by Sollya with:
-    // > P = fpminimax(atan(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|SG...|],
-    // [0, 1]);
-    float result = fputil::polyeval(
-        xsq, 0x1.fffffcp-1f, -0x1.55519ep-2f, 0x1.98f6a8p-3f, -0x1.1f0a92p-3f,
-        0x1.95b654p-4f, -0x1.e65492p-5f, 0x1.8c0c36p-6f, -0x1.32316ep-8f);
-    return fputil::cast<float16>(xf * result);
-  }
-
-  // If |x| > 1
-  // y = atan(x) = sign(x) * atan(|x|)
-  // atan(|x|) = pi/2 - atan(1/|x|)
-  // Recall, 1/|x| < 1
-  float x_inv_sq = 1.0f / xsq;
-  float x_inv = fputil::sqrt<float>(x_inv_sq);
-
-  // Degree-14 minimax odd polynomial of atan(x) generated by Sollya with:
-  // > P = fpminimax(atan(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|SG...|],
-  // [0, 1]);
-  float interm =
-      fputil::polyeval(x_inv_sq, 0x1.fffffcp-1f, -0x1.55519ep-2f,
-                       0x1.98f6a8p-3f, -0x1.1f0a92p-3f, 0x1.95b654p-4f,
-                       -0x1.e65492p-5f, 0x1.8c0c36p-6f, -0x1.32316ep-8f);
-
-  return fputil::cast<float16>(sign *
-                               fputil::multiply_add(x_inv, -interm, PI_2));
-}
+LLVM_LIBC_FUNCTION(float16, atanf16, (float16 x)) { return math::atanf16(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsbf16.cpp b/libc/src/math/generic/fabsbf16.cpp
new file mode 100644
index 0000000..ea39719
--- /dev/null
+++ b/libc/src/math/generic/fabsbf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fabsbf16 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fabsbf16.h"
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(bfloat16, fabsbf16, (bfloat16 x)) { return fputil::abs(x); }
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
index c5db6fa..fe31e6a 100644
--- a/libc/src/pthread/CMakeLists.txt
+++ b/libc/src/pthread/CMakeLists.txt
@@ -273,6 +273,40 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  pthread_barrier_init
+  SRCS
+    pthread_barrier_init.cpp
+  HDRS
+    pthread_barrier_init.h
+  DEPENDS
+    libc.src.errno.errno
+    libc.include.pthread
+    libc.src.__support.threads.linux.barrier
+)
+
+add_entrypoint_object(
+  pthread_barrier_destroy
+  SRCS
+    pthread_barrier_destroy.cpp
+  HDRS
+    pthread_barrier_destroy.h
+  DEPENDS
+    libc.include.pthread
+    libc.src.__support.threads.linux.barrier
+)
+
+add_entrypoint_object(
+  pthread_barrier_wait
+  SRCS
+    pthread_barrier_wait.cpp
+  HDRS
+    pthread_barrier_wait.h
+  DEPENDS
+    libc.include.pthread
+    libc.src.__support.threads.linux.barrier
+)
+
+add_entrypoint_object(
   pthread_mutex_init
   SRCS
     pthread_mutex_init.cpp
diff --git a/libc/src/pthread/pthread_barrier_destroy.cpp b/libc/src/pthread/pthread_barrier_destroy.cpp
new file mode 100644
index 0000000..82de8f2
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_destroy.cpp
@@ -0,0 +1,22 @@
+//===-- Implementation of the pthread_barrier_destroy function ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_barrier_destroy.h"
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/barrier.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, pthread_barrier_destroy, (pthread_barrier_t * b)) {
+  return Barrier::destroy(reinterpret_cast<Barrier *>(b));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_barrier_destroy.h b/libc/src/pthread/pthread_barrier_destroy.h
new file mode 100644
index 0000000..e27552c
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_destroy.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for pthread_barrier_destroy -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_DESTROY_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_DESTROY_H
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_barrier_destroy(pthread_barrier_t *b);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_DESTROY_H
diff --git a/libc/src/pthread/pthread_barrier_init.cpp b/libc/src/pthread/pthread_barrier_init.cpp
new file mode 100644
index 0000000..2e92238
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_init.cpp
@@ -0,0 +1,26 @@
+//===-- Implementation of the pthread_barrier_init function ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_barrier_init.h"
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "hdr/types/pthread_barrierattr_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/barrier.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, pthread_barrier_init,
+                   (pthread_barrier_t * b,
+                    const pthread_barrierattr_t *__restrict attr,
+                    unsigned count)) {
+  return Barrier::init(reinterpret_cast<Barrier *>(b), attr, count);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_barrier_init.h b/libc/src/pthread/pthread_barrier_init.h
new file mode 100644
index 0000000..bb17f3f
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_init.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for pthread_barrier_init ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_INIT_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_INIT_H
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "hdr/types/pthread_barrierattr_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_barrier_init(pthread_barrier_t *b,
+                         const pthread_barrierattr_t *__restrict attr,
+                         unsigned count);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_INIT_H
diff --git a/libc/src/pthread/pthread_barrier_wait.cpp b/libc/src/pthread/pthread_barrier_wait.cpp
new file mode 100644
index 0000000..dbd1333
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_wait.cpp
@@ -0,0 +1,22 @@
+//===-- Implementation of the pthread_barrier_wait function ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_barrier_wait.h"
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/barrier.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, pthread_barrier_wait, (pthread_barrier_t * b)) {
+  return reinterpret_cast<Barrier *>(b)->wait(); // b is used as a Barrier
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_barrier_wait.h b/libc/src/pthread/pthread_barrier_wait.h
new file mode 100644
index 0000000..16ddc06
--- /dev/null
+++ b/libc/src/pthread/pthread_barrier_wait.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for pthread_barrier_wait ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_WAIT_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_WAIT_H
+
+#include "hdr/types/pthread_barrier_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_barrier_wait(pthread_barrier_t *b);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_BARRIER_WAIT_H
diff --git a/libc/src/sched/CMakeLists.txt b/libc/src/sched/CMakeLists.txt
index e6c37d3..d1d1de0 100644
--- a/libc/src/sched/CMakeLists.txt
+++ b/libc/src/sched/CMakeLists.txt
@@ -3,6 +3,13 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
 add_entrypoint_object(
+  getcpu
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.getcpu
+)
+
+add_entrypoint_object(
   sched_getaffinity
   ALIAS
   DEPENDS
diff --git a/libc/src/sched/getcpu.h b/libc/src/sched/getcpu.h
new file mode 100644
index 0000000..4c90e64
--- /dev/null
+++ b/libc/src/sched/getcpu.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for getcpu ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SCHED_GETCPU_H
+#define LLVM_LIBC_SRC_SCHED_GETCPU_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int getcpu(unsigned int *cpu, unsigned int *node);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_SCHED_GETCPU_H
diff --git a/libc/src/sched/linux/CMakeLists.txt b/libc/src/sched/linux/CMakeLists.txt
index e690e76..bb50002 100644
--- a/libc/src/sched/linux/CMakeLists.txt
+++ b/libc/src/sched/linux/CMakeLists.txt
@@ -1,4 +1,16 @@
 add_entrypoint_object(
+  getcpu
+  SRCS
+    getcpu.cpp
+  HDRS
+    ../getcpu.h
+  DEPENDS
+    libc.include.sched
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
+
+add_entrypoint_object(
   sched_getaffinity
   SRCS
     sched_getaffinity.cpp
@@ -6,7 +18,9 @@ add_entrypoint_object(
     ../sched_getaffinity.h
   DEPENDS
     libc.hdr.stdint_proxy
-    libc.include.sched
+    libc.hdr.types.cpu_set_t
+    libc.hdr.types.pid_t
+    libc.hdr.types.size_t
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
@@ -18,7 +32,9 @@ add_entrypoint_object(
   HDRS
     ../sched_setaffinity.h
   DEPENDS
-    libc.include.sched
+    libc.hdr.types.cpu_set_t
+    libc.hdr.types.pid_t
+    libc.hdr.types.size_t
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
@@ -30,7 +46,8 @@ add_entrypoint_object(
   HDRS
     ../sched_getcpucount.h
   DEPENDS
-    libc.include.sched
+    libc.hdr.types.cpu_set_t
+    libc.hdr.types.size_t
 )
 
 add_entrypoint_object(
@@ -94,7 +111,7 @@ add_entrypoint_object(
   HDRS
     ../sched_getscheduler.h
   DEPENDS
-    libc.include.sched
+    libc.hdr.types.pid_t
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
@@ -131,8 +148,9 @@ add_entrypoint_object(
   HDRS
     ../sched_rr_get_interval.h
   DEPENDS
+    libc.hdr.types.pid_t
+    libc.hdr.types.struct_timespec
     libc.include.sys_syscall
-    libc.include.sched
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
diff --git a/libc/src/sched/linux/getcpu.cpp b/libc/src/sched/linux/getcpu.cpp
new file mode 100644
index 0000000..a34b693
--- /dev/null
+++ b/libc/src/sched/linux/getcpu.cpp
@@ -0,0 +1,29 @@
+//===-- Implementation of getcpu ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sched/getcpu.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, getcpu, (unsigned int *cpu, unsigned int *node)) {
+  int rc = LIBC_NAMESPACE::syscall_impl<int>(SYS_getcpu, cpu, node, nullptr);
+  if (rc >= 0)
+    return 0;
+  // A negative syscall result is negated into errno and reported as -1.
+  libc_errno = -rc;
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp
index 4a5e91a..d652f7f7 100644
--- a/libc/src/sched/linux/sched_getaffinity.cpp
+++ b/libc/src/sched/linux/sched_getaffinity.cpp
@@ -14,7 +14,9 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
-#include <sched.h>
+#include "hdr/types/cpu_set_t.h"
+#include "hdr/types/pid_t.h"
+#include "hdr/types/size_t.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sched/linux/sched_getcpucount.cpp b/libc/src/sched/linux/sched_getcpucount.cpp
index 7ae166e..dcc2338 100644
--- a/libc/src/sched/linux/sched_getcpucount.cpp
+++ b/libc/src/sched/linux/sched_getcpucount.cpp
@@ -12,7 +12,8 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 
-#include <sched.h>
+#include "hdr/types/cpu_set_t.h"
+#include "hdr/types/size_t.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sched/linux/sched_getscheduler.cpp b/libc/src/sched/linux/sched_getscheduler.cpp
index d8e0296..10625f2 100644
--- a/libc/src/sched/linux/sched_getscheduler.cpp
+++ b/libc/src/sched/linux/sched_getscheduler.cpp
@@ -13,6 +13,7 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
+#include "hdr/types/pid_t.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp
index 5668d596b..eecbaa4 100644
--- a/libc/src/sched/linux/sched_rr_get_interval.cpp
+++ b/libc/src/sched/linux/sched_rr_get_interval.cpp
@@ -13,6 +13,8 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
+#include "hdr/types/pid_t.h"
+#include "hdr/types/struct_timespec.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 #ifdef SYS_sched_rr_get_interval_time64
diff --git a/libc/src/sched/linux/sched_setaffinity.cpp b/libc/src/sched/linux/sched_setaffinity.cpp
index 93e930d..3c7ed91 100644
--- a/libc/src/sched/linux/sched_setaffinity.cpp
+++ b/libc/src/sched/linux/sched_setaffinity.cpp
@@ -13,7 +13,9 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
-#include <sched.h>
+#include "hdr/types/cpu_set_t.h"
+#include "hdr/types/pid_t.h"
+#include "hdr/types/size_t.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sched/sched_getaffinity.h b/libc/src/sched/sched_getaffinity.h
index 52ec5bc..8623089 100644
--- a/libc/src/sched/sched_getaffinity.h
+++ b/libc/src/sched/sched_getaffinity.h
@@ -10,7 +10,10 @@
 #define LLVM_LIBC_SRC_SCHED_SCHED_GETAFFINITY_H
 
 #include "src/__support/macros/config.h"
-#include <sched.h>
+
+#include "hdr/types/cpu_set_t.h"
+#include "hdr/types/pid_t.h"
+#include "hdr/types/size_t.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sched/sched_getcpucount.h b/libc/src/sched/sched_getcpucount.h
index 8f35301..0667d8c 100644
--- a/libc/src/sched/sched_getcpucount.h
+++ b/libc/src/sched/sched_getcpucount.h
@@ -10,7 +10,8 @@
 #define LLVM_LIBC_SRC_SCHED_SCHED_GETCPUCOUNT_H
 
 #include "src/__support/macros/config.h"
-#include <sched.h>
+
+#include "hdr/types/cpu_set_t.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sched/sched_getscheduler.h b/libc/src/sched/sched_getscheduler.h
index d29e902..6407dbf 100644
--- a/libc/src/sched/sched_getscheduler.h
+++ b/libc/src/sched/sched_getscheduler.h
@@ -10,7 +10,8 @@
 #define LLVM_LIBC_SRC_SCHED_SCHED_GETSCHEDULER_H
 
 #include "src/__support/macros/config.h"
-#include <sched.h>
+
+#include "hdr/types/pid_t.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sched/sched_rr_get_interval.h b/libc/src/sched/sched_rr_get_interval.h
index ff09329..4195c14 100644
--- a/libc/src/sched/sched_rr_get_interval.h
+++ b/libc/src/sched/sched_rr_get_interval.h
@@ -10,7 +10,9 @@
 #define LLVM_LIBC_SRC_SCHED_SCHED_RR_GET_INTERVAL_H
 
 #include "src/__support/macros/config.h"
-#include <sched.h>
+
+#include "hdr/types/pid_t.h"
+#include "hdr/types/struct_timespec.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sched/sched_setaffinity.h b/libc/src/sched/sched_setaffinity.h
index cb2303d..f6739ab 100644
--- a/libc/src/sched/sched_setaffinity.h
+++ b/libc/src/sched/sched_setaffinity.h
@@ -10,7 +10,10 @@
 #define LLVM_LIBC_SRC_SCHED_SCHED_SETAFFINITY_H
 
 #include "src/__support/macros/config.h"
-#include <sched.h>
+
+#include "hdr/types/cpu_set_t.h"
+#include "hdr/types/pid_t.h"
+#include "hdr/types/size_t.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 49f4a1b..9ba0a06 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -185,6 +185,55 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  mbstowcs
+  SRCS
+    mbstowcs.cpp
+  HDRS
+    mbstowcs.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.null_check
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.wchar.mbsnrtowcs
+)
+
+add_entrypoint_object(
+  mbsrtowcs
+  SRCS
+    mbsrtowcs.cpp
+  HDRS
+    mbsrtowcs.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.wchar.mbsnrtowcs
+)
+
+add_entrypoint_object(
+  mbsnrtowcs
+  SRCS
+    mbsnrtowcs.cpp
+  HDRS
+    mbsnrtowcs.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.wchar.mbsnrtowcs
+)
+
+add_entrypoint_object(
   wcstombs
   SRCS
     wcstombs.cpp
diff --git a/libc/src/wchar/mbsnrtowcs.cpp b/libc/src/wchar/mbsnrtowcs.cpp
new file mode 100644
index 0000000..28e0ff3
--- /dev/null
+++ b/libc/src/wchar/mbsnrtowcs.cpp
@@ -0,0 +1,39 @@
+//===-- Implementation of mbsnrtowcs --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbsnrtowcs.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbsnrtowcs.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, mbsnrtowcs,
+                   (wchar_t *__restrict dst, const char **__restrict src,
+                    size_t nmc, size_t len, mbstate_t *__restrict ps)) {
+  // Conversion state used whenever the caller passes a null ps.
+  static internal::mbstate fallback_state;
+  internal::mbstate *state = &fallback_state;
+  if (ps != nullptr)
+    state = reinterpret_cast<internal::mbstate *>(ps);
+  // With a null dst the len limit is ignored, so pass the maximum instead.
+  const size_t limit = dst != nullptr ? len : SIZE_MAX;
+  auto result = internal::mbsnrtowcs(dst, src, nmc, limit, state);
+  if (result.has_value())
+    return result.value();
+  // Conversion failed: report the error code through errno.
+  libc_errno = result.error();
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbsnrtowcs.h b/libc/src/wchar/mbsnrtowcs.h
new file mode 100644
index 0000000..0d66b95
--- /dev/null
+++ b/libc/src/wchar/mbsnrtowcs.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for mbsnrtowcs --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBSNRTOWCS_H
+#define LLVM_LIBC_SRC_WCHAR_MBSNRTOWCS_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
+                  size_t nmc, size_t len, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBSNRTOWCS_H
diff --git a/libc/src/wchar/mbsrtowcs.cpp b/libc/src/wchar/mbsrtowcs.cpp
new file mode 100644
index 0000000..82ca25a
--- /dev/null
+++ b/libc/src/wchar/mbsrtowcs.cpp
@@ -0,0 +1,39 @@
+//===-- Implementation of mbsrtowcs ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbsrtowcs.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbsnrtowcs.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, mbsrtowcs,
+                   (wchar_t *__restrict dst, const char **__restrict src,
+                    size_t len, mbstate_t *__restrict ps)) {
+  // Conversion state used whenever the caller passes a null ps.
+  static internal::mbstate fallback_state;
+  internal::mbstate *state = &fallback_state;
+  if (ps != nullptr)
+    state = reinterpret_cast<internal::mbstate *>(ps);
+  // With a null dst the len limit is ignored, so pass the maximum instead.
+  const size_t limit = dst != nullptr ? len : SIZE_MAX;
+  auto result = internal::mbsnrtowcs(dst, src, SIZE_MAX, limit, state);
+  if (result.has_value())
+    return result.value();
+  // Conversion failed: report the error code through errno.
+  libc_errno = result.error();
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbsrtowcs.h b/libc/src/wchar/mbsrtowcs.h
new file mode 100644
index 0000000..f8d4cc2
--- /dev/null
+++ b/libc/src/wchar/mbsrtowcs.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for mbsrtowcs ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
+#define LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
+                 size_t len, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBSRTOWCS_H
diff --git a/libc/src/wchar/mbstowcs.cpp b/libc/src/wchar/mbstowcs.cpp
new file mode 100644
index 0000000..43e953c
--- /dev/null
+++ b/libc/src/wchar/mbstowcs.cpp
@@ -0,0 +1,40 @@
+//===-- Implementation of mbstowcs ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbstowcs.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "src/__support/wchar/mbsnrtowcs.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, mbstowcs,
+                   (wchar_t *__restrict pwcs, const char *__restrict s,
+                    size_t n)) {
+  LIBC_CRASH_ON_NULLPTR(s);
+  // NOTE(review): this state is static and so persists between calls; confirm.
+  static internal::mbstate state;
+  // With a null pwcs the n limit is ignored, so pass the maximum instead.
+  const size_t limit = pwcs != nullptr ? n : SIZE_MAX;
+  // The helper takes a pointer to the source pointer, so hand it a local copy.
+  const char *cursor = s;
+  auto result = internal::mbsnrtowcs(pwcs, &cursor, SIZE_MAX, limit, &state);
+  if (result.has_value())
+    return result.value();
+  // Conversion failed: report the error code through errno.
+  libc_errno = result.error();
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbstowcs.h b/libc/src/wchar/mbstowcs.h
new file mode 100644
index 0000000..7d08a83
--- /dev/null
+++ b/libc/src/wchar/mbstowcs.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbstowcs ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H
+#define LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbstowcs(wchar_t *__restrict pwcs, const char *__restrict s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H