48 files changed, 805 insertions, 827 deletions
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 4e90aad..5090dc2 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -267,7 +267,9 @@ add_header_library(
   HDRS
     fixedvector.h
   DEPENDS
+    .libc_assert
     libc.src.__support.CPP.array
+    libc.src.string.memory_utils.inline_memset
 )
 
 add_header_library(
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 972249f..528542c 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -42,7 +42,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
   if (pos > 0) { // If the buffer is not empty
     // Flush the buffer
     const size_t write_size = pos;
-    auto write_result = platform_write(this, buf, write_size);
+    FileIOResult write_result = platform_write(this, buf, write_size);
     pos = 0; // Buffer is now empty so reset pos to the beginning.
     // If less bytes were written than expected, then an error occurred.
     if (write_result < write_size) {
@@ -52,7 +52,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
     }
   }
 
-  auto write_result = platform_write(this, data, len);
+  FileIOResult write_result = platform_write(this, data, len);
   if (write_result < len)
     err = true;
   return write_result;
@@ -99,7 +99,7 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
   // is full.
   const size_t write_size = pos;
 
-  auto buf_result = platform_write(this, buf, write_size);
+  FileIOResult buf_result = platform_write(this, buf, write_size);
   size_t bytes_written = buf_result.value;
 
   pos = 0; // Buffer is now empty so reset pos to the beginning.
@@ -121,7 +121,8 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
     pos = remainder.size();
   } else {
 
-    auto result = platform_write(this, remainder.data(), remainder.size());
+    FileIOResult result =
+        platform_write(this, remainder.data(), remainder.size());
     size_t bytes_written = buf_result.value;
 
     // If less bytes were written than expected, then an error occurred. Return
@@ -190,6 +191,17 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
 
   prev_op = FileOp::READ;
 
+  if (bufmode == _IONBF) { // unbuffered.
+    return read_unlocked_nbf(static_cast<uint8_t *>(data), len);
+  } else if (bufmode == _IOFBF) { // fully buffered
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  } else /*if (bufmode == _IOLBF) */ { // line buffered
+    // There is no line buffered mode for read. Use fully buffered instead.
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  }
+}
+
+size_t File::copy_data_from_buf(uint8_t *data, size_t len) {
   cpp::span<uint8_t> bufref(static_cast<uint8_t *>(buf), bufsize);
   cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data), len);
 
@@ -209,32 +221,42 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   for (size_t i = 0; i < available_data; ++i)
     dataref[i] = bufref[i + pos];
   read_limit = pos = 0; // Reset the pointers.
+
+  return available_data;
+}
+
+FileIOResult File::read_unlocked_fbf(uint8_t *data, size_t len) {
+  // Read data from the buffer first.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
   // Update the dataref to reflect that fact that we have already
   // copied |available_data| into |data|.
-  dataref = cpp::span<uint8_t>(dataref.data() + available_data,
-                               dataref.size() - available_data);
-
   size_t to_fetch = len - available_data;
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             to_fetch);
+
   if (to_fetch > bufsize) {
-    auto result = platform_read(this, dataref.data(), to_fetch);
+    FileIOResult result = platform_read(this, dataref.data(), to_fetch);
     size_t fetched_size = result.value;
     if (result.has_error() || fetched_size < to_fetch) {
       if (!result.has_error())
         eof = true;
       else
         err = true;
-      return {available_data + fetched_size, result.has_error()};
+      return {available_data + fetched_size, result.error};
     }
     return len;
   }
 
   // Fetch and buffer another buffer worth of data.
-  auto result = platform_read(this, buf, bufsize);
+  FileIOResult result = platform_read(this, buf, bufsize);
   size_t fetched_size = result.value;
   read_limit += fetched_size;
   size_t transfer_size = fetched_size >= to_fetch ? to_fetch : fetched_size;
   for (size_t i = 0; i < transfer_size; ++i)
-    dataref[i] = bufref[i];
+    dataref[i] = buf[i];
   pos += transfer_size;
   if (result.has_error() || fetched_size < to_fetch) {
     if (!result.has_error())
@@ -245,6 +267,26 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   return {transfer_size + available_data, result.error};
 }
 
+FileIOResult File::read_unlocked_nbf(uint8_t *data, size_t len) {
+  // Check whether there is a character in the ungetc buffer.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
+  // Directly copy the data into |data|.
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             len - available_data);
+  FileIOResult result = platform_read(this, dataref.data(), dataref.size());
+
+  if (result.has_error() || result < dataref.size()) {
+    if (!result.has_error())
+      eof = true;
+    else
+      err = true;
+  }
+  return {result + available_data, result.error};
+}
+
 int File::ungetc_unlocked(int c) {
   // There is no meaning to unget if:
   // 1. You are trying to push back EOF.
@@ -287,7 +329,7 @@ ErrorOr<int> File::seek(off_t offset, int whence) {
   FileLock lock(this);
   if (prev_op == FileOp::WRITE && pos > 0) {
 
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return Error(buf_result.error);
@@ -325,7 +367,7 @@ ErrorOr<off_t> File::tell() {
 
 int File::flush_unlocked() {
   if (prev_op == FileOp::WRITE && pos > 0) {
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return buf_result.error;
diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h
index 42e1d11..5c97a9c 100644
--- a/libc/src/__support/File/file.h
+++ b/libc/src/__support/File/file.h
@@ -280,6 +280,10 @@ private:
   FileIOResult write_unlocked_fbf(const uint8_t *data, size_t len);
   FileIOResult write_unlocked_nbf(const uint8_t *data, size_t len);
 
+  FileIOResult read_unlocked_fbf(uint8_t *data, size_t len);
+  FileIOResult read_unlocked_nbf(uint8_t *data, size_t len);
+  size_t copy_data_from_buf(uint8_t *data, size_t len);
+
   constexpr void adjust_buf() {
     if (read_allowed() && (buf == nullptr || bufsize == 0)) {
       // We should allow atleast one ungetc operation.
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 28fd9a1..9b359f6 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -1,16 +1,12 @@
-if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+# These utilities are GPU only.
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
-add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
-set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils)
-
 add_header_library(
   utils
   HDRS
     utils.h
-  DEPENDS
-    ${target_gpu_utils}
 )
 
 add_object_library(
@@ -21,6 +17,6 @@ add_object_library(
     allocator.h
   DEPENDS
     libc.src.__support.common
-    libc.src.__support.GPU.utils
     libc.src.__support.RPC.rpc_client
+    .utils
 )
diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt
deleted file mode 100644
index f2b98fc..0000000
--- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  amdgpu_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
deleted file mode 100644
index 6ab9540..0000000
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ /dev/null
@@ -1,183 +0,0 @@
-//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the AMDGPU backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of workgroups in the 'x' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workgroups in the 'y' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workgroups in the 'z' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workgruops in the grid.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_x() {
-  return __builtin_amdgcn_workgroup_id_x();
-}
-
-/// Returns the 'y' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_y() {
-  return __builtin_amdgcn_workgroup_id_y();
-}
-
-/// Returns the 'z' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_z() {
-  return __builtin_amdgcn_workgroup_id_z();
-}
-
-/// Returns the absolute id of the AMD workgroup.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of workitems in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workitems in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workitems in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workitems in the workgroup.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_x() {
-  return __builtin_amdgcn_workitem_id_x();
-}
-
-/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_y() {
-  return __builtin_amdgcn_workitem_id_y();
-}
-
-/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_z() {
-  return __builtin_amdgcn_workitem_id_z();
-}
-
-/// Returns the absolute id of the thread in the current AMD workgroup.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
-/// and compilation options.
-LIBC_INLINE uint32_t get_lane_size() {
-  return __builtin_amdgcn_wavefrontsize();
-}
-
-/// Returns the id of the thread inside of an AMD wavefront executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-/// Returns the bit-mask of active threads in the current wavefront.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __builtin_amdgcn_read_exec();
-}
-
-/// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
-                                                           uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() {
-  __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-}
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() {
-  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-}
-
-/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
-  __builtin_amdgcn_wave_barrier();
-}
-
-/// Shuffles the the lanes inside the wavefront according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
-                                                   uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-/// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the associated wavefront.
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
-
-/// Returns a unique identifier for the process cluster the current wavefront is
-/// executing on. Here we use the identifier for the compute unit (CU) and
-/// shader engine.
-/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
-/// than the one at
-/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt
deleted file mode 100644
index 68ba7d1..0000000
--- a/libc/src/__support/GPU/generic/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  generic_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
deleted file mode 100644
index 9461ef0..0000000
--- a/libc/src/__support/GPU/generic/utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-template <typename T> using Private = T;
-template <typename T> using Constant = T;
-template <typename T> using Shared = T;
-template <typename T> using Global = T;
-
-LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_blocks() { return 1; }
-
-LIBC_INLINE uint32_t get_block_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_block_id() { return 0; }
-
-LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_threads() { return 1; }
-
-LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_thread_id() { return 0; }
-
-LIBC_INLINE uint32_t get_lane_size() { return 1; }
-
-LIBC_INLINE uint32_t get_lane_id() { return 0; }
-
-LIBC_INLINE uint64_t get_lane_mask() { return 1; }
-
-LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
-
-LIBC_INLINE void sync_threads() {}
-
-LIBC_INLINE void sync_lane(uint64_t) {}
-
-LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t processor_clock() { return 0; }
-
-LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
-
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); }
-
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt
deleted file mode 100644
index 0d3f8c7..0000000
--- a/libc/src/__support/GPU/nvptx/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  nvptx_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
deleted file mode 100644
index 1a43a83..0000000
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ /dev/null
@@ -1,160 +0,0 @@
-//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-id: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the NVPTX backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of CUDA blocks in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __nvvm_read_ptx_sreg_nctaid_x();
-}
-
-/// Returns the number of CUDA blocks in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __nvvm_read_ptx_sreg_nctaid_y();
-}
-
-/// Returns the number of CUDA blocks in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __nvvm_read_ptx_sreg_nctaid_z();
-}
-
-/// Returns the total number of CUDA blocks.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-
-/// Returns the 'y' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
-
-/// Returns the 'z' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
-
-/// Returns the absolute id of the CUDA block.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of CUDA threads in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __nvvm_read_ptx_sreg_ntid_x();
-}
-
-/// Returns the number of CUDA threads in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __nvvm_read_ptx_sreg_ntid_y();
-}
-
-/// Returns the number of CUDA threads in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __nvvm_read_ptx_sreg_ntid_z();
-}
-
-/// Returns the total number of threads in the block.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
-
-/// Returns the 'y' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
-
-/// Returns the 'z' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
-
-/// Returns the absolute id of the thread in the current CUDA block.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-LIBC_INLINE uint32_t get_lane_size() { return 32; }
-
-/// Returns the id of the thread inside of a CUDA warp executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __nvvm_read_ptx_sreg_laneid();
-}
-
-/// Returns the bit-mask of active threads in the current warp.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __nvvm_activemask();
-}
-
-/// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
-                                                           uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t id = __builtin_ffs(mask) - 1;
-  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  return __nvvm_vote_ballot_sync(mask, x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
-
-/// Waits for all threads in the warp to reconverge for independent scheduling.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
-  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
-}
-
-/// Shuffles the the lanes inside the warp according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
-                                                   uint32_t idx, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t bitmask = (mask >> idx) & 1;
-  return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
-}
-
-/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the calling thread.
-[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
-
-/// Returns a unique identifier for the process cluster the current warp is
-/// executing on. Here we use the identifier for the symmetric multiprocessor.
-LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index ae52e7a..e138c84 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -9,48 +9,108 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif
 
+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_local T;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+LIBC_INLINE uint64_t get_num_blocks() {
+  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+LIBC_INLINE uint64_t get_num_threads() {
+  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }
 
-/// Conditional that is only true for a single thread in a lane.
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }
 
-/// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }
 
-/// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }
 
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index 7ac0c23..34601f8 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -10,9 +10,10 @@
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDVECTOR_H
 
 #include "src/__support/CPP/array.h"
-
 #include "src/__support/CPP/iterator.h"
+#include "src/__support/libc_assert.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memset.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,27 +24,32 @@ template <typename T, size_t CAPACITY> class FixedVector {
   size_t item_count = 0;
 
 public:
-  constexpr FixedVector() = default;
+  LIBC_INLINE constexpr FixedVector() = default;
 
   using iterator = typename cpp::array<T, CAPACITY>::iterator;
-  constexpr FixedVector(iterator begin, iterator end) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(iterator begin, iterator end)
+      : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
   using const_iterator = typename cpp::array<T, CAPACITY>::const_iterator;
-  constexpr FixedVector(const_iterator begin, const_iterator end)
+  LIBC_INLINE constexpr FixedVector(const_iterator begin, const_iterator end)
       : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
-  constexpr FixedVector(size_t count, const T &value) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(size_t count, const T &value)
+      : store{}, item_count{} {
+    LIBC_ASSERT(count <= CAPACITY);
     for (size_t i = 0; i < count; ++i)
       push_back(value);
   }
 
-  constexpr bool push_back(const T &obj) {
+  LIBC_INLINE constexpr bool push_back(const T &obj) {
     if (item_count == CAPACITY)
       return false;
     store[item_count] = obj;
@@ -51,27 +57,43 @@ public:
     return true;
   }
 
-  constexpr const T &back() const { return store[item_count - 1]; }
+  LIBC_INLINE constexpr const T &back() const {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr T &back() { return store[item_count - 1]; }
+  LIBC_INLINE constexpr T &back() {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr bool pop_back() {
+  LIBC_INLINE constexpr bool pop_back() {
     if (item_count == 0)
       return false;
+    inline_memset(&store[item_count - 1], 0, sizeof(T));
     --item_count;
     return true;
   }
 
-  constexpr T &operator[](size_t idx) { return store[idx]; }
+  LIBC_INLINE constexpr T &operator[](size_t idx) {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr const T &operator[](size_t idx) const { return store[idx]; }
+  LIBC_INLINE constexpr const T &operator[](size_t idx) const {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr bool empty() const { return item_count == 0; }
+  LIBC_INLINE constexpr bool empty() const { return item_count == 0; }
 
-  constexpr size_t size() const { return item_count; }
+  LIBC_INLINE constexpr size_t size() const { return item_count; }
 
   // Empties the store for all practical purposes.
-  constexpr void reset() { item_count = 0; }
+  LIBC_INLINE constexpr void reset() {
+    inline_memset(store.data(), 0, sizeof(T) * item_count);
+    item_count = 0;
+  }
 
   // This static method does not free up the resources held by |store|,
   // say by calling `free` or something similar. It just does the equivalent
@@ -81,7 +103,9 @@ public:
   // dynamically allocated storate. So, the `destroy` method like this
   // matches the `destroy` API of those other data structures so that users
   // can easily swap one data structure for the other.
-  static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
+  LIBC_INLINE static void destroy(FixedVector<T, CAPACITY> *store) {
+    store->reset();
+  }
 
   using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
   LIBC_INLINE constexpr reverse_iterator rbegin() {
diff --git a/libc/src/__support/threads/thread.cpp b/libc/src/__support/threads/thread.cpp
index dad4f75..6f6b75b 100644
--- a/libc/src/__support/threads/thread.cpp
+++ b/libc/src/__support/threads/thread.cpp
@@ -117,7 +117,9 @@ public:
 
   int add_callback(AtExitCallback *callback, void *obj) {
     cpp::lock_guard lock(mtx);
-    return callback_list.push_back({callback, obj});
+    if (callback_list.push_back({callback, obj}))
+      return 0;
+    return -1;
   }
 
   void call() {
diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index c76ec14..00e976a 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -9,9 +9,12 @@
 #include "src/compiler/__stack_chk_fail.h"
 #include "src/__support/OSUtil/io.h"
 #include "src/stdlib/abort.h"
+#include <stdint.h> // For uintptr_t
 
 extern "C" {
 
+uintptr_t __stack_chk_guard = static_cast<uintptr_t>(0xa9fff01234);
+
 void __stack_chk_fail(void) {
   LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n");
   LIBC_NAMESPACE::abort();
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index f02d502..daaf1a3 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
 ```
 - Add function specs to the file:
 ```
-  libc/hdrgen/yaml/math.yaml
+  libc/include/math.yaml
 ```
 
 ## Implementation
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b3d4612..382f5b3 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -358,7 +358,6 @@ add_header_library(
   HDRS
     sincosf16_utils.h
   DEPENDS
-    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.nearest_integer
     libc.src.__support.common
@@ -1702,8 +1701,6 @@ add_header_library(
     libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
     libc.src.__support.common
diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h
index d741318..975fd01 100644
--- a/libc/src/math/generic/exp10f_impl.h
+++ b/libc/src/math/generic/exp10f_impl.h
@@ -10,12 +10,9 @@
 #define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H
 
 #include "explogxf.h"
-#include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h
index bcab82f..06aeb49 100644
--- a/libc/src/math/generic/range_reduction_double_common.h
+++ b/libc/src/math/generic/range_reduction_double_common.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/double_double.h"
 #include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/FPUtil/multiply_add.h"
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h
index 5e5edd4..87b1dde 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/math/generic/sincosf16_utils.h
@@ -9,9 +9,7 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/pthread/pthread_condattr_init.cpp b/libc/src/pthread/pthread_condattr_init.cpp
index 12005b8..b360804 100644
--- a/libc/src/pthread/pthread_condattr_init.cpp
+++ b/libc/src/pthread/pthread_condattr_init.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 
-#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
-#include <time.h>    // CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
index 37fbd6b..5e825d5 100644
--- a/libc/src/pthread/pthread_condattr_setclock.cpp
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -12,9 +12,9 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <pthread.h>   // pthread_condattr_t
-#include <sys/types.h> // clockid_t
-#include <time.h>      // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t
+#include <sys/types.h>       // clockid_t
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h
index 9720c54..e9d163d 100644
--- a/libc/src/stdlib/exit_handler.h
+++ b/libc/src/stdlib/exit_handler.h
@@ -48,7 +48,7 @@ LIBC_INLINE void stdc_at_exit_func(void *payload) {
 LIBC_INLINE void call_exit_callbacks(ExitCallbackList &callbacks) {
   handler_list_mtx.lock();
   while (!callbacks.empty()) {
-    AtExitUnit &unit = callbacks.back();
+    AtExitUnit unit = callbacks.back();
     callbacks.pop_back();
     handler_list_mtx.unlock();
     unit.callback(unit.payload);
diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index ccb9ec5..b969977 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,11 +18,12 @@ namespace internal {
 // A simple in-place heapsort implementation.
 // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
 
-LIBC_INLINE void heap_sort(const Array &array) {
-  size_t end = array.size();
+template <typename A, typename F>
+LIBC_INLINE void heap_sort(const A &array, const F &is_less) {
+  size_t end = array.len();
   size_t start = end / 2;
 
-  auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
+  const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
 
   while (end > 1) {
     if (start > 0) {
@@ -40,12 +41,11 @@ LIBC_INLINE void heap_sort(const Array &array) {
     while (left_child(root) < end) {
       size_t child = left_child(root);
       // If there are two children, set child to the greater.
-      if (child + 1 < end &&
-          array.elem_compare(child, array.get(child + 1)) < 0)
+      if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1)))
         ++child;
 
       // If the root is less than the greater child
-      if (array.elem_compare(root, array.get(child)) >= 0)
+      if (!is_less(array.get(root), array.get(child)))
         break;
 
       // Swap the root with the greater child and continue sifting down.
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 65a63c2..0bf5fc7 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,14 +18,12 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, qsort,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *))) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare);
 
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
+  const auto is_less = [compare](const void *a, const void *b) -> bool {
+    return compare(a, b) < 0;
+  };
 
-  internal::sort(arr);
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index c529d55..aa6d9bb 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -17,91 +17,122 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
-  union {
-    Compare *comp_func;
-    CompareWithState *comp_func_r;
-  };
-  const CompType comp_type;
-
-  void *arg;
-
-  Comparator(Compare *func)
-      : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
-  Comparator(CompareWithState *func, void *arg_val)
-      : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
-        arg(arg_val) {}
-
-#if defined(__clang__)
-  // Recent upstream changes to -fsanitize=function find more instances of
-  // function type mismatches. One case is with the comparator passed to this
-  // class. Libraries will tend to pass comparators that take pointers to
-  // varying types while this comparator expects to accept const void pointers.
-  // Ideally those tools would pass a function that strictly accepts const
-  // void*s to avoid UB, or would use qsort_r to pass their own comparator.
-  [[clang::no_sanitize("function")]]
-#endif
-  int comp_vals(const void *a, const void *b) const {
-    if (comp_type == CompType::COMPARE) {
-      return comp_func(a, b);
-    } else {
-      return comp_func_r(a, b, arg);
+class ArrayGenericSize {
+  cpp::byte *array_base;
+  size_t array_len;
+  size_t elem_size;
+
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * elem_size);
+  }
+
+public:
+  LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s),
+        elem_size(e) {}
+
+  static constexpr bool has_fixed_size() { return false; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    // It's possible to use 8 byte blocks with `uint64_t`, but that
+    // generates more machine code as the remainder loop gets
+    // unrolled, plus 4 byte operations are more likely to be
+    // efficient on a wider variety of hardware. On x86 LLVM tends
+    // to unroll the block loop again into 2 16 byte swaps per
+    // iteration which is another reason that 4 byte blocks yields
+    // good performance even for big types.
+    using block_t = uint32_t;
+    constexpr size_t BLOCK_SIZE = sizeof(block_t);
+
+    alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+    const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
+
+    while (elem_i != elem_i_block_end) {
+      __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
+      __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
+      __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
+
+      elem_i += BLOCK_SIZE;
+      elem_j += BLOCK_SIZE;
+    }
+
+    for (size_t n = 0; n < elem_size_rem; ++n) {
+      cpp::byte tmp = elem_i[n];
+      elem_i[n] = elem_j[n];
+      elem_j[n] = tmp;
     }
   }
+
+  LIBC_INLINE size_t len() const { return array_len; }
+
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const {
+    return ArrayGenericSize(get_internal(i), s, elem_size);
+  }
+
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
+  }
 };
 
-class Array {
-  uint8_t *array;
-  size_t array_size;
-  size_t elem_size;
-  Comparator compare;
+// Having a specialized Array type for sorting that knows at
+// compile-time what the size of the element is, allows for much more
+// efficient swapping and for cheaper offset calculations.
+template <size_t ELEM_SIZE> class ArrayFixedSize {
+  cpp::byte *array_base;
+  size_t array_len;
 
-public:
-  Array(uint8_t *a, size_t s, size_t e, Comparator c)
-      : array(a), array_size(s), elem_size(e), compare(c) {}
-
-  uint8_t *get(size_t i) const { return array + i * elem_size; }
-
-  void swap(size_t i, size_t j) const {
-    uint8_t *elem_i = get(i);
-    uint8_t *elem_j = get(j);
-    for (size_t b = 0; b < elem_size; ++b) {
-      uint8_t temp = elem_i[b];
-      elem_i[b] = elem_j[b];
-      elem_j[b] = temp;
-    }
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * ELEM_SIZE);
   }
 
-  int elem_compare(size_t i, const uint8_t *other) const {
-    // An element must compare equal to itself so we don't need to consult the
-    // user provided comparator.
-    if (get(i) == other)
-      return 0;
-    return compare.comp_vals(get(i), other);
+public:
+  LIBC_INLINE ArrayFixedSize(void *a, size_t s)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s) {}
+
+  // Beware this function is used a heuristic for cheap to swap types, so
+  // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
+  // idea perf wise.
+  static constexpr bool has_fixed_size() { return true; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    alignas(32) cpp::byte tmp[ELEM_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
+    __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
+    __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
   }
 
-  size_t size() const { return array_size; }
+  LIBC_INLINE size_t len() const { return array_len; }
 
-  // Make an Array starting at index |i| and size |s|.
-  LIBC_INLINE Array make_array(size_t i, size_t s) const {
-    return Array(get(i), s, elem_size, compare);
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayFixedSize<ELEM_SIZE> make_array(size_t i, size_t s) const {
+    return ArrayFixedSize<ELEM_SIZE>(get_internal(i), s);
   }
 
-  // Reset this Array to point at a different interval of the same items.
-  LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) {
-    array = a;
-    array_size = s;
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
   }
 };
 
-using SortingRoutine = void(const Array &);
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
new file mode 100644
index 0000000..b27e746
--- /dev/null
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -0,0 +1,85 @@
+//===-- Implementation header for qsort utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+
+#include <stddef.h>  // For size_t
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Recursively select a pseudomedian if above this threshold.
+constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
+
+// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters.
+//
+// This chooses a pivot by sampling an adaptive amount of points, approximating
+// the quality of a median of sqrt(n) elements.
+template <typename A, typename F>
+size_t choose_pivot(const A &array, const F &is_less) {
+  const size_t len = array.len();
+
+  if (len < 8) {
+    return 0;
+  }
+
+  const size_t len_div_8 = len / 8;
+
+  const size_t a = 0;             // [0, floor(n/8))
+  const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8))
+  const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8))
+
+  if (len < PSEUDO_MEDIAN_REC_THRESHOLD)
+    return median3(array, a, b, c, is_less);
+  else
+    return median3_rec(array, a, b, c, len_div_8, is_less);
+}
+
+// Calculates an approximate median of 3 elements from sections a, b, c, or
+// recursively from an approximation of each, if they're large enough. By
+// dividing the size of each section by 8 when recursing we have logarithmic
+// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
+// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
+template <typename A, typename F>
+size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n,
+                   const F &is_less) {
+  if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
+    const size_t n8 = n / 8;
+    a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less);
+    b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less);
+    c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less);
+  }
+  return median3(array, a, b, c, is_less);
+}
+
+/// Calculates the median of 3 elements.
+template <typename A, typename F>
+size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
+  const void *a_ptr = array.get(a);
+  const void *b_ptr = array.get(b);
+  const void *c_ptr = array.get(c);
+
+  const bool x = is_less(a_ptr, b_ptr);
+  const bool y = is_less(a_ptr, c_ptr);
+  if (x == y) {
+    // If x=y=0 then b, c <= a. In this case we want to return max(b, c).
+    // If x=y=1 then a < b, c. In this case we want to return min(b, c).
+    // By toggling the outcome of b < c using XOR x we get this behavior.
+    const bool z = is_less(b_ptr, c_ptr);
+    return z ^ x ? c : b;
+  } else {
+    // Either c <= a < b or b <= a < c, thus a is our median.
+    return a;
+  }
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index bf61a40..4e60998 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -19,13 +19,12 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *, void *),
                     void *arg)) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare, arg);
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
 
-  internal::sort(arr);
+  const auto is_less = [compare, arg](const void *a, const void *b) -> bool {
+    return compare(a, b, arg) < 0;
+  };
+
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index d42adde..7882b82 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,11 +27,48 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
-constexpr auto sort = quick_sort;
-#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
-constexpr auto sort = heap_sort;
-#endif
+template <bool USE_QUICKSORT, typename F>
+LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len,
+                                    size_t elem_size, const F &is_less) {
+  if (array == nullptr || array_len == 0 || elem_size == 0)
+    return;
+
+  if constexpr (USE_QUICKSORT) {
+    switch (elem_size) {
+    case 4: {
+      auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 8: {
+      auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 16: {
+      auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    default:
+      auto arr_generic_size =
+          internal::ArrayGenericSize(array, array_len, elem_size);
+      quick_sort(arr_generic_size, is_less);
+      return;
+    }
+  } else {
+    auto arr_generic_size =
+        internal::ArrayGenericSize(array, array_len, elem_size);
+    heap_sort(arr_generic_size, is_less);
+  }
+}
+
+template <typename F>
+LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size,
+                               const F &is_less) {
+#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT))
+  unstable_sort_impl<USE_QUICK_SORT, F>(array, array_len, elem_size, is_less);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7..9ab2830 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,84 +9,175 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
-  const size_t array_size = array.size();
-  size_t pivot_index = array_size / 2;
-  uint8_t *pivot = array.get(pivot_index);
-  size_t i = 0;
-  size_t j = array_size - 1;
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
+                                               const void *pivot,
+                                               const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = 0;
+
+  while (right < array_len) {
+    const bool right_is_lt = is_less(array.get(right), pivot);
+    array.swap(left, right);
+    left += static_cast<size_t>(right_is_lt);
+    right += 1;
+  }
+
+  return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
+                                           const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = array_len;
 
   while (true) {
-    int compare_i, compare_j;
-
-    while ((compare_i = array.elem_compare(i, pivot)) < 0)
-      ++i;
-    while ((compare_j = array.elem_compare(j, pivot)) > 0)
-      --j;
-
-    // At some point i will crossover j so we will definitely break out of
-    // this while loop.
-    if (i >= j)
-      return j + 1;
-
-    array.swap(i, j);
-
-    // The pivot itself might have got swapped so we will update the pivot.
-    if (i == pivot_index) {
-      pivot = array.get(j);
-      pivot_index = j;
-    } else if (j == pivot_index) {
-      pivot = array.get(i);
-      pivot_index = i;
+    while (left < right && is_less(array.get(left), pivot))
+      ++left;
+
+    while (true) {
+      --right;
+      if (left >= right || is_less(array.get(right), pivot)) {
+        break;
+      }
     }
 
-    if (compare_i == 0 && compare_j == 0) {
-      // If we do not move the pointers, we will end up with an
-      // infinite loop as i and j will be stuck without advancing.
-      ++i;
-      --j;
-    }
+    if (left >= right)
+      break;
+
+    array.swap(left, right);
+    ++left;
+  }
+
+  return left;
+}
+
+template <typename A, typename F>
+LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
+                             const F &is_less) {
+  // Place the pivot at the beginning of the array.
+  if (pivot_index != 0) {
+    array.swap(0, pivot_index);
   }
+
+  const A array_without_pivot = array.make_array(1, array.len() - 1);
+  const void *pivot = array.get(0);
+
+  size_t num_lt;
+  if constexpr (A::has_fixed_size()) {
+    // Branchless Lomuto avoid branch misprediction penalties, but
+    // it also swaps more often which is only faster if the swap is a fast
+    // constant operation.
+    num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+  } else {
+    num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+  }
+
+  // Place the pivot between the two partitions.
+  array.swap(0, num_lt);
+
+  return num_lt;
 }
 
-LIBC_INLINE void quick_sort(Array array) {
+template <typename A, typename F>
+LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
+                                 size_t limit, const F &is_less) {
   while (true) {
-    const size_t array_size = array.size();
-    if (array_size <= 1)
+    const size_t array_len = array.len();
+    if (array_len <= 1)
       return;
-    size_t split_index = partition(array);
-    if (array_size == 2)
-      // The partition operation sorts the two element array.
+
+    // If too many bad pivot choices were made, simply fall back to
+    // heapsort in order to guarantee `O(N x log(N))` worst-case.
+    if (limit == 0) {
+      heap_sort(array, is_less);
       return;
+    }
 
-    // Make Arrays describing the two sublists that still need sorting.
-    Array left = array.make_array(0, split_index);
-    Array right = array.make_array(split_index, array.size() - split_index);
-
-    // Recurse to sort the smaller of the two, and then loop round within this
-    // function to sort the larger. This way, recursive call depth is bounded
-    // by log2 of the total array size, because every recursive call is sorting
-    // a list at most half the length of the one in its caller.
-    if (left.size() < right.size()) {
-      quick_sort(left);
-      array.reset_bounds(right.get(0), right.size());
-    } else {
-      quick_sort(right);
-      array.reset_bounds(left.get(0), left.size());
+    limit -= 1;
+
+    const size_t pivot_index = choose_pivot(array, is_less);
+
+    // If the chosen pivot is equal to the predecessor, then it's the smallest
+    // element in the slice. Partition the slice into elements equal to and
+    // elements greater than the pivot. This case is usually hit when the slice
+    // contains many duplicate elements.
+    if (ancestor_pivot) {
+      if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+        const size_t num_lt =
+            partition(array, pivot_index,
+                      [is_less](const void *a, const void *b) -> bool {
+                        return !is_less(b, a);
+                      });
+
+        // Continue sorting elements greater than the pivot. We know that
+        // `num_lt` cont
+        array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+        ancestor_pivot = nullptr;
+        continue;
+      }
     }
+
+    size_t split_index = partition(array, pivot_index, is_less);
+
+    if (array_len == 2)
+      // The partition operation sorts the two element array.
+      return;
+
+    // Split the array into `left`, `pivot`, and `right`.
+    A left = array.make_array(0, split_index);
+    const void *pivot = array.get(split_index);
+    const size_t right_start = split_index + 1;
+    A right = array.make_array(right_start, array.len() - right_start);
+
+    // Recurse into the left side. We have a fixed recursion limit,
+    // testing shows no real benefit for recursing into the shorter
+    // side.
+    quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+    // Continue with the right side.
+    array = right;
+    ancestor_pivot = pivot;
   }
 }
 
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
+template <typename A, typename F>
+LIBC_INLINE void quick_sort(A &array, const F &is_less) {
+  const void *ancestor_pivot = nullptr;
+  // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+  // The binary OR by one is used to eliminate the zero-check in the logarithm.
+  const size_t limit = 2 * ilog2((array.len() | 1));
+  quick_sort_impl(array, ancestor_pivot, limit, is_less);
+}
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt
index ae835dc..ef9bfe5 100644
--- a/libc/src/time/CMakeLists.txt
+++ b/libc/src/time/CMakeLists.txt
@@ -2,6 +2,17 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
+add_header_library(
+  time_constants
+  HDRS
+    time_constants.h
+  DEPENDS
+    libc.include.time
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.string_view
+    libc.hdr.types.time_t
+)
+
 add_object_library(
   time_utils
   SRCS
@@ -12,6 +23,10 @@ add_object_library(
     libc.include.time
     libc.src.__support.CPP.limits
     libc.src.errno.errno
+    .time_constants
+    libc.hdr.types.time_t
+    libc.hdr.types.size_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -22,7 +37,9 @@ add_entrypoint_object(
     asctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -33,7 +50,9 @@ add_entrypoint_object(
     asctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -44,6 +63,7 @@ add_entrypoint_object(
     ctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -56,6 +76,7 @@ add_entrypoint_object(
     ctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -68,6 +89,7 @@ add_entrypoint_object(
     difftime.h
   DEPENDS
     libc.include.time
+    libc.hdr.types.time_t
 )
 
 add_entrypoint_object(
@@ -79,6 +101,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -90,6 +114,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -100,8 +126,11 @@ add_entrypoint_object(
     mktime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
     libc.src.errno.errno
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -115,6 +144,7 @@ add_entrypoint_object(
     libc.hdr.types.time_t
     libc.src.__support.time.clock_gettime
     libc.src.errno.errno
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
diff --git a/libc/src/time/asctime.cpp b/libc/src/time/asctime.cpp
index d6fbe73..2b00c41 100644
--- a/libc/src/time/asctime.cpp
+++ b/libc/src/time/asctime.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime, (const struct tm *timeptr)) {
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime.h b/libc/src/time/asctime.h
index 623e6df..37325e7 100644
--- a/libc/src/time/asctime.h
+++ b/libc/src/time/asctime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/asctime_r.cpp b/libc/src/time/asctime_r.cpp
index caa22f1..bf53bfd 100644
--- a/libc/src/time/asctime_r.cpp
+++ b/libc/src/time/asctime_r.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime_r.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime_r,
                    (const struct tm *timeptr, char *buffer)) {
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime_r.h b/libc/src/time/asctime_r.h
index 328b7df..65a6b84 100644
--- a/libc/src/time/asctime_r.h
+++ b/libc/src/time/asctime_r.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/ctime.cpp b/libc/src/time/ctime.cpp
index 8adae9b..ac0ffe5 100644
--- a/libc/src/time/ctime.cpp
+++ b/libc/src/time/ctime.cpp
@@ -6,23 +6,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime.h"
+#include "src/time/ctime.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime, (const time_t *t_ptr)) {
   if (t_ptr == nullptr || *t_ptr > cpp::numeric_limits<int32_t>::max()) {
     return nullptr;
   }
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/ctime_r.cpp b/libc/src/time/ctime_r.cpp
index 63d93c4..7224f77 100644
--- a/libc/src/time/ctime_r.cpp
+++ b/libc/src/time/ctime_r.cpp
@@ -6,16 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime_r.h"
+#include "src/time/ctime_r.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   if (t_ptr == nullptr || buffer == nullptr ||
       *t_ptr > cpp::numeric_limits<int32_t>::max()) {
@@ -23,7 +22,7 @@ LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   }
 
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/difftime.h b/libc/src/time/difftime.h
index d5cd593..12de567 100644
--- a/libc/src/time/difftime.h
+++ b/libc/src/time/difftime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_DIFFTIME_H
 #define LLVM_LIBC_SRC_TIME_DIFFTIME_H
 
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime.h b/libc/src/time/gmtime.h
index 3de3ceb..ac7f1be 100644
--- a/libc/src/time/gmtime.h
+++ b/libc/src/time/gmtime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime_r.h b/libc/src/time/gmtime_r.h
index b4f387e..4c88b22 100644
--- a/libc/src/time/gmtime_r.h
+++ b/libc/src/time/gmtime_r.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_R_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_R_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp
index add5b27..8609c5c 100644
--- a/libc/src/time/gpu/clock.cpp
+++ b/libc/src/time/gpu/clock.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock.h"
+
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index a92f660..d22d9d6 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -8,6 +8,7 @@
 
 #include "src/time/nanosleep.h"
 
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/mktime.cpp b/libc/src/time/mktime.cpp
index 72cd2291..3874cad 100644
--- a/libc/src/time/mktime.cpp
+++ b/libc/src/time/mktime.cpp
@@ -9,15 +9,11 @@
 #include "src/time/mktime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
-static constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
-                                                      31, 31, 30, 31, 30, 31};
-
 // Returns number of years from (1, year).
 static constexpr int64_t get_num_of_leap_years_before(int64_t year) {
   return (year / 4) - (year / 100) + (year / 400);
@@ -31,12 +27,12 @@ static constexpr bool is_leap_year(const int64_t year) {
 LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Unlike most C Library functions, mktime doesn't just die on bad input.
   // TODO(rtenneti); Handle leap seconds.
-  int64_t tm_year_from_base = tm_out->tm_year + TimeConstants::TIME_YEAR_BASE;
+  int64_t tm_year_from_base = tm_out->tm_year + time_constants::TIME_YEAR_BASE;
 
   // 32-bit end-of-the-world is 03:14:07 UTC on 19 January 2038.
   if (sizeof(time_t) == 4 &&
-      tm_year_from_base >= TimeConstants::END_OF32_BIT_EPOCH_YEAR) {
-    if (tm_year_from_base > TimeConstants::END_OF32_BIT_EPOCH_YEAR)
+      tm_year_from_base >= time_constants::END_OF32_BIT_EPOCH_YEAR) {
+    if (tm_year_from_base > time_constants::END_OF32_BIT_EPOCH_YEAR)
       return time_utils::out_of_range();
     if (tm_out->tm_mon > 0)
       return time_utils::out_of_range();
@@ -64,7 +60,7 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
 
   // Calculate number of months and years from tm_mon.
   int64_t month = tm_out->tm_mon;
-  if (month < 0 || month >= TimeConstants::MONTHS_PER_YEAR - 1) {
+  if (month < 0 || month >= time_constants::MONTHS_PER_YEAR - 1) {
     int64_t years = month / 12;
     month %= 12;
     if (month < 0) {
@@ -78,23 +74,23 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Calculate total number of days based on the month and the day (tm_mday).
   int64_t total_days = tm_out->tm_mday - 1;
   for (int64_t i = 0; i < month; ++i)
-    total_days += NON_LEAP_YEAR_DAYS_IN_MONTH[i];
+    total_days += time_constants::NON_LEAP_YEAR_DAYS_IN_MONTH[i];
   // Add one day if it is a leap year and the month is after February.
   if (tm_year_is_leap && month > 1)
     total_days++;
 
   // Calculate total numbers of days based on the year.
-  total_days += (tm_year_from_base - TimeConstants::EPOCH_YEAR) *
-                TimeConstants::DAYS_PER_NON_LEAP_YEAR;
-  if (tm_year_from_base >= TimeConstants::EPOCH_YEAR) {
+  total_days += (tm_year_from_base - time_constants::EPOCH_YEAR) *
+                time_constants::DAYS_PER_NON_LEAP_YEAR;
+  if (tm_year_from_base >= time_constants::EPOCH_YEAR) {
     total_days += get_num_of_leap_years_before(tm_year_from_base - 1) -
-                  get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR);
+                  get_num_of_leap_years_before(time_constants::EPOCH_YEAR);
   } else if (tm_year_from_base >= 1) {
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(tm_year_from_base - 1);
   } else {
     // Calculate number of leap years until 0th year.
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(0);
     if (tm_year_from_base <= 0) {
       total_days -= 1; // Subtract 1 for 0th year.
@@ -106,11 +102,12 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
     }
   }
 
-  // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
+  // TODO: https://github.com/llvm/llvm-project/issues/121962
+  // Need to handle timezone and update of tm_isdst.
   int64_t seconds = tm_out->tm_sec +
-                    tm_out->tm_min * TimeConstants::SECONDS_PER_MIN +
-                    tm_out->tm_hour * TimeConstants::SECONDS_PER_HOUR +
-                    total_days * TimeConstants::SECONDS_PER_DAY;
+                    tm_out->tm_min * time_constants::SECONDS_PER_MIN +
+                    tm_out->tm_hour * time_constants::SECONDS_PER_HOUR +
+                    total_days * time_constants::SECONDS_PER_DAY;
 
   // Update the tm structure's year, month, day, etc. from seconds.
   if (time_utils::update_from_seconds(seconds, tm_out) < 0)
diff --git a/libc/src/time/mktime.h b/libc/src/time/mktime.h
index 2b4c679..985c629 100644
--- a/libc/src/time/mktime.h
+++ b/libc/src/time/mktime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_MKTIME_H
 #define LLVM_LIBC_SRC_TIME_MKTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp
index 4a0b614..860909a 100644
--- a/libc/src/time/time.cpp
+++ b/libc/src/time/time.cpp
@@ -6,12 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/time/time_func.h"
+
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/time_func.h"
 
 namespace LIBC_NAMESPACE_DECL {
 // avoid inconsitent clang-format behavior
diff --git a/libc/src/time/time_constants.h b/libc/src/time/time_constants.h
new file mode 100644
index 0000000..3e25f74
--- /dev/null
+++ b/libc/src/time/time_constants.h
@@ -0,0 +1,100 @@
+//===-- Collection of constants for time functions --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+#define LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+
+#include "hdr/types/time_t.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/string_view.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace time_constants {
+
+enum Month : int {
+  JANUARY,
+  FEBRUARY,
+  MARCH,
+  APRIL,
+  MAY,
+  JUNE,
+  JULY,
+  AUGUST,
+  SEPTEMBER,
+  OCTOBER,
+  NOVEMBER,
+  DECEMBER
+};
+
+constexpr int SECONDS_PER_MIN = 60;
+constexpr int MINUTES_PER_HOUR = 60;
+constexpr int HOURS_PER_DAY = 24;
+constexpr int DAYS_PER_WEEK = 7;
+constexpr int MONTHS_PER_YEAR = 12;
+constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
+constexpr int DAYS_PER_LEAP_YEAR = 366;
+
+constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
+constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
+constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
+    DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
+
+constexpr int TIME_YEAR_BASE = 1900;
+constexpr int EPOCH_YEAR = 1970;
+constexpr int EPOCH_WEEK_DAY = 4;
+
+// For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
+// not within the normal ranges as defined in <time.h>, or if struct tm's
+// tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
+// would attempt to generate more than 26 bytes of output (including the
+// terminating null).
+constexpr int ASCTIME_BUFFER_SIZE = 256;
+constexpr int ASCTIME_MAX_BYTES = 26;
+
+/* 2000-03-01 (mod 400 year, immediately after feb29 */
+constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
+    (946684800LL + SECONDS_PER_DAY * (31 + 29));
+constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
+
+constexpr int DAYS_PER400_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
+constexpr int DAYS_PER100_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
+constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
+
+// The latest time that can be represented in this form is 03:14:07 UTC on
+// Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
+// start of the epoch). This means that systems using a 32-bit time_t type are
+// susceptible to the Year 2038 problem.
+constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
+
+constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_NAMES = {
+    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_FULL_NAMES = {
+    "Sunday",   "Monday", "Tuesday", "Wednesday",
+    "Thursday", "Friday", "Saturday"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_NAMES = {
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_FULL_NAMES = {
+    "January", "February", "March",     "April",   "May",      "June",
+    "July",    "August",   "September", "October", "November", "December"};
+
+constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
+                                               31, 31, 30, 31, 30, 31};
+
+} // namespace time_constants
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
diff --git a/libc/src/time/time_utils.cpp b/libc/src/time/time_utils.cpp
index 509cad8..abc93b8 100644
--- a/libc/src/time/time_utils.cpp
+++ b/libc/src/time/time_utils.cpp
@@ -10,12 +10,11 @@
 #include "src/__support/CPP/limits.h" // INT_MIN, INT_MAX
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 static int64_t computeRemainingYears(int64_t daysPerYears,
                                      int64_t quotientYears,
                                      int64_t *remainingDays) {
@@ -52,36 +51,36 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
       (sizeof(time_t) == 4)
           ? INT_MIN
           : INT_MIN * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
   constexpr time_t time_max =
       (sizeof(time_t) == 4)
           ? INT_MAX
           : INT_MAX * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
 
   time_t ts = static_cast<time_t>(total_seconds);
   if (ts < time_min || ts > time_max)
     return time_utils::out_of_range();
 
   int64_t seconds =
-      total_seconds - TimeConstants::SECONDS_UNTIL2000_MARCH_FIRST;
-  int64_t days = seconds / TimeConstants::SECONDS_PER_DAY;
-  int64_t remainingSeconds = seconds % TimeConstants::SECONDS_PER_DAY;
+      total_seconds - time_constants::SECONDS_UNTIL2000_MARCH_FIRST;
+  int64_t days = seconds / time_constants::SECONDS_PER_DAY;
+  int64_t remainingSeconds = seconds % time_constants::SECONDS_PER_DAY;
   if (remainingSeconds < 0) {
-    remainingSeconds += TimeConstants::SECONDS_PER_DAY;
+    remainingSeconds += time_constants::SECONDS_PER_DAY;
     days--;
   }
 
-  int64_t wday = (TimeConstants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
-                 TimeConstants::DAYS_PER_WEEK;
+  int64_t wday = (time_constants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
+                 time_constants::DAYS_PER_WEEK;
   if (wday < 0)
-    wday += TimeConstants::DAYS_PER_WEEK;
+    wday += time_constants::DAYS_PER_WEEK;
 
   // Compute the number of 400 year cycles.
-  int64_t numOfFourHundredYearCycles = days / TimeConstants::DAYS_PER400_YEARS;
-  int64_t remainingDays = days % TimeConstants::DAYS_PER400_YEARS;
+  int64_t numOfFourHundredYearCycles = days / time_constants::DAYS_PER400_YEARS;
+  int64_t remainingDays = days % time_constants::DAYS_PER400_YEARS;
   if (remainingDays < 0) {
-    remainingDays += TimeConstants::DAYS_PER400_YEARS;
+    remainingDays += time_constants::DAYS_PER400_YEARS;
     numOfFourHundredYearCycles--;
   }
 
@@ -89,17 +88,17 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // "four hundred year cycles" will be 4 hundred year cycles or less in 400
   // years.
   int64_t numOfHundredYearCycles = computeRemainingYears(
-      TimeConstants::DAYS_PER100_YEARS, 4, &remainingDays);
+      time_constants::DAYS_PER100_YEARS, 4, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "hundred year cycles" will be 25 four year cycles or less in 100 years.
-  int64_t numOfFourYearCycles =
-      computeRemainingYears(TimeConstants::DAYS_PER4_YEARS, 25, &remainingDays);
+  int64_t numOfFourYearCycles = computeRemainingYears(
+      time_constants::DAYS_PER4_YEARS, 25, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "four year cycles" will be 4 one year cycles or less in 4 years.
   int64_t remainingYears = computeRemainingYears(
-      TimeConstants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
+      time_constants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
 
   // Calculate number of years from year 2000.
   int64_t years = remainingYears + 4 * numOfFourYearCycles +
@@ -112,8 +111,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // We add 31 and 28 for the number of days in January and February, since our
   // starting point was March 1st.
   int64_t yday = remainingDays + 31 + 28 + leapDay;
-  if (yday >= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay)
-    yday -= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay;
+  if (yday >= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay)
+    yday -= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay;
 
   int64_t months = 0;
   while (daysInMonth[months] <= remainingDays) {
@@ -121,8 +120,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
     months++;
   }
 
-  if (months >= TimeConstants::MONTHS_PER_YEAR - 2) {
-    months -= TimeConstants::MONTHS_PER_YEAR;
+  if (months >= time_constants::MONTHS_PER_YEAR - 2) {
+    months -= time_constants::MONTHS_PER_YEAR;
     years++;
   }
 
@@ -131,19 +130,19 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
 
   // All the data (years, month and remaining days) was calculated from
   // March, 2000. Thus adjust the data to be from January, 1900.
-  tm->tm_year = static_cast<int>(years + 2000 - TimeConstants::TIME_YEAR_BASE);
+  tm->tm_year = static_cast<int>(years + 2000 - time_constants::TIME_YEAR_BASE);
   tm->tm_mon = static_cast<int>(months + 2);
   tm->tm_mday = static_cast<int>(remainingDays + 1);
   tm->tm_wday = static_cast<int>(wday);
   tm->tm_yday = static_cast<int>(yday);
 
   tm->tm_hour =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_HOUR);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_HOUR);
   tm->tm_min =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_MIN %
-                       TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_MIN %
+                       time_constants::SECONDS_PER_MIN);
   tm->tm_sec =
-      static_cast<int>(remainingSeconds % TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds % time_constants::SECONDS_PER_MIN);
   // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
   tm->tm_isdst = 0;
 
diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
index 552ea92..5e0a692 100644
--- a/libc/src/time/time_utils.h
+++ b/libc/src/time/time_utils.h
@@ -9,79 +9,19 @@
 #ifndef LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 #define LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 
-#include <stddef.h> // For size_t.
-
+#include "hdr/types/size_t.h"
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/mktime.h"
+#include "time_constants.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-enum Month : int {
-  JANUARY,
-  FEBRUARY,
-  MARCH,
-  APRIL,
-  MAY,
-  JUNE,
-  JULY,
-  AUGUST,
-  SEPTEMBER,
-  OCTOBER,
-  NOVEMBER,
-  DECEMBER
-};
-
-struct TimeConstants {
-  static constexpr int SECONDS_PER_MIN = 60;
-  static constexpr int MINUTES_PER_HOUR = 60;
-  static constexpr int HOURS_PER_DAY = 24;
-  static constexpr int DAYS_PER_WEEK = 7;
-  static constexpr int MONTHS_PER_YEAR = 12;
-  static constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
-  static constexpr int DAYS_PER_LEAP_YEAR = 366;
-
-  static constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
-  static constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
-  static constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
-      DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
-
-  static constexpr int TIME_YEAR_BASE = 1900;
-  static constexpr int EPOCH_YEAR = 1970;
-  static constexpr int EPOCH_WEEK_DAY = 4;
-
-  // For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
-  // not within the normal ranges as defined in <time.h>, or if struct tm's
-  // tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
-  // would attempt to generate more than 26 bytes of output (including the
-  // terminating null).
-  static constexpr int ASCTIME_BUFFER_SIZE = 256;
-  static constexpr int ASCTIME_MAX_BYTES = 26;
-
-  /* 2000-03-01 (mod 400 year, immediately after feb29 */
-  static constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
-      (946684800LL + SECONDS_PER_DAY * (31 + 29));
-  static constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
-
-  static constexpr int DAYS_PER400_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
-  static constexpr int DAYS_PER100_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
-  static constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
-
-  // The latest time that can be represented in this form is 03:14:07 UTC on
-  // Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
-  // start of the epoch). This means that systems using a 32-bit time_t type are
-  // susceptible to the Year 2038 problem.
-  static constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
-
-  static constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
-};
-
 // Update the "tm" structure's year, month, etc. members from seconds.
 // "total_seconds" is the number of seconds since January 1st, 1970.
 extern int64_t update_from_seconds(int64_t total_seconds, struct tm *tm);
@@ -98,7 +38,7 @@ LIBC_INLINE time_t out_of_range() {
   // require it.
   libc_errno = EOVERFLOW;
 #endif
-  return TimeConstants::OUT_OF_RANGE_RETURN_VALUE;
+  return time_constants::OUT_OF_RANGE_RETURN_VALUE;
 }
 
 LIBC_INLINE void invalid_value() { libc_errno = EINVAL; }
@@ -110,32 +50,23 @@ LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer,
     return nullptr;
   }
   if (timeptr->tm_wday < 0 ||
-      timeptr->tm_wday > (TimeConstants::DAYS_PER_WEEK - 1)) {
+      timeptr->tm_wday > (time_constants::DAYS_PER_WEEK - 1)) {
     invalid_value();
     return nullptr;
   }
   if (timeptr->tm_mon < 0 ||
-      timeptr->tm_mon > (TimeConstants::MONTHS_PER_YEAR - 1)) {
+      timeptr->tm_mon > (time_constants::MONTHS_PER_YEAR - 1)) {
     invalid_value();
     return nullptr;
   }
 
-  // TODO(rtenneti): i18n the following strings.
-  static const char *week_days_name[TimeConstants::DAYS_PER_WEEK] = {
-      "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
-
-  static const char *months_name[TimeConstants::MONTHS_PER_YEAR] = {
-      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
-      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
-
-  // TODO(michaelr): look into removing this call to __builtin_snprintf that may
-  // be emitted as a call to snprintf. Alternatively, look into using our
-  // internal printf machinery.
+  // TODO(michaelr): move this to use the strftime machinery
   int written_size = __builtin_snprintf(
       buffer, bufferLength, "%.3s %.3s%3d %.2d:%.2d:%.2d %d\n",
-      week_days_name[timeptr->tm_wday], months_name[timeptr->tm_mon],
-      timeptr->tm_mday, timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
-      TimeConstants::TIME_YEAR_BASE + timeptr->tm_year);
+      time_constants::WEEK_DAY_NAMES[timeptr->tm_wday].data(),
+      time_constants::MONTH_NAMES[timeptr->tm_mon].data(), timeptr->tm_mday,
+      timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
+      time_constants::TIME_YEAR_BASE + timeptr->tm_year);
   if (written_size < 0)
     return nullptr;
   if (static_cast<size_t>(written_size) >= bufferLength) {
diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index c7c7c1a..7ffc151 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -32,7 +32,6 @@ LLVM_LIBC_FUNCTION(int, dup2, (int oldfd, int newfd)) {
     int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl, oldfd, F_GETFD);
 #elif defined(SYS_fcntl64)
     // Same as fcntl but can handle large offsets
-    static_assert(sizeof(off_t) == 8);
     int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl64, oldfd, F_GETFD);
 #else
 #error "SYS_fcntl and SYS_fcntl64 syscalls not available."