aboutsummaryrefslogtreecommitdiff
path: root/libc
diff options
context:
space:
mode:
Diffstat (limited to 'libc')
-rw-r--r--libc/shared/math.h1
-rw-r--r--libc/shared/math/asinhf.h23
-rw-r--r--libc/src/__support/GPU/allocator.cpp134
-rw-r--r--libc/src/__support/math/CMakeLists.txt14
-rw-r--r--libc/src/__support/math/asinhf.h125
-rw-r--r--libc/src/math/generic/CMakeLists.txt7
-rw-r--r--libc/src/math/generic/asinhf.cpp106
-rw-r--r--libc/src/stdio/baremetal/CMakeLists.txt1
-rw-r--r--libc/src/stdio/scanf_core/CMakeLists.txt2
-rw-r--r--libc/src/wchar/wchar_utils.h13
-rw-r--r--libc/src/wchar/wcschr.cpp9
-rw-r--r--libc/src/wchar/wcspbrk.cpp11
-rw-r--r--libc/src/wchar/wcstok.cpp23
-rw-r--r--libc/test/integration/src/stdlib/gpu/malloc_stress.cpp28
-rw-r--r--libc/test/shared/CMakeLists.txt1
-rw-r--r--libc/test/shared/shared_math_test.cpp1
16 files changed, 300 insertions, 199 deletions
diff --git a/libc/shared/math.h b/libc/shared/math.h
index e01c8db..e0f00f5 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -20,6 +20,7 @@
#include "math/asin.h"
#include "math/asinf.h"
#include "math/asinf16.h"
+#include "math/asinhf.h"
#include "math/erff.h"
#include "math/exp.h"
#include "math/exp10.h"
diff --git a/libc/shared/math/asinhf.h b/libc/shared/math/asinhf.h
new file mode 100644
index 0000000..c4a5509
--- /dev/null
+++ b/libc/shared/math/asinhf.h
@@ -0,0 +1,23 @@
+//===-- Shared asinhf function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ASINHF_H
+#define LLVM_LIBC_SHARED_MATH_ASINHF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/asinhf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::asinhf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_ASINHF_H
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 866aea7..2b78c4d 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -16,6 +16,7 @@
#include "allocator.h"
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
@@ -31,14 +32,12 @@ constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
+constexpr static uint32_t BITS_IN_DWORD = sizeof(uint64_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
// The number of times to attempt claiming an in-progress slab allocation.
-constexpr static uint32_t MAX_TRIES = 128;
-
-// A sentinel used to indicate an invalid but non-null pointer value.
-constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
+constexpr static uint32_t MAX_TRIES = 1024;
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
@@ -70,8 +69,8 @@ static void rpc_free(void *ptr) {
// Convert a potentially disjoint bitmask into an increasing integer per-lane
// for use with indexing between gpu lanes.
-static inline uint32_t lane_count(uint64_t lane_mask) {
- return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
+static inline uint32_t lane_count(uint64_t lane_mask, uint32_t id) {
+ return cpp::popcount(lane_mask & ((uint64_t(1) << id) - 1));
}
// Obtain an initial value to seed a random number generator. We use the rounded
@@ -133,7 +132,8 @@ static inline constexpr T round_up(const T x) {
void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
uint64_t mask = gpu::get_lane_mask();
uint32_t workers = cpp::popcount(uniform);
- for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+ for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
+ i += workers)
s[i] = c;
}
@@ -142,10 +142,27 @@ static inline constexpr bool is_pow2(uint64_t x) {
return x && (x & (x - 1)) == 0;
}
-// Where this chunk size should start looking in the global array.
-static inline constexpr uint32_t start_index(uint32_t chunk_index) {
- return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
- impl::get_chunk_id(SLAB_SIZE / 2);
+// Where this chunk size should start looking in the global array. Small
+// allocations are much more likely than large ones, so we give them the most
+// space. We use a cubic easing function normalized on the possible chunks.
+static inline constexpr uint32_t get_start_index(uint32_t chunk_size) {
+ constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
+ uint64_t norm =
+ (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
+ uint64_t bias = (norm * norm * norm) >> 32;
+ uint64_t inv = (1 << 16) - bias;
+ return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
+}
+
+// Returns the id of the lane below this one that acts as its leader.
+static inline uint32_t get_leader_id(uint64_t ballot, uint32_t id) {
+ uint64_t mask = id < BITS_IN_DWORD ? ~0ull << (id + 1) : 0;
+ return BITS_IN_DWORD - cpp::countl_zero(ballot & ~mask) - 1;
+}
+
+// We use a sentinal value to indicate a failed or in-progress allocation.
+template <typename T> bool is_sentinel(const T &x) {
+ return x == cpp::numeric_limits<T>::max();
}
} // namespace impl
@@ -264,28 +281,33 @@ struct Slab {
continue;
// We try using any known empty bits from the previous attempt first.
- uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
- ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
- cpp::countr_zero(~after)
- : impl::xorshift32(state));
+ uint32_t start = gpu::shuffle(
+ mask, cpp::countr_zero(uniform & mask),
+ ~after ? (old_index & ~(BITS_IN_WORD - 1)) + cpp::countr_zero(~after)
+ : __builtin_align_down(impl::xorshift32(state), BITS_IN_WORD));
- uint32_t id = impl::lane_count(uniform & mask);
+ // Each lane tries to claim one bit in a single contiguous mask.
+ uint32_t id = impl::lane_count(uniform & mask, gpu::get_lane_id());
uint32_t index = (start + id) % usable_bits(chunk_size);
uint32_t slot = index / BITS_IN_WORD;
uint32_t bit = index % BITS_IN_WORD;
// Get the mask of bits destined for the same slot and coalesce it.
- uint64_t match = uniform & gpu::match_any(mask, slot);
- uint32_t length = cpp::popcount(match);
- uint32_t bitmask = gpu::shuffle(
- mask, cpp::countr_zero(match),
- static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
+ uint32_t leader = impl::get_leader_id(
+ uniform & gpu::ballot(mask, !id || index % BITS_IN_WORD == 0),
+ gpu::get_lane_id());
+ uint32_t length = cpp::popcount(uniform & mask) -
+ impl::lane_count(uniform & mask, leader);
+ uint32_t bitmask =
+ static_cast<uint32_t>(
+ (uint64_t(1) << cpp::min(length, BITS_IN_WORD)) - 1)
+ << bit;
uint32_t before = 0;
- if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
+ if (gpu::get_lane_id() == leader)
before = cpp::AtomicRef(get_bitfield()[slot])
.fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
- before = gpu::shuffle(mask, cpp::countr_zero(match), before);
+ before = gpu::shuffle(mask, leader, before);
if (~before & (1 << bit))
result = ptr_from_index(index, chunk_size);
else
@@ -323,20 +345,20 @@ struct GuardPtr {
private:
struct RefCounter {
// Indicates that the object is in its deallocation phase and thus invalid.
- static constexpr uint64_t INVALID = uint64_t(1) << 63;
+ static constexpr uint32_t INVALID = uint32_t(1) << 31;
// If a read preempts an unlock call we indicate this so the following
// unlock call can swap out the helped bit and maintain exclusive ownership.
- static constexpr uint64_t HELPED = uint64_t(1) << 62;
+ static constexpr uint32_t HELPED = uint32_t(1) << 30;
// Resets the reference counter, cannot be reset to zero safely.
- void reset(uint32_t n, uint64_t &count) {
+ void reset(uint32_t n, uint32_t &count) {
counter.store(n, cpp::MemoryOrder::RELAXED);
count = n;
}
// Acquire a slot in the reference counter if it is not invalid.
- bool acquire(uint32_t n, uint64_t &count) {
+ bool acquire(uint32_t n, uint32_t &count) {
count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
return (count & INVALID) == 0;
}
@@ -349,7 +371,7 @@ private:
// another thread resurrected the counter and we quit, or a parallel read
// helped us invalidating it. For the latter, claim that flag and return.
if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
- uint64_t expected = 0;
+ uint32_t expected = 0;
if (counter.compare_exchange_strong(expected, INVALID,
cpp::MemoryOrder::RELAXED,
cpp::MemoryOrder::RELAXED))
@@ -372,28 +394,29 @@ private:
return (val & INVALID) ? 0 : val;
}
- cpp::Atomic<uint64_t> counter{0};
+ cpp::Atomic<uint32_t> counter{0};
};
- cpp::Atomic<Slab *> ptr{nullptr};
- RefCounter ref{};
+ cpp::Atomic<Slab *> ptr;
+ RefCounter ref;
// Should be called be a single lane for each different pointer.
template <typename... Args>
- Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+ Slab *try_lock_impl(uint32_t n, uint32_t &count, Args &&...args) {
Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
if (!expected &&
ptr.compare_exchange_strong(
- expected, reinterpret_cast<Slab *>(SENTINEL),
+ expected,
+ reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max()),
cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
- count = cpp::numeric_limits<uint64_t>::max();
+ count = cpp::numeric_limits<uint32_t>::max();
void *raw = impl::rpc_allocate(sizeof(Slab));
if (!raw)
return nullptr;
return new (raw) Slab(cpp::forward<Args>(args)...);
}
- if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
+ if (!expected || impl::is_sentinel(reinterpret_cast<uintptr_t>(expected)))
return nullptr;
if (!ref.acquire(n, count))
@@ -405,7 +428,7 @@ private:
// Finalize the associated memory and signal that it is ready to use by
// resetting the counter.
- void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+ void finalize(Slab *mem, uint32_t n, uint32_t &count) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
ptr.store(mem, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
@@ -418,7 +441,7 @@ public:
// The uniform mask represents which lanes share the same pointer. For each
// uniform value we elect a leader to handle it on behalf of the other lanes.
template <typename... Args>
- Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+ Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint32_t &count,
Args &&...args) {
count = 0;
Slab *result = nullptr;
@@ -433,14 +456,15 @@ public:
// We defer storing the newly allocated slab until now so that we can use
// multiple lanes to initialize it and release it for use.
- if (count == cpp::numeric_limits<uint64_t>::max()) {
+ if (impl::is_sentinel(count)) {
result->initialize(uniform);
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
finalize(result, cpp::popcount(uniform), count);
}
- if (count != cpp::numeric_limits<uint64_t>::max())
- count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
+ if (!impl::is_sentinel(count))
+ count = count - cpp::popcount(uniform) +
+ impl::lane_count(uniform, gpu::get_lane_id()) + 1;
return result;
}
@@ -469,7 +493,7 @@ static GuardPtr slots[ARRAY_SIZE] = {};
// Keep a cache of the last successful slot for each chunk size. Initialize it
// to an even spread of the total size. Must be updated if the chunking scheme
// changes.
-#define S(X) (impl::start_index(X))
+#define S(X) (impl::get_start_index(X))
static cpp::Atomic<uint32_t> indices[] = {
S(16), S(32), S(48), S(64), S(96), S(112), S(128),
S(192), S(224), S(256), S(384), S(448), S(512), S(768),
@@ -481,26 +505,28 @@ static cpp::Atomic<uint32_t> indices[] = {
#undef S
// Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform) {
// We start at the index of the last successful allocation for this kind.
uint32_t chunk_id = impl::get_chunk_id(chunk_size);
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
- uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
- for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+ for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
uint32_t index =
- !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
+ !offset ? start
+ : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
- if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+ if (!offset ||
+ slots[index].use_count() < Slab::available_chunks(chunk_size)) {
uint64_t lane_mask = gpu::get_lane_mask();
- uint64_t reserved = 0;
+ uint32_t reserved = 0;
Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
reserved, chunk_size, index);
// If there is a slab allocation in progress we retry a few times.
for (uint32_t retries = 0;
- retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+ !slab && !impl::is_sentinel(reserved) && retries < MAX_TRIES;
+ retries++) {
uint64_t lane_mask = gpu::get_lane_mask();
slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
chunk_size, index);
@@ -514,13 +540,17 @@ static Slab *find_slab(uint32_t chunk_size) {
slab->get_chunk_size() == chunk_size) {
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
+ uniform = uniform & gpu::get_lane_mask();
return slab;
} else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
slab->get_chunk_size() != chunk_size)) {
slots[index].unlock(gpu::get_lane_mask(),
gpu::get_lane_mask() & uniform);
- } else if (!slab && reserved == SENTINEL) {
+ } else if (!slab && impl::is_sentinel(reserved)) {
+ uniform = uniform & gpu::get_lane_mask();
return nullptr;
+ } else {
+ sleep_briefly();
}
}
}
@@ -547,12 +577,12 @@ void *allocate(uint64_t size) {
// Try to find a slab for the rounded up chunk size and allocate from it.
uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
- Slab *slab = find_slab(chunk_size);
- if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
+ uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+ Slab *slab = find_slab(chunk_size, uniform);
+ if (!slab || impl::is_sentinel(reinterpret_cast<uintptr_t>(slab)))
return nullptr;
uint64_t lane_mask = gpu::get_lane_mask();
- uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
void *ptr = slab->allocate(lane_mask, uniform);
return ptr;
}
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 1050938..13f46a1 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -141,6 +141,20 @@ add_header_library(
)
add_header_library(
+ asinhf
+ HDRS
+ asinhf.h
+ DEPENDS
+ .acoshf_utils
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.sqrt
+ libc.src.__support.macros.config
+ libc.src.__support.macros.optimization
+)
+
+add_header_library(
asinf
HDRS
asinf.h
diff --git a/libc/src/__support/math/asinhf.h b/libc/src/__support/math/asinhf.h
new file mode 100644
index 0000000..1c08a6e
--- /dev/null
+++ b/libc/src/__support/math/asinhf.h
@@ -0,0 +1,125 @@
+//===-- Implementation header for asinf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF_H
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float asinhf(float x) {
+ using namespace acoshf_internal;
+ using FPBits_t = typename fputil::FPBits<float>;
+ FPBits_t xbits(x);
+ uint32_t x_u = xbits.uintval();
+ uint32_t x_abs = xbits.abs().uintval();
+
+ // |x| <= 2^-3
+ if (LIBC_UNLIKELY(x_abs <= 0x3e80'0000U)) {
+ // |x| <= 2^-26
+ if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
+ return static_cast<float>(LIBC_UNLIKELY(x_abs == 0)
+ ? x
+ : (x - 0x1.5555555555555p-3 * x * x * x));
+ }
+
+ double x_d = x;
+ double x_sq = x_d * x_d;
+ // Generated by Sollya with:
+ // > P = fpminimax(asinh(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16|], [|D...|],
+ // [0, 2^-2]);
+ double p = fputil::polyeval(
+ x_sq, 0.0, -0x1.555555555551ep-3, 0x1.3333333325495p-4,
+ -0x1.6db6db5a7622bp-5, 0x1.f1c70f82928c6p-6, -0x1.6e893934266b7p-6,
+ 0x1.1c0b41d3fbe78p-6, -0x1.c0f47810b3c4fp-7, 0x1.2c8602690143dp-7);
+ return static_cast<float>(fputil::multiply_add(x_d, p, x_d));
+ }
+
+ const double SIGN[2] = {1.0, -1.0};
+ double x_sign = SIGN[x_u >> 31];
+ double x_d = x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ // Helper functions to set results for exceptional cases.
+ auto round_result_slightly_down = [x_sign](float r) -> float {
+ return fputil::multiply_add(static_cast<float>(x_sign), r,
+ static_cast<float>(x_sign) * (-0x1.0p-24f));
+ };
+ auto round_result_slightly_up = [x_sign](float r) -> float {
+ return fputil::multiply_add(static_cast<float>(x_sign), r,
+ static_cast<float>(x_sign) * 0x1.0p-24f);
+ };
+
+ if (LIBC_UNLIKELY(x_abs >= 0x4bdd'65a5U)) {
+ if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
+ if (xbits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits_t::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // Exceptional cases when x > 2^24.
+ switch (x_abs) {
+ case 0x4bdd65a5: // |x| = 0x1.bacb4ap24f
+ return round_result_slightly_down(0x1.1e0696p4f);
+ case 0x4c803f2c: // |x| = 0x1.007e58p26f
+ return round_result_slightly_down(0x1.2b786cp4f);
+ case 0x4f8ffb03: // |x| = 0x1.1ff606p32f
+ return round_result_slightly_up(0x1.6fdd34p4f);
+ case 0x5c569e88: // |x| = 0x1.ad3d1p57f
+ return round_result_slightly_up(0x1.45c146p5f);
+ case 0x5e68984e: // |x| = 0x1.d1309cp61f
+ return round_result_slightly_up(0x1.5c9442p5f);
+ case 0x655890d3: // |x| = 0x1.b121a6p75f
+ return round_result_slightly_down(0x1.a9a3f2p5f);
+ case 0x65de7ca6: // |x| = 0x1.bcf94cp76f
+ return round_result_slightly_up(0x1.af66cp5f);
+ case 0x6eb1a8ec: // |x| = 0x1.6351d8p94f
+ return round_result_slightly_down(0x1.08b512p6f);
+ case 0x7997f30a: // |x| = 0x1.2fe614p116f
+ return round_result_slightly_up(0x1.451436p6f);
+ }
+ } else {
+ // Exceptional cases when x < 2^24.
+ if (LIBC_UNLIKELY(x_abs == 0x45abaf26)) {
+ // |x| = 0x1.575e4cp12f
+ return round_result_slightly_down(0x1.29becap3f);
+ }
+ if (LIBC_UNLIKELY(x_abs == 0x49d29048)) {
+ // |x| = 0x1.a5209p20f
+ return round_result_slightly_down(0x1.e1b92p3f);
+ }
+ }
+#else
+ if (LIBC_UNLIKELY(xbits.is_inf_or_nan()))
+ return x;
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ // asinh(x) = log(x + sqrt(x^2 + 1))
+ return static_cast<float>(
+ x_sign * log_eval(fputil::multiply_add(
+ x_d, x_sign,
+ fputil::sqrt<double>(fputil::multiply_add(x_d, x_d, 1.0)))));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ASINHF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index d4d268c..f91feacb 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3889,12 +3889,7 @@ add_entrypoint_object(
HDRS
../asinhf.h
DEPENDS
- .explogxf
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.sqrt
- libc.src.__support.macros.optimization
+ libc.src.__support.math.asinhf
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/asinhf.cpp b/libc/src/math/generic/asinhf.cpp
index 3aed3bc..45023c8 100644
--- a/libc/src/math/generic/asinhf.cpp
+++ b/libc/src/math/generic/asinhf.cpp
@@ -7,112 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/asinhf.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/sqrt.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/math/generic/common_constants.h"
-#include "src/math/generic/explogxf.h"
+#include "src/__support/math/asinhf.h"
namespace LIBC_NAMESPACE_DECL {
-LLVM_LIBC_FUNCTION(float, asinhf, (float x)) {
- using namespace acoshf_internal;
- using FPBits_t = typename fputil::FPBits<float>;
- FPBits_t xbits(x);
- uint32_t x_u = xbits.uintval();
- uint32_t x_abs = xbits.abs().uintval();
-
- // |x| <= 2^-3
- if (LIBC_UNLIKELY(x_abs <= 0x3e80'0000U)) {
- // |x| <= 2^-26
- if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
- return static_cast<float>(LIBC_UNLIKELY(x_abs == 0)
- ? x
- : (x - 0x1.5555555555555p-3 * x * x * x));
- }
-
- double x_d = x;
- double x_sq = x_d * x_d;
- // Generated by Sollya with:
- // > P = fpminimax(asinh(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16|], [|D...|],
- // [0, 2^-2]);
- double p = fputil::polyeval(
- x_sq, 0.0, -0x1.555555555551ep-3, 0x1.3333333325495p-4,
- -0x1.6db6db5a7622bp-5, 0x1.f1c70f82928c6p-6, -0x1.6e893934266b7p-6,
- 0x1.1c0b41d3fbe78p-6, -0x1.c0f47810b3c4fp-7, 0x1.2c8602690143dp-7);
- return static_cast<float>(fputil::multiply_add(x_d, p, x_d));
- }
-
- const double SIGN[2] = {1.0, -1.0};
- double x_sign = SIGN[x_u >> 31];
- double x_d = x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- // Helper functions to set results for exceptional cases.
- auto round_result_slightly_down = [x_sign](float r) -> float {
- return fputil::multiply_add(static_cast<float>(x_sign), r,
- static_cast<float>(x_sign) * (-0x1.0p-24f));
- };
- auto round_result_slightly_up = [x_sign](float r) -> float {
- return fputil::multiply_add(static_cast<float>(x_sign), r,
- static_cast<float>(x_sign) * 0x1.0p-24f);
- };
-
- if (LIBC_UNLIKELY(x_abs >= 0x4bdd'65a5U)) {
- if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
- if (xbits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits_t::quiet_nan().get_val();
- }
-
- return x;
- }
-
- // Exceptional cases when x > 2^24.
- switch (x_abs) {
- case 0x4bdd65a5: // |x| = 0x1.bacb4ap24f
- return round_result_slightly_down(0x1.1e0696p4f);
- case 0x4c803f2c: // |x| = 0x1.007e58p26f
- return round_result_slightly_down(0x1.2b786cp4f);
- case 0x4f8ffb03: // |x| = 0x1.1ff606p32f
- return round_result_slightly_up(0x1.6fdd34p4f);
- case 0x5c569e88: // |x| = 0x1.ad3d1p57f
- return round_result_slightly_up(0x1.45c146p5f);
- case 0x5e68984e: // |x| = 0x1.d1309cp61f
- return round_result_slightly_up(0x1.5c9442p5f);
- case 0x655890d3: // |x| = 0x1.b121a6p75f
- return round_result_slightly_down(0x1.a9a3f2p5f);
- case 0x65de7ca6: // |x| = 0x1.bcf94cp76f
- return round_result_slightly_up(0x1.af66cp5f);
- case 0x6eb1a8ec: // |x| = 0x1.6351d8p94f
- return round_result_slightly_down(0x1.08b512p6f);
- case 0x7997f30a: // |x| = 0x1.2fe614p116f
- return round_result_slightly_up(0x1.451436p6f);
- }
- } else {
- // Exceptional cases when x < 2^24.
- if (LIBC_UNLIKELY(x_abs == 0x45abaf26)) {
- // |x| = 0x1.575e4cp12f
- return round_result_slightly_down(0x1.29becap3f);
- }
- if (LIBC_UNLIKELY(x_abs == 0x49d29048)) {
- // |x| = 0x1.a5209p20f
- return round_result_slightly_down(0x1.e1b92p3f);
- }
- }
-#else
- if (LIBC_UNLIKELY(xbits.is_inf_or_nan()))
- return x;
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- // asinh(x) = log(x + sqrt(x^2 + 1))
- return static_cast<float>(
- x_sign * log_eval(fputil::multiply_add(
- x_d, x_sign,
- fputil::sqrt<double>(fputil::multiply_add(x_d, x_d, 1.0)))));
-}
+LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return math::asinhf(x); }
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt
index e879230..548938f 100644
--- a/libc/src/stdio/baremetal/CMakeLists.txt
+++ b/libc/src/stdio/baremetal/CMakeLists.txt
@@ -72,6 +72,7 @@ add_entrypoint_object(
../scanf.h
DEPENDS
.scanf_internal
+ libc.include.inttypes
libc.src.stdio.scanf_core.scanf_main
libc.src.__support.arg_list
libc.src.__support.OSUtil.osutil
diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
index dee125c..561180c 100644
--- a/libc/src/stdio/scanf_core/CMakeLists.txt
+++ b/libc/src/stdio/scanf_core/CMakeLists.txt
@@ -35,6 +35,7 @@ add_header_library(
core_structs.h
DEPENDS
.scanf_config
+ libc.include.inttypes
libc.src.__support.CPP.string_view
libc.src.__support.CPP.bitset
libc.src.__support.FPUtil.fp_bits
@@ -97,6 +98,7 @@ add_header_library(
DEPENDS
.reader
.core_structs
+ libc.include.inttypes
libc.src.__support.common
libc.src.__support.ctype_utils
libc.src.__support.CPP.bitset
diff --git a/libc/src/wchar/wchar_utils.h b/libc/src/wchar/wchar_utils.h
index e0218c7..55a3cee 100644
--- a/libc/src/wchar/wchar_utils.h
+++ b/libc/src/wchar/wchar_utils.h
@@ -17,13 +17,10 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-// returns true if the character exists in the string
-LIBC_INLINE static bool wcschr(wchar_t c, const wchar_t *str) {
- for (int n = 0; str[n]; ++n) {
- if (str[n] == c)
- return true;
- }
- return false;
+LIBC_INLINE static const wchar_t *wcschr(const wchar_t *s, wchar_t c) {
+ for (; *s && *s != c; ++s)
+ ;
+ return (*s == c) ? s : nullptr;
}
// bool should be true for wcscspn for complimentary span
@@ -32,7 +29,7 @@ LIBC_INLINE static size_t wcsspn(const wchar_t *s1, const wchar_t *s2,
bool not_match_set) {
size_t i = 0;
for (; s1[i]; ++i) {
- bool in_set = wcschr(s1[i], s2);
+ bool in_set = internal::wcschr(s2, s1[i]);
if (in_set == not_match_set)
return i;
}
diff --git a/libc/src/wchar/wcschr.cpp b/libc/src/wchar/wcschr.cpp
index defc2ce..8ac4916 100644
--- a/libc/src/wchar/wcschr.cpp
+++ b/libc/src/wchar/wcschr.cpp
@@ -11,15 +11,14 @@
#include "hdr/types/wchar_t.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "wchar_utils.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(const wchar_t *, wcschr, (const wchar_t *s, wchar_t c)) {
- for (; *s && *s != c; ++s)
- ;
- if (*s == c)
- return s;
- return nullptr;
+ LIBC_CRASH_ON_NULLPTR(s);
+ return internal::wcschr(s, c);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wcspbrk.cpp b/libc/src/wchar/wcspbrk.cpp
index a00ba99..5d86a49 100644
--- a/libc/src/wchar/wcspbrk.cpp
+++ b/libc/src/wchar/wcspbrk.cpp
@@ -11,17 +11,10 @@
#include "hdr/types/wchar_t.h"
#include "src/__support/common.h"
#include "src/__support/macros/null_check.h"
+#include "wchar_utils.h"
namespace LIBC_NAMESPACE_DECL {
-bool contains_char(const wchar_t *str, wchar_t target) {
- for (; *str != L'\0'; str++)
- if (*str == target)
- return true;
-
- return false;
-}
-
LLVM_LIBC_FUNCTION(const wchar_t *, wcspbrk,
(const wchar_t *src, const wchar_t *breakset)) {
LIBC_CRASH_ON_NULLPTR(src);
@@ -29,7 +22,7 @@ LLVM_LIBC_FUNCTION(const wchar_t *, wcspbrk,
// currently O(n * m), can be further optimized to O(n + m) with a hash set
for (int src_idx = 0; src[src_idx] != 0; src_idx++)
- if (contains_char(breakset, src[src_idx]))
+ if (internal::wcschr(breakset, src[src_idx]))
return src + src_idx;
return nullptr;
diff --git a/libc/src/wchar/wcstok.cpp b/libc/src/wchar/wcstok.cpp
index 291efc1..ed4f0aa 100644
--- a/libc/src/wchar/wcstok.cpp
+++ b/libc/src/wchar/wcstok.cpp
@@ -10,18 +10,12 @@
#include "hdr/types/wchar_t.h"
#include "src/__support/common.h"
+#include "wchar_utils.h"
namespace LIBC_NAMESPACE_DECL {
-bool isADelimeter(wchar_t wc, const wchar_t *delimiters) {
- for (const wchar_t *delim_ptr = delimiters; *delim_ptr != L'\0'; ++delim_ptr)
- if (wc == *delim_ptr)
- return true;
- return false;
-}
-
LLVM_LIBC_FUNCTION(wchar_t *, wcstok,
- (wchar_t *__restrict str, const wchar_t *__restrict delim,
+ (wchar_t *__restrict str, const wchar_t *__restrict delims,
wchar_t **__restrict context)) {
if (str == nullptr) {
if (*context == nullptr)
@@ -30,14 +24,13 @@ LLVM_LIBC_FUNCTION(wchar_t *, wcstok,
str = *context;
}
- wchar_t *tok_start, *tok_end;
- for (tok_start = str; *tok_start != L'\0' && isADelimeter(*tok_start, delim);
- ++tok_start)
- ;
+ wchar_t *tok_start = str;
+ while (*tok_start != L'\0' && internal::wcschr(delims, *tok_start))
+ ++tok_start;
- for (tok_end = tok_start; *tok_end != L'\0' && !isADelimeter(*tok_end, delim);
- ++tok_end)
- ;
+ wchar_t *tok_end = tok_start;
+ while (*tok_end != L'\0' && !internal::wcschr(delims, *tok_end))
+ ++tok_end;
if (*tok_end != L'\0') {
*tok_end = L'\0';
diff --git a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
index 77479f8..4c540a8 100644
--- a/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
+++ b/libc/test/integration/src/stdlib/gpu/malloc_stress.cpp
@@ -14,6 +14,20 @@
using namespace LIBC_NAMESPACE;
+static inline uint32_t entropy() {
+ return (static_cast<uint32_t>(gpu::processor_clock()) ^
+ (gpu::get_thread_id_x() * 0x632be59b) ^
+ (gpu::get_block_id_x() * 0x85157af5)) *
+ 0x9e3779bb;
+}
+
+static inline uint32_t xorshift32(uint32_t &state) {
+ state ^= state << 13;
+ state ^= state >> 17;
+ state ^= state << 5;
+ return state * 0x9e3779bb;
+}
+
static inline void use(uint8_t *ptr, uint32_t size) {
EXPECT_NE(ptr, nullptr);
for (int i = 0; i < size; ++i)
@@ -34,5 +48,19 @@ TEST_MAIN(int, char **, char **) {
for (int i = 0; i < 256; ++i)
free(ptrs[i]);
+
+ uint32_t state = entropy();
+ for (int i = 0; i < 1024; ++i) {
+ if (xorshift32(state) % 2) {
+ uint64_t size = xorshift32(state) % 256 + 16;
+ uint64_t *ptr = reinterpret_cast<uint64_t *>(malloc(size));
+ *ptr = gpu::get_thread_id();
+
+ EXPECT_EQ(*ptr, gpu::get_thread_id());
+ ASSERT_TRUE(ptr);
+ ASSERT_TRUE(__builtin_is_aligned(ptr, 16));
+ free(ptr);
+ }
+ }
return 0;
}
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index e93fbb9..77f3617 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -16,6 +16,7 @@ add_fp_unittest(
libc.src.__support.math.asin
libc.src.__support.math.asinf
libc.src.__support.math.asinf16
+ libc.src.__support.math.asinhf
libc.src.__support.math.erff
libc.src.__support.math.exp
libc.src.__support.math.exp10
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 51d38ec..2e4450a 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -42,6 +42,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat) {
EXPECT_FP_EQ(0x1.921fb6p+0, LIBC_NAMESPACE::shared::acosf(0.0f));
EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::acoshf(1.0f));
EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::asinf(0.0f));
+ EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::asinhf(0.0f));
EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::erff(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::exp10f(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::expf(0.0f));