author    Guillaume Chatelet <gchatelet@google.com>  2025-10-13 23:25:42 +0200
committer GitHub <noreply@github.com>                2025-10-13 23:25:42 +0200
commit    57726bdca274b152d2f36aaad7c961767bb1f91a
tree      4811d025c12321c442695ad5aa4f511fa2fbd10b
parent    0ceb32d06d4881d7d9c2182a05d41f8fd61220ab
Revert "[libc] Implement branchless head-tail comparison for bcmp" (#162859)
Reverts llvm/llvm-project#107540. That PR demonstrated improvements on micro-benchmarks, but the gains did not seem to materialize in production. We are reverting the change for now to gather more data; it might be reintegrated later once we are more confident in its effects.
-rw-r--r--  libc/src/string/memory_utils/op_x86.h              | 86
-rw-r--r--  libc/src/string/memory_utils/x86_64/inline_bcmp.h  | 32
2 files changed, 41 insertions(+), 77 deletions(-)
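
For reference, a minimal self-contained sketch of the technique being reverted, distilled from the '-' lines of the SSE4.1 hunk below. load128 and the *_neq128 helpers are illustrative names, not the library's API (the real code goes through load<__m128i>, neq, and generic::Bcmp), and the head_tail variant is an assumption inferred from its call sites in inline_bcmp.h, since its body does not appear in this diff. Compile with -msse4.1.

#include <immintrin.h> // SSE2/SSE4.1 intrinsics
#include <cstddef>
#include <cstdint>

using CPtr = const char *;

// Unaligned 16-byte load at 'offset'; stand-in for libc's load<__m128i>.
static inline __m128i load128(CPtr p, size_t offset) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + offset));
}

// Reverted technique, valid for 16 <= count <= 32: XOR the head and tail
// blocks, OR the differences while still in vector registers, and pay for
// a single zero-test over the combined result.
static inline uint32_t branchless_head_tail_neq128(CPtr p1, CPtr p2,
                                                   size_t count) {
  const __m128i head = _mm_xor_si128(load128(p1, 0), load128(p2, 0));
  const __m128i tail =
      _mm_xor_si128(load128(p1, count - 16), load128(p2, count - 16));
  const __m128i combined = _mm_or_si128(head, tail);
  return _mm_testz_si128(combined, combined) == 0; // non-zero iff they differ
}

// What the call sites fall back to (a sketch; the diff only shows the calls):
// two independent zero-tests whose scalar results are OR'ed together.
static inline uint32_t head_tail_neq128(CPtr p1, CPtr p2, size_t count) {
  const __m128i h = _mm_xor_si128(load128(p1, 0), load128(p2, 0));
  const __m128i t =
      _mm_xor_si128(load128(p1, count - 16), load128(p2, count - 16));
  return static_cast<uint32_t>(_mm_testz_si128(h, h) == 0) |
         static_cast<uint32_t>(_mm_testz_si128(t, t) == 0);
}

The reverted version replaces two vector reductions with a vector OR plus a single reduction, which is presumably where the micro-benchmark win came from; per the message above, that gain did not hold up in production.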
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1b40527..215cafb 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -73,15 +73,6 @@ struct Memcpy {
namespace LIBC_NAMESPACE_DECL {
namespace generic {
-// Not equals: returns non-zero iff values at head or tail differ.
-// This function typically loads more data than necessary when the two buffer
-// differs.
-template <typename T>
-LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
- static_assert(cpp::is_integral_v<T>);
- return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
-}
-
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
@@ -154,11 +145,6 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
-LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- return _mm_xor_si128(a, b);
-}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
@@ -170,21 +156,17 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
-LIBC_INLINE bool is_zero(__m128i value) {
- return _mm_testz_si128(value, value) == 1;
-}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- return is_zero(load_and_xor_m128i(p1, p2, offset));
+ const auto a = load<__m128i>(p1, offset);
+ const auto b = load<__m128i>(p2, offset);
+ const auto xored = _mm_xor_si128(a, b);
+ return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- return !is_zero(load_and_xor_m128i(p1, p2, offset));
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
- size_t count) {
- const __m128i head = load_and_xor_m128i(p1, p2, 0);
- const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
- return !is_zero(_mm_or_si128(head, tail));
+ const auto a = load<__m128i>(p1, offset);
+ const auto b = load<__m128i>(p2, offset);
+ const auto xored = _mm_xor_si128(a, b);
+ return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -203,34 +185,19 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
-LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
- return _mm256_castps_si256(
- _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
- return _mm256_castps_si256(
- _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
+template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
- return xor_m256i(a, b);
-}
-LIBC_INLINE bool is_zero(__m256i value) {
- return _mm256_testz_si256(value, value) == 1;
-}
-template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- return is_zero(load_and_xor_m256i(p1, p2, offset));
+ const auto xored = _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+ return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- return !is_zero(load_and_xor_m256i(p1, p2, offset));
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
- size_t count) {
- const __m256i head = load_and_xor_m256i(p1, p2, 0);
- const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
- return !is_zero(or_m256i(head, tail));
+ const auto a = load<__m256i>(p1, offset);
+ const auto b = load<__m256i>(p2, offset);
+ const auto xored = _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+ return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
#endif // __AVX__
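
An aside on the AVX hunk above: the round-trip through _mm256_xor_ps is not stylistic. AVX1 has no 256-bit integer XOR (_mm256_xor_si256 only arrives with AVX2), so both the removed xor_m256i helper and the inlined replacement cast to the float domain, XOR there, and cast back; the casts are bitwise no-ops. The same trick in isolation, under an illustrative name:

#include <immintrin.h>

// 256-bit bitwise XOR using only AVX1: _mm256_xor_ps operates on float
// lanes, but XOR is bit-level, so the integer bit pattern survives the casts.
static inline __m256i xor_m256_avx1(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}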
@@ -345,22 +312,9 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
- return _mm512_cmpneq_epi8_mask(a, b) != 0;
-}
-LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m512i>(p1, offset);
- const auto b = load<__m512i>(p2, offset);
- return _mm512_xor_epi64(a, b);
-}
-LIBC_INLINE bool is_zero(__m512i value) {
- return _mm512_test_epi32_mask(value, value) == 0;
-}
-template <>
-LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
- size_t count) {
- const __m512i head = load_and_xor_m512i(p1, p2, 0);
- const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
- return !is_zero(_mm512_or_epi64(head, tail));
+ const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
+ return static_cast<uint32_t>(xored >> 32) |
+ static_cast<uint32_t>(xored & 0xFFFFFFFF);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
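
One subtlety in the __m512i hunk just above: _mm512_cmpneq_epi8_mask yields one bit per byte, 64 bits in total, while neq returns uint32_t. A bare truncation would drop differences found in the upper 32 bytes, so the '+' lines OR the two halves of the mask together instead. The reduction in isolation, under an illustrative name:

#include <cstdint>

// Reduce a 64-bit per-byte difference mask to a uint32_t that is non-zero
// iff any of the 64 input bits is set; truncating would lose the top half.
static inline uint32_t fold_mask64(uint64_t mask) {
  return static_cast<uint32_t>(mask >> 32) |
         static_cast<uint32_t>(mask & 0xFFFFFFFF);
}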
diff --git a/libc/src/string/memory_utils/x86_64/inline_bcmp.h b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
index 8be391b..0eaf968 100644
--- a/libc/src/string/memory_utils/x86_64/inline_bcmp.h
+++ b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
@@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+ return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __SSE4_1__
@@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+ return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
+ return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX__
@@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
+ return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
+ return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
- return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
+ return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX512BW__
@@ -62,12 +62,22 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
return BcmpReturnType::zero();
if (count == 1)
return generic::Bcmp<uint8_t>::block(p1, p2);
- if (count <= 4)
- return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
- if (count <= 8)
- return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
+ if (count == 2)
+ return generic::Bcmp<uint16_t>::block(p1, p2);
+ if (count == 3)
+ return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+ if (count == 4)
+ return generic::Bcmp<uint32_t>::block(p1, p2);
+ if (count == 5)
+ return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
+ if (count == 6)
+ return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
+ if (count == 7)
+ return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
+ if (count == 8)
+ return generic::Bcmp<uint64_t>::block(p1, p2);
if (count <= 16)
- return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
+ return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX__)