diff options
-rw-r--r-- | libc/src/string/memory_utils/op_x86.h | 86 | ||||
-rw-r--r-- | libc/src/string/memory_utils/x86_64/inline_bcmp.h | 32 |
2 files changed, 41 insertions, 77 deletions
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h index 1b40527..215cafb 100644 --- a/libc/src/string/memory_utils/op_x86.h +++ b/libc/src/string/memory_utils/op_x86.h @@ -73,15 +73,6 @@ struct Memcpy { namespace LIBC_NAMESPACE_DECL { namespace generic { -// Not equals: returns non-zero iff values at head or tail differ. -// This function typically loads more data than necessary when the two buffer -// differs. -template <typename T> -LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) { - static_assert(cpp::is_integral_v<T>); - return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T)); -} - /////////////////////////////////////////////////////////////////////////////// // Specializations for uint16_t template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {}; @@ -154,11 +145,6 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2, #if defined(__SSE4_1__) template <> struct is_vector<__m128i> : cpp::true_type {}; template <> struct cmp_is_expensive<__m128i> : cpp::true_type {}; -LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) { - const auto a = load<__m128i>(p1, offset); - const auto b = load<__m128i>(p2, offset); - return _mm_xor_si128(a, b); -} LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } @@ -170,21 +156,17 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) { return static_cast<uint16_t>( _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value)))); } -LIBC_INLINE bool is_zero(__m128i value) { - return _mm_testz_si128(value, value) == 1; -} template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) { - return is_zero(load_and_xor_m128i(p1, p2, offset)); + const auto a = load<__m128i>(p1, offset); + const auto b = load<__m128i>(p2, offset); + const auto xored = _mm_xor_si128(a, b); + return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0 } template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { - return !is_zero(load_and_xor_m128i(p1, p2, offset)); -} -template <> -LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2, - size_t count) { - const __m128i head = load_and_xor_m128i(p1, p2, 0); - const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i)); - return !is_zero(_mm_or_si128(head, tail)); + const auto a = load<__m128i>(p1, offset); + const auto b = load<__m128i>(p2, offset); + const auto xored = _mm_xor_si128(a, b); + return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0 } template <> LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { @@ -203,34 +185,19 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { #if defined(__AVX__) template <> struct is_vector<__m256i> : cpp::true_type {}; template <> struct cmp_is_expensive<__m256i> : cpp::true_type {}; -LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) { - return _mm256_castps_si256( - _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) { - return _mm256_castps_si256( - _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) { +template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) { const auto a = load<__m256i>(p1, offset); const auto b = load<__m256i>(p2, offset); - return xor_m256i(a, b); -} -LIBC_INLINE bool is_zero(__m256i value) { - return _mm256_testz_si256(value, value) == 1; -} -template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) { - return is_zero(load_and_xor_m256i(p1, p2, offset)); + const auto xored = _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); + return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0 } template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) { - return !is_zero(load_and_xor_m256i(p1, p2, offset)); -} -template <> -LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2, - size_t count) { - const __m256i head = load_and_xor_m256i(p1, p2, 0); - const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i)); - return !is_zero(or_m256i(head, tail)); + const auto a = load<__m256i>(p1, offset); + const auto b = load<__m256i>(p2, offset); + const auto xored = _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); + return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0 } #endif // __AVX__ @@ -345,22 +312,9 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) { template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { const auto a = load<__m512i>(p1, offset); const auto b = load<__m512i>(p2, offset); - return _mm512_cmpneq_epi8_mask(a, b) != 0; -} -LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) { - const auto a = load<__m512i>(p1, offset); - const auto b = load<__m512i>(p2, offset); - return _mm512_xor_epi64(a, b); -} -LIBC_INLINE bool is_zero(__m512i value) { - return _mm512_test_epi32_mask(value, value) == 0; -} -template <> -LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2, - size_t count) { - const __m512i head = load_and_xor_m512i(p1, p2, 0); - const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i)); - return !is_zero(_mm512_or_epi64(head, tail)); + const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b); + return static_cast<uint32_t>(xored >> 32) | + static_cast<uint32_t>(xored & 0xFFFFFFFF); } template <> LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { diff --git a/libc/src/string/memory_utils/x86_64/inline_bcmp.h b/libc/src/string/memory_utils/x86_64/inline_bcmp.h index 8be391b..0eaf968 100644 --- a/libc/src/string/memory_utils/x86_64/inline_bcmp.h +++ b/libc/src/string/memory_utils/x86_64/inline_bcmp.h @@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return generic::branchless_head_tail_neq<__m128i>(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count); } #endif // __SSE4_1__ @@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) { [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return generic::branchless_head_tail_neq<__m128i>(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return generic::branchless_head_tail_neq<__m256i>(p1, p2, count); + return generic::Bcmp<__m256i>::head_tail(p1, p2, count); return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count); } #endif // __AVX__ @@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) { [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return generic::branchless_head_tail_neq<__m128i>(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return generic::branchless_head_tail_neq<__m256i>(p1, p2, count); + return generic::Bcmp<__m256i>::head_tail(p1, p2, count); if (count <= 128) - return generic::branchless_head_tail_neq<__m512i>(p1, p2, count); + return generic::Bcmp<__m512i>::head_tail(p1, p2, count); return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count); } #endif // __AVX512BW__ @@ -62,12 +62,22 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { return BcmpReturnType::zero(); if (count == 1) return generic::Bcmp<uint8_t>::block(p1, p2); - if (count <= 4) - return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count); - if (count <= 8) - return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count); + if (count == 2) + return generic::Bcmp<uint16_t>::block(p1, p2); + if (count == 3) + return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2); + if (count == 4) + return generic::Bcmp<uint32_t>::block(p1, p2); + if (count == 5) + return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2); + if (count == 6) + return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2); + if (count == 7) + return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2); + if (count == 8) + return generic::Bcmp<uint64_t>::block(p1, p2); if (count <= 16) - return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count); + return generic::Bcmp<uint64_t>::head_tail(p1, p2, count); #if defined(__AVX512BW__) return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); #elif defined(__AVX__) |