From 985c1a44f8d49e0afeba907fe29d881c19b319fc Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 1 Apr 2024 18:21:51 +0200 Subject: [libc++] Optimize the two range overload of mismatch (#86853) ``` ----------------------------------------------------------------------------- Benchmark old new ----------------------------------------------------------------------------- bm_mismatch_two_range_overload/1 0.941 ns 1.88 ns bm_mismatch_two_range_overload/2 1.43 ns 2.15 ns bm_mismatch_two_range_overload/3 1.95 ns 2.55 ns bm_mismatch_two_range_overload/4 2.58 ns 2.90 ns bm_mismatch_two_range_overload/5 3.75 ns 3.31 ns bm_mismatch_two_range_overload/6 5.00 ns 3.83 ns bm_mismatch_two_range_overload/7 5.59 ns 4.35 ns bm_mismatch_two_range_overload/8 6.37 ns 4.84 ns bm_mismatch_two_range_overload/16 11.8 ns 6.72 ns bm_mismatch_two_range_overload/64 45.5 ns 2.59 ns bm_mismatch_two_range_overload/512 366 ns 12.6 ns bm_mismatch_two_range_overload/4096 2890 ns 91.6 ns bm_mismatch_two_range_overload/32768 23038 ns 758 ns bm_mismatch_two_range_overload/262144 142813 ns 6573 ns bm_mismatch_two_range_overload/1048576 366679 ns 26710 ns bm_mismatch_two_range_overload/1 0.934 ns 1.88 ns bm_mismatch_two_range_overload/2 1.30 ns 2.58 ns bm_mismatch_two_range_overload/3 1.76 ns 3.28 ns bm_mismatch_two_range_overload/4 2.24 ns 3.98 ns bm_mismatch_two_range_overload/5 2.80 ns 4.92 ns bm_mismatch_two_range_overload/6 3.58 ns 6.01 ns bm_mismatch_two_range_overload/7 4.29 ns 7.03 ns bm_mismatch_two_range_overload/8 4.67 ns 7.39 ns bm_mismatch_two_range_overload/16 9.86 ns 13.1 ns bm_mismatch_two_range_overload/64 38.9 ns 4.55 ns bm_mismatch_two_range_overload/512 348 ns 27.7 ns bm_mismatch_two_range_overload/4096 2881 ns 225 ns bm_mismatch_two_range_overload/32768 23111 ns 1715 ns bm_mismatch_two_range_overload/262144 184846 ns 14416 ns bm_mismatch_two_range_overload/1048576 742885 ns 57264 ns bm_mismatch_two_range_overload/1 0.838 ns 1.19 ns bm_mismatch_two_range_overload/2 1.19 ns 1.65 ns bm_mismatch_two_range_overload/3 1.83 ns 2.06 ns bm_mismatch_two_range_overload/4 2.38 ns 2.42 ns bm_mismatch_two_range_overload/5 3.60 ns 2.47 ns bm_mismatch_two_range_overload/6 3.68 ns 3.05 ns bm_mismatch_two_range_overload/7 4.32 ns 3.36 ns bm_mismatch_two_range_overload/8 5.18 ns 3.58 ns bm_mismatch_two_range_overload/16 10.6 ns 2.84 ns bm_mismatch_two_range_overload/64 39.0 ns 7.78 ns bm_mismatch_two_range_overload/512 247 ns 53.9 ns bm_mismatch_two_range_overload/4096 1927 ns 429 ns bm_mismatch_two_range_overload/32768 15569 ns 3393 ns bm_mismatch_two_range_overload/262144 125413 ns 28504 ns bm_mismatch_two_range_overload/1048576 504549 ns 112729 ns ``` --- libcxx/benchmarks/algorithms/mismatch.bench.cpp | 16 ++++++++++ libcxx/include/__algorithm/mismatch.h | 34 +++++++++++++++++++--- libcxx/include/__algorithm/ranges_mismatch.h | 22 +++++++++----- libcxx/include/__algorithm/simd_utils.h | 9 +++++- libcxx/test/libcxx/transitive_includes/cxx23.csv | 1 + libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 + .../alg.nonmodifying/mismatch/mismatch.pass.cpp | 8 ++--- 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/benchmarks/algorithms/mismatch.bench.cpp index 0628906..7917828 100644 --- a/libcxx/benchmarks/algorithms/mismatch.bench.cpp +++ b/libcxx/benchmarks/algorithms/mismatch.bench.cpp @@ -37,4 +37,20 @@ BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); +template +static void bm_mismatch_two_range_overload(benchmark::State& state) { + std::vector vec1(state.range(), '1'); + std::vector vec2(state.range(), '1'); + std::mt19937_64 rng(std::random_device{}()); + + vec1.back() = '2'; + for (auto _ : state) { + benchmark::DoNotOptimize(vec1); + benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin(), vec2.end())); + } +} +BENCHMARK(bm_mismatch_two_range_overload)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_mismatch_two_range_overload)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_mismatch_two_range_overload)->DenseRange(1, 8)->Range(16, 1 << 20); + BENCHMARK_MAIN(); diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 1cb83b0..8abb273 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -11,6 +11,7 @@ #define _LIBCPP___ALGORITHM_MISMATCH_H #include <__algorithm/comp.h> +#include <__algorithm/min.h> #include <__algorithm/simd_utils.h> #include <__algorithm/unwrap_iter.h> #include <__config> @@ -136,6 +137,25 @@ mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi } #if _LIBCPP_STD_VER >= 14 +template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> __mismatch( + _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { + while (__first1 != __last1 && __first2 != __last2) { + if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2))) + break; + ++__first1; + ++__first2; + } + return {std::move(__first1), std::move(__first2)}; +} + +template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> +__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Tp* __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { + auto __len = std::min(__last1 - __first1, __last2 - __first2); + return std::__mismatch(__first1, __first1 + __len, __first2, __pred, __proj1, __proj2); +} + template _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2> mismatch(_InputIterator1 __first1, @@ -143,10 +163,16 @@ mismatch(_InputIterator1 __first1, _InputIterator2 __first2, _InputIterator2 __last2, _BinaryPredicate __pred) { - for (; __first1 != __last1 && __first2 != __last2; ++__first1, (void)++__first2) - if (!__pred(*__first1, *__first2)) - break; - return pair<_InputIterator1, _InputIterator2>(__first1, __first2); + __identity __proj; + auto __res = std::__mismatch( + std::__unwrap_iter(__first1), + std::__unwrap_iter(__last1), + std::__unwrap_iter(__first2), + std::__unwrap_iter(__last2), + __pred, + __proj, + __proj); + return {std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second)}; } template diff --git a/libcxx/include/__algorithm/ranges_mismatch.h b/libcxx/include/__algorithm/ranges_mismatch.h index 037af39..d8a7dd4 100644 --- a/libcxx/include/__algorithm/ranges_mismatch.h +++ b/libcxx/include/__algorithm/ranges_mismatch.h @@ -10,6 +10,8 @@ #define _LIBCPP___ALGORITHM_RANGES_MISMATCH_H #include <__algorithm/in_in_result.h> +#include <__algorithm/mismatch.h> +#include <__algorithm/unwrap_range.h> #include <__config> #include <__functional/identity.h> #include <__functional/invoke.h> @@ -42,13 +44,17 @@ struct __fn { template static _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result<_I1, _I2> __go(_I1 __first1, _S1 __last1, _I2 __first2, _S2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { - while (__first1 != __last1 && __first2 != __last2) { - if (!std::invoke(__pred, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2))) - break; - ++__first1; - ++__first2; + if constexpr (forward_iterator<_I1> && forward_iterator<_I2>) { + auto __range1 = std::__unwrap_range(__first1, __last1); + auto __range2 = std::__unwrap_range(__first2, __last2); + auto __res = + std::__mismatch(__range1.first, __range1.second, __range2.first, __range2.second, __pred, __proj1, __proj2); + return {std::__rewrap_range<_S1>(__first1, __res.first), std::__rewrap_range<_S2>(__first2, __res.second)}; + } else { + auto __res = std::__mismatch( + std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), __pred, __proj1, __proj2); + return {std::move(__res.first), std::move(__res.second)}; } - return {std::move(__first1), std::move(__first2)}; } template requires indirectly_comparable, iterator_t<_R2>, _Pred, _Proj1, _Proj2> - _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result, - borrowed_iterator_t<_R2>> + _LIBCPP_NODISCARD_EXT + _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result, borrowed_iterator_t<_R2>> operator()(_R1&& __r1, _R2&& __r2, _Pred __pred = {}, _Proj1 __proj1 = {}, _Proj2 __proj2 = {}) const { return __go( ranges::begin(__r1), ranges::end(__r1), ranges::begin(__r2), ranges::end(__r2), __pred, __proj1, __proj2); diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index 1aedb3d..989a195 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H #define _LIBCPP___ALGORITHM_SIMD_UTILS_H +#include <__algorithm/min.h> #include <__bit/bit_cast.h> #include <__bit/countr.h> #include <__config> @@ -22,6 +23,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + // TODO: Find out how altivec changes things and allow vectorizations there too. #if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1700 && !defined(__ALTIVEC__) # define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1 @@ -94,7 +98,8 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876 auto __impl = [&](_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept { - return std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))); + return std::min( + _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); }; if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) { @@ -120,4 +125,6 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_SIMD_UTILS_H diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 79c67dc..69429b5 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -452,6 +452,7 @@ random vector random version ranges compare ranges cstddef +ranges cstdint ranges cwchar ranges initializer_list ranges iterator diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 79c67dc..69429b5 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -452,6 +452,7 @@ random vector random version ranges compare ranges cstddef +ranges cstdint ranges cwchar ranges initializer_list ranges iterator diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp index 55c9eea..eb5f7ca 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp @@ -187,8 +187,8 @@ int main(int, char**) { { // check the tail of the vectorized loop for (size_t vec_size = 1; vec_size != 256; ++vec_size) { { - std::vector lhs(256); - std::vector rhs(256); + std::vector lhs(vec_size); + std::vector rhs(vec_size); check(lhs, rhs, lhs.size()); lhs.back() = 1; @@ -199,8 +199,8 @@ int main(int, char**) { rhs.back() = 0; } { - std::vector lhs(256); - std::vector rhs(256); + std::vector lhs(vec_size); + std::vector rhs(vec_size); check(lhs, rhs, lhs.size()); lhs.back() = 1; -- cgit v1.1