author    | Matthias Kretz <m.kretz@gsi.de> | 2023-01-14 17:07:59 +0100
committer | Matthias Kretz <m.kretz@gsi.de> | 2023-05-23 10:11:39 +0200
commit    | d3217028725f06f4eb67a4c80b12e1a3219d3502 (patch)
tree      | a69ed894197e3021cd1dd672fba71c8e9051f540 /libstdc++-v3
parent    | a460f10796ed21900c9f85e702de6bde5134f3c7 (diff)
libstdc++: Annotate most lambdas with always_inline
All of the annotated lambdas are simply a necessary means for
implementing these functions and should never result in an actual
function call. Many of these lambdas would go away if C++ had better
language support for packs.
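For context: the mechanism is a single new macro, defined in simd_detail.h (see the hunk below) as the GNU always_inline attribute, placed between a lambda's parameter list (after constexpr, where present) and its body. The stand-alone sketch below mirrors that placement; the generate() helper is an illustrative stand-in for the library's __generate_from_n_evaluations, not the actual implementation, and it compiles with GCC in C++17 mode.

    #include <array>
    #include <cstddef>
    #include <utility>

    // Same expansion as the macro this patch adds to simd_detail.h:
    #define ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))

    // Illustrative stand-in for __generate_from_n_evaluations: builds an
    // array from __gen(0), ..., __gen(_Np - 1), passing each index as a
    // compile-time constant.
    template <typename _Fp, std::size_t... _Is>
      constexpr auto
      generate_impl(_Fp&& __gen, std::index_sequence<_Is...>)
      { return std::array{__gen(std::integral_constant<std::size_t, _Is>())...}; }

    template <std::size_t _Np, typename _Fp>
      constexpr auto
      generate(_Fp&& __gen)
      { return generate_impl(std::forward<_Fp>(__gen), std::make_index_sequence<_Np>()); }

    constexpr auto
    doubled()
    {
      // The attribute sits between the parameter list (and constexpr) and the
      // body, matching how _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA is applied
      // throughout the patch.
      return generate<4>([](auto __i) constexpr ALWAYS_INLINE_LAMBDA
                         { return int(__i) * 2; });
    }

    static_assert(doubled()[3] == 6); // the lambda never becomes a real call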
Signed-off-by: Matthias Kretz <m.kretz@gsi.de>
libstdc++-v3/ChangeLog:
PR libstdc++/108030
* include/experimental/bits/simd_detail.h: Define
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA.
* include/experimental/bits/simd.h: Annotate lambdas with
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA.
* include/experimental/bits/simd_builtin.h: Ditto.
* include/experimental/bits/simd_converter.h: Ditto.
* include/experimental/bits/simd_fixed_size.h: Ditto.
* include/experimental/bits/simd_math.h: Ditto.
* include/experimental/bits/simd_neon.h: Ditto.
* include/experimental/bits/simd_x86.h: Ditto.
(cherry picked from commit 53b55701aed6896f456cdec7997ac6bbef1d6074)
Diffstat (limited to 'libstdc++-v3')
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd.h            | 234
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_builtin.h    | 351
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_converter.h  |  22
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_detail.h     |   3
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_fixed_size.h | 265
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_math.h       |  52
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_neon.h       |  14
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_x86.h        | 122
8 files changed, 572 insertions, 491 deletions
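One detail worth noting before the diff: simd_detail.h already has an escape hatch for forced inlining, and the patch hooks the new macro into it, so the lambda annotations can be disabled wholesale (the simd_detail.h hunk below redefines _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA to nothing under _GLIBCXX_SIMD_NO_ALWAYS_INLINE). A minimal usage sketch, assuming the define is made before the first include of the headers:

    // Opt out of forced inlining, e.g. for debug builds; the macro check
    // happens when simd_detail.h is preprocessed, so define it first.
    #define _GLIBCXX_SIMD_NO_ALWAYS_INLINE
    #include <experimental/simd>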
diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 7c5a32f..8fcd45c 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -609,28 +609,34 @@ template <size_t _Bytes>
       operator&(_Ip __rhs) const
       {
 	return __generate_from_n_evaluations<_Np, _Ip>(
-	  [&](auto __i) { return __rhs._M_data[__i] & _M_data[__i]; });
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __rhs._M_data[__i] & _M_data[__i];
+	  });
       }
 
       _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
       operator|(_Ip __rhs) const
       {
 	return __generate_from_n_evaluations<_Np, _Ip>(
-	  [&](auto __i) { return __rhs._M_data[__i] | _M_data[__i]; });
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __rhs._M_data[__i] | _M_data[__i];
+	  });
       }
 
       _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
       operator^(_Ip __rhs) const
       {
 	return __generate_from_n_evaluations<_Np, _Ip>(
-	  [&](auto __i) { return __rhs._M_data[__i] ^ _M_data[__i]; });
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __rhs._M_data[__i] ^ _M_data[__i];
+	  });
       }
 
       _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
       operator~() const
       {
 	return __generate_from_n_evaluations<_Np, _Ip>(
-	  [&](auto __i) { return ~_M_data[__i]; });
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return ~_M_data[__i]; });
       }
     };
     return _Ip{};
@@ -1391,7 +1397,7 @@ template <size_t _Np, bool _Sanitized>
     operator^=(const _BitMask& __b) & noexcept
     {
       __execute_n_times<_S_array_size>(
-	[&](auto __i) { _M_bits[__i] ^= __b._M_bits[__i]; });
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] ^= __b._M_bits[__i]; });
       return *this;
     }
@@ -1399,7 +1405,7 @@ template <size_t _Np, bool _Sanitized>
     operator|=(const _BitMask& __b) & noexcept
     {
       __execute_n_times<_S_array_size>(
-	[&](auto __i) { _M_bits[__i] |= __b._M_bits[__i]; });
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] |= __b._M_bits[__i]; });
       return *this;
     }
@@ -1407,7 +1413,7 @@ template <size_t _Np, bool _Sanitized>
     operator&=(const _BitMask& __b) & noexcept
     {
       __execute_n_times<_S_array_size>(
-	[&](auto __i) { _M_bits[__i] &= __b._M_bits[__i]; });
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] &= __b._M_bits[__i]; });
       return *this;
     }
@@ -2206,7 +2212,7 @@ template <int _Offset,
 #endif
       constexpr int _O = _Offset * __return_width;
       return __call_with_subscripts<__return_width, _O>(
-	__x, [](auto... __entries) {
+	__x, [](auto... __entries) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 	  return reinterpret_cast<_R>(_Up{__entries...});
 	});
     }
@@ -2608,7 +2614,7 @@ template <typename _Tp, size_t _Width>
     _GLIBCXX_SIMD_INTRINSIC constexpr
     _SimdWrapper(initializer_list<_Tp> __init)
     : _Base(__generate_from_n_evaluations<_Width, _BuiltinType>(
-	[&](auto __i) { return __init.begin()[__i.value]; })) {}
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __init.begin()[__i.value]; })) {}
 
     _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper() = default;
     _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(const _SimdWrapper&)
@@ -2633,10 +2639,9 @@ template <typename _Tp, size_t _Width>
     _GLIBCXX_SIMD_INTRINSIC constexpr
     operator _SimdTuple<_Tp, _As...>() const
     {
-      const auto& dd = _M_data; // workaround for GCC7 ICE
-      return __generate_from_n_evaluations<sizeof...(_As),
-					   _SimdTuple<_Tp, _As...>>([&](
-	auto __i) constexpr { return dd[int(__i)]; });
+      return __generate_from_n_evaluations<sizeof...(_As), _SimdTuple<_Tp, _As...>>(
+	       [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	       { return _M_data[int(__i)]; });
     }
 
     _GLIBCXX_SIMD_INTRINSIC constexpr
     operator const _BuiltinType&() const
@@ -3193,21 +3198,19 @@ template <typename _Tp, int _Np>
   { return __x; }
 
 template <typename _Tp, typename _Ap>
-  _GLIBCXX_SIMD_INTRINSIC auto
+  _GLIBCXX_SIMD_INTRINSIC fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>
   to_fixed_size(const simd<_Tp, _Ap>& __x)
   {
-    return simd<_Tp, simd_abi::fixed_size<simd_size_v<_Tp, _Ap>>>([&__x](
-      auto __i) constexpr { return __x[__i]; });
+    using _Rp = fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>;
+    return _Rp([&__x](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
   }
 
 template <typename _Tp, typename _Ap>
-  _GLIBCXX_SIMD_INTRINSIC auto
+  _GLIBCXX_SIMD_INTRINSIC fixed_size_simd_mask<_Tp, simd_size_v<_Tp, _Ap>>
   to_fixed_size(const simd_mask<_Tp, _Ap>& __x)
   {
-    constexpr int _Np = simd_mask<_Tp, _Ap>::size();
-    fixed_size_simd_mask<_Tp, _Np> __r;
-    __execute_n_times<_Np>([&](auto __i) constexpr { __r[__i] = __x[__i]; });
-    return __r;
+    return {__private_init,
+	    [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }};
   }
 
 // to_native {{{2
@@ -3226,7 +3229,9 @@ template <typename _Tp, size_t _Np>
   enable_if_t<(_Np == native_simd_mask<_Tp>::size()), native_simd_mask<_Tp>>
   to_native(const fixed_size_simd_mask<_Tp, _Np>& __x)
   {
-    return native_simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; });
+    return native_simd_mask<_Tp>(
+	     __private_init,
+	     [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
   }
 
 // to_compatible {{{2
@@ -3243,7 +3248,10 @@ template <typename _Tp, size_t _Np>
   _GLIBCXX_SIMD_INTRINSIC
   enable_if_t<(_Np == simd_mask<_Tp>::size()), simd_mask<_Tp>>
   to_compatible(const simd_mask<_Tp, simd_abi::fixed_size<_Np>>& __x)
-  { return simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; }); }
+  {
+    return simd_mask<_Tp>(
+	     [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
+  }
 
 // masked assignment [simd_mask.where] {{{1
@@ -3401,9 +3409,9 @@ template <typename _M, typename _Tp>
       _Impl::template _S_masked_cassign(                                 \
	__data(_M_k), __data(_M_value),                                   \
	__to_value_type_or_member_type<_Tp>(static_cast<_Up&&>(__x)),     \
-	[](auto __impl, auto __lhs, auto __rhs) constexpr {               \
-	  return __impl.__name(__lhs, __rhs);                             \
-	});                                                               \
+	[](auto __impl, auto __lhs, auto __rhs)                           \
+	  constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                    \
+	  { return __impl.__name(__lhs, __rhs); });                       \
     }                                                                     \
     static_assert(true)
 
     _GLIBCXX_SIMD_OP_(+, _S_plus);
@@ -3900,12 +3908,11 @@ template <typename _V, typename _Ap,
       }
     else if (__x._M_is_constprop())
       {
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  return _V([&](auto __j) constexpr {
-	    return __x[__i * _V::size() + __j];
-	  });
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+		      { return __x[__i * _V::size() + __j]; });
+	  });
       }
     else if constexpr (
       __is_fixed_size_abi_v<_Ap>
@@ -3918,41 +3925,40 @@ template <typename _V, typename _Ap,
 #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
	const __may_alias<_Tp>* const __element_ptr
	  = reinterpret_cast<const __may_alias<_Tp>*>(&__data(__x));
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  return _V(__element_ptr + __i * _V::size(), vector_aligned);
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	  { return _V(__element_ptr + __i * _V::size(), vector_aligned); });
 #else
	const auto& __xx = __data(__x);
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  [[maybe_unused]] constexpr size_t __offset
-	    = decltype(__i)::value * _V::size();
-	  return _V([&](auto __j) constexpr {
-	    constexpr _SizeConstant<__j + __offset> __k;
-	    return __xx[__k];
-	  });
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    [[maybe_unused]] constexpr size_t __offset
+	      = decltype(__i)::value * _V::size();
+	    return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      constexpr _SizeConstant<__j + __offset> __k;
+	      return __xx[__k];
+	    });
+	  });
 #endif
       }
     else if constexpr (is_same_v<typename _V::abi_type, simd_abi::scalar>)
       {
	// normally memcpy should work here as well
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr { return __x[__i]; });
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
       }
     else
       {
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
-	    return _V([&](auto __j) constexpr {
-	      return __x[__i * _V::size() + __j];
-	    });
-	  else
-	    return _V(__private_init,
-		      __extract_part<decltype(__i)::value, _Parts>(__data(__x)));
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
+	      return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		return __x[__i * _V::size() + __j];
+	      });
+	    else
+	      return _V(__private_init,
+			__extract_part<decltype(__i)::value, _Parts>(__data(__x)));
+	  });
       }
   }
@@ -3976,22 +3982,22 @@ template <typename _V, typename _Ap,
     else if constexpr (_V::size() <= __CHAR_BIT__ * sizeof(_ULLong))
       {
	const bitset __bits = __x.__to_bitset();
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  constexpr size_t __offset = __i * _V::size();
-	  return _V(__bitset_init, (__bits >> __offset).to_ullong());
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    constexpr size_t __offset = __i * _V::size();
+	    return _V(__bitset_init, (__bits >> __offset).to_ullong());
+	  });
       }
     else
       {
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-	  auto __i) constexpr {
-	  constexpr size_t __offset = __i * _V::size();
-	  return _V(
-	    __private_init, [&](auto __j) constexpr {
-	      return __x[__j + __offset];
-	    });
-	});
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    constexpr size_t __offset = __i * _V::size();
+	    return _V(__private_init,
+		      [&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+			return __x[__j + __offset];
+		      });
+	  });
       }
   }
@@ -4009,12 +4015,14 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
     using _V = __deduced_simd<_Tp, _N0>;
 
     if (__x._M_is_constprop())
-      return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-	auto __i) constexpr {
-	using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-	constexpr size_t __offset = _SL::_S_before(__i);
-	return _Vi([&](auto __j) constexpr { return __x[__offset + __j]; });
-      });
+      return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+	  constexpr size_t __offset = _SL::_S_before(__i);
+	  return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __x[__offset + __j];
+	  });
+	});
     else if constexpr (_Np == _N0)
       {
	static_assert(sizeof...(_Sizes) == 1);
@@ -4081,28 +4089,28 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
 #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
     const __may_alias<_Tp>* const __element_ptr
       = reinterpret_cast<const __may_alias<_Tp>*>(&__x);
-    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-      auto __i) constexpr {
-      using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-      constexpr size_t __offset = _SL::_S_before(__i);
-      constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
-      constexpr size_t __a
-	= __base_align - ((__offset * sizeof(_Tp)) % __base_align);
-      constexpr size_t __b = ((__a - 1) & __a) ^ __a;
-      constexpr size_t __alignment = __b == 0 ? __a : __b;
-      return _Vi(__element_ptr + __offset, overaligned<__alignment>);
-    });
+    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+      [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+	constexpr size_t __offset = _SL::_S_before(__i);
+	constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
+	constexpr size_t __a
+	  = __base_align - ((__offset * sizeof(_Tp)) % __base_align);
+	constexpr size_t __b = ((__a - 1) & __a) ^ __a;
+	constexpr size_t __alignment = __b == 0 ? __a : __b;
+	return _Vi(__element_ptr + __offset, overaligned<__alignment>);
+      });
 #else
-    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-      auto __i) constexpr {
-      using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-      const auto& __xx = __data(__x);
-      using _Offset = decltype(_SL::_S_before(__i));
-      return _Vi([&](auto __j) constexpr {
-	constexpr _SizeConstant<_Offset::value + __j> __k;
-	return __xx[__k];
-      });
-    });
+    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+      [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+	const auto& __xx = __data(__x);
+	using _Offset = decltype(_SL::_S_before(__i));
+	return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  constexpr _SizeConstant<_Offset::value + __j> __k;
+	  return __xx[__k];
+	});
+      });
 #endif
   }
@@ -4144,8 +4152,9 @@ template <typename _Tp, typename... _As, typename = __detail::__odr_helper>
       return simd_cast<_Rp>(__xs...);
     else if ((... && __xs._M_is_constprop()))
       return simd<_Tp,
-		  simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>([&](
-	auto __i) constexpr { return __subscript_in_pack<__i>(__xs...); });
+		  simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>(
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{ return __subscript_in_pack<__i>(__xs...); });
     else
       {
	_Rp __r{};
@@ -4161,9 +4170,10 @@ template <typename _Tp, typename _Abi, size_t _Np>
   _GLIBCXX_SIMD_CONSTEXPR __deduced_simd<_Tp, simd_size_v<_Tp, _Abi> * _Np>
   concat(const array<simd<_Tp, _Abi>, _Np>& __x)
   {
-    return __call_with_subscripts<_Np>(__x, [](const auto&... __xs) {
-      return concat(__xs...);
-    });
+    return __call_with_subscripts<_Np>(
+      __x, [](const auto&... __xs) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return concat(__xs...);
+      });
   }
 
 // }}}
@@ -4696,7 +4706,7 @@ template <typename _Tp, typename _Abi>
     simd_mask(_PrivateInit, _Fp&& __gen)
     : _M_data()
     {
-      __execute_n_times<size()>([&](auto __i) constexpr {
+      __execute_n_times<size()>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	_Impl::_S_set(_M_data, __i, __gen(__i));
       });
     }
@@ -4882,7 +4892,9 @@ template <typename _Tp, typename _Abi>
     if (__builtin_is_constant_evaluated() || __k._M_is_constprop())
       {
	const int __r = __call_with_subscripts<simd_size_v<_Tp, _Abi>>(
-	  __k, [](auto... __elements) { return ((__elements != 0) + ...); });
+	  __k, [](auto... __elements) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return ((__elements != 0) + ...);
+	  });
	if (__builtin_is_constant_evaluated() || __builtin_constant_p(__r))
	  return __r;
       }
@@ -4897,8 +4909,11 @@ template <typename _Tp, typename _Abi>
   {
     constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
     const size_t _Idx = __call_with_n_evaluations<_Np>(
-      [](auto... __indexes) { return std::min({__indexes...}); },
-      [&](auto __i) { return __k[__i] ? +__i : _Np; });
+      [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	return std::min({__indexes...});
+      }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return __k[__i] ? +__i : _Np;
+      });
     if (_Idx >= _Np)
       __invoke_ub("find_first_set(empty mask) is UB");
     if (__builtin_constant_p(_Idx))
@@ -4915,8 +4930,11 @@ template <typename _Tp, typename _Abi>
   {
     constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
     const int _Idx = __call_with_n_evaluations<_Np>(
-      [](auto... __indexes) { return std::max({__indexes...}); },
-      [&](auto __i) { return __k[__i] ? int(__i) : -1; });
+      [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return std::max({__indexes...});
+      }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return __k[__i] ? int(__i) : -1;
+      });
     if (_Idx < 0)
       __invoke_ub("find_first_set(empty mask) is UB");
     if (__builtin_constant_p(_Idx))
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index 5901dee..119e0de 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -194,8 +194,11 @@ template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
       using _Up = decltype(__w);
       return __intrin_bitcast<_Tp>(
	__call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
-	  [](auto... __chunks) { return _Up{__chunks...}; },
-	  [&](auto __i) { return __w[__shift / __chunksize + __i]; }));
+	  [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return _Up{__chunks...};
+	  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __w[__shift / __chunksize + __i];
+	  }));
     }
 }
@@ -225,7 +228,9 @@ template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
   //		  by _Total");
   if (__x._M_is_constprop())
     return __generate_from_n_evaluations<__return_size, _R>(
-      [&](auto __i) { return __x[__values_to_skip + __i]; });
+      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return __x[__values_to_skip + __i];
+      });
   if constexpr (_Index == 0 && _Total == 1)
     return __x;
   else if constexpr (_Index == 0)
@@ -570,7 +575,9 @@ template <typename _To,
       constexpr auto _Np
	= _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
       return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
-	[&](auto __i) { return static_cast<_To>(__v[__i + _Offset]); });
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return static_cast<_To>(__v[__i + _Offset]);
+	});
     }
   else
     {
@@ -611,13 +618,14 @@ template <typename _To,
	    return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
	  };
       [[maybe_unused]] const auto __vi = __to_intrin(__v);
-      auto&& __make_array = [](auto __x0, [[maybe_unused]] auto __x1) {
-	if constexpr (_Np == 1)
-	  return _R{__intrin_bitcast<_To>(__x0)};
-	else
-	  return _R{__intrin_bitcast<_To>(__x0),
-		    __intrin_bitcast<_To>(__x1)};
-      };
+      auto&& __make_array
+	= [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    if constexpr (_Np == 1)
+	      return _R{__intrin_bitcast<_To>(__x0)};
+	    else
+	      return _R{__intrin_bitcast<_To>(__x0),
+			__intrin_bitcast<_To>(__x1)};
+	  };
 
       if constexpr (_Np == 0)
	return _R{};
@@ -642,7 +650,7 @@ template <typename _To,
	    = __convert_all<__vector_type16_t<int>, _Np>(
		__adjust(_SizeConstant<_Np * 4>(), __v));
	  return __generate_from_n_evaluations<_Np, _R>(
-	    [&](auto __i) {
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	      return __vector_convert<_To>(__as_wrapper(__ints[__i]));
	    });
	}
@@ -687,36 +695,40 @@ template <typename _To,
	    __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
	    __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
	  if constexpr (sizeof(_ToT) == 4)
-	    return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	      return __vector_convert<_To>(
-		_SimdWrapper<int, 4>(__vvvv[__i] >> 24));
-	    });
+	    return __generate_from_n_evaluations<_Np, _R>(
+	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		return __vector_convert<_To>(
+		  _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
+	      });
	  else if constexpr (is_integral_v<_ToT>)
-	    return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	      const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
-	      const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
-	      return __vector_bitcast<_ToT>(
-		__i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
-			     : _mm_unpackhi_epi32(__sx32, __signbits));
-	    });
+	    return __generate_from_n_evaluations<_Np, _R>(
+	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
+		const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
+		return __vector_bitcast<_ToT>(
+		  __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
+			       : _mm_unpackhi_epi32(__sx32, __signbits));
+	      });
	  else
-	    return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	      const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
-	      return __vector_convert<_To>(
-		__i % 2 == 0 ? __int4
-			     : _SimdWrapper<int, 4>(
-				 _mm_unpackhi_epi64(__to_intrin(__int4),
						    __to_intrin(__int4))));
-	    });
+	    return __generate_from_n_evaluations<_Np, _R>(
+	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
+		return __vector_convert<_To>(
+		  __i % 2 == 0 ? __int4
+			       : _SimdWrapper<int, 4>(
+				   _mm_unpackhi_epi64(__to_intrin(__int4),
						      __to_intrin(__int4))));
+	      });
	}
       else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
	{
	  const auto __shorts = __convert_all<__vector_type16_t<
	    conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
	    __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
-	  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	    return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
-	  });
+	  return __generate_from_n_evaluations<_Np, _R>(
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
+	    });
	}
       else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
			  && is_signed_v<_FromT> && is_integral_v<_ToT>)
@@ -736,9 +748,10 @@ template <typename _To,
	    __vector_bitcast<int>(
	      _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
				 _mm_srai_epi32(__vv[1], 31)))};
-	  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	    return __vector_bitcast<_ToT>(__vvvv[__i]);
-	  });
+	  return __generate_from_n_evaluations<_Np, _R>(
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __vector_bitcast<_ToT>(__vvvv[__i]);
+	    });
	}
       else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
	{
@@ -747,9 +760,10 @@ template <typename _To,
	    is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
	    unsigned int>>>(
	    __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
-	  return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-	    return __convert_all<_To>(__ints[__i / 2])[__i % 2];
-	  });
+	  return __generate_from_n_evaluations<_Np, _R>(
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __convert_all<_To>(__ints[__i / 2])[__i % 2];
+	    });
	}
       else
	__assert_unreachable<_To>();
@@ -779,14 +793,14 @@ template <typename _To,
	    __extract_part<_Offset, _FromVT::_S_partial_width,
			   _ToVT::_S_full_size>(__v))};
       else
-	return __generate_from_n_evaluations<_Np, _R>([&](
-	  auto __i) constexpr {
-	  auto __part
-	    = __extract_part<__i * _ToVT::_S_full_size + _Offset,
-			     _FromVT::_S_partial_width,
-			     _ToVT::_S_full_size>(__v);
-	  return __vector_convert<_To>(__part);
-	});
+	return __generate_from_n_evaluations<_Np, _R>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    auto __part
+	      = __extract_part<__i * _ToVT::_S_full_size + _Offset,
+			       _FromVT::_S_partial_width,
+			       _ToVT::_S_full_size>(__v);
+	    return __vector_convert<_To>(__part);
+	  });
     }
   else if constexpr (_Offset == 0)
     return array<_To, 1>{__vector_convert<_To>(__v)};
@@ -1017,8 +1031,9 @@ template <int _UsedBytes>
       else
	{
	  constexpr auto __size = _S_size<_Tp>;
-	  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r = __generate_vector<_UV>(
-	    [](auto __i) constexpr { return __i < __size ? -1 : 0; });
+	  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
+	    = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+				     { return __i < __size ? -1 : 0; });
	  return __r;
	}
     }
@@ -1208,7 +1223,7 @@ template <int _UsedBytes>
       if constexpr (is_integral_v<typename _TVT::value_type>)
	return __x
	       | __generate_vector<_Tp, _S_full_size<_Tp>>(
-		   [](auto __i) -> _Tp {
+		   [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
		     if (__i < _Np)
		       return 0;
		     else
@@ -1348,26 +1363,27 @@ struct _CommonImplBuiltin
       }
     else
       {
-	__execute_n_times<__div_roundup(_Np, 4)>([&](auto __i) {
-	  constexpr int __offset = __i * 4;
-	  constexpr int __remaining = _Np - __offset;
-	  if constexpr (__remaining > 4 && __remaining <= 7)
-	    {
-	      const _ULLong __bool7
-		= (__x.template _M_extract<__offset>()._M_to_bits()
		   * 0x40810204081ULL)
-		  & 0x0101010101010101ULL;
-	      _S_store<__remaining>(__bool7, __mem + __offset);
-	    }
-	  else if constexpr (__remaining >= 4)
-	    {
-	      int __bits = __x.template _M_extract<__offset>()._M_to_bits();
-	      if constexpr (__remaining > 7)
-		__bits &= 0xf;
-	      const int __bool4 = (__bits * 0x204081) & 0x01010101;
-	      _S_store<4>(__bool4, __mem + __offset);
-	    }
-	});
+	__execute_n_times<__div_roundup(_Np, 4)>(
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    constexpr int __offset = __i * 4;
+	    constexpr int __remaining = _Np - __offset;
+	    if constexpr (__remaining > 4 && __remaining <= 7)
+	      {
+		const _ULLong __bool7
+		  = (__x.template _M_extract<__offset>()._M_to_bits()
		     * 0x40810204081ULL)
+		    & 0x0101010101010101ULL;
+		_S_store<__remaining>(__bool7, __mem + __offset);
+	      }
+	    else if constexpr (__remaining >= 4)
+	      {
+		int __bits = __x.template _M_extract<__offset>()._M_to_bits();
+		if constexpr (__remaining > 7)
+		  __bits &= 0xf;
+		const int __bool4 = (__bits * 0x204081) & 0x01010101;
+		_S_store<4>(__bool4, __mem + __offset);
+	      }
+	  });
       }
   }
@@ -1434,13 +1450,13 @@ template <typename _Abi, typename>
     inline static constexpr _SimdMember<_Tp>
     _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
     {
-      return __generate_vector<_Tp, _S_full_size<_Tp>>([&](
-	auto __i) constexpr {
-	if constexpr (__i < _S_size<_Tp>)
-	  return __gen(__i);
-	else
-	  return 0;
-      });
+      return __generate_vector<_Tp, _S_full_size<_Tp>>(
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if constexpr (__i < _S_size<_Tp>)
+	    return __gen(__i);
+	  else
+	    return 0;
+	});
     }
 
     // _S_load {{{2
@@ -1455,10 +1471,10 @@ template <typename _Abi, typename>
	  : 16;
       constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
       if constexpr (sizeof(_Up) > 8)
-	return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
-	  auto __i) constexpr {
-	  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
-	});
+	return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
+	  });
       else if constexpr (is_same_v<_Up, _Tp>)
	return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
					     _Np * sizeof(_Tp)>(__mem);
	  constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
	  constexpr size_t __elements_per_load = _Np / __n_loads;
	  return __call_with_n_evaluations<__n_loads>(
-	    [](auto... __uncvted) {
-	      return __convert<_SimdMember<_Tp>>(__uncvted...);
-	    },
-	    [&](auto __i) {
-	      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
		__mem + __i * __elements_per_load);
-	    });
+	    [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __convert<_SimdMember<_Tp>>(__uncvted...);
+	    }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
		__mem + __i * __elements_per_load);
+	    });
	}
       else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
			  && __max_load_size > 16)
@@ -1485,20 +1500,19 @@ template <typename _Abi, typename>
	    = __bytes_to_load / (__max_load_size / 2);
	  constexpr size_t __elements_per_load = _Np / __n_loads;
	  return __call_with_n_evaluations<__n_loads>(
-	    [](auto... __uncvted) {
-	      return __convert<_SimdMember<_Tp>>(__uncvted...);
-	    },
-	    [&](auto __i) {
-	      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
		__mem + __i * __elements_per_load);
-	    });
+	    [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __convert<_SimdMember<_Tp>>(__uncvted...);
+	    }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
		__mem + __i * __elements_per_load);
+	    });
	}
       else // e.g. int[] -> <char, 9>
	return __call_with_subscripts(
-	  __mem, make_index_sequence<_Np>(), [](auto... __args) {
-	    return __vector_type_t<_Tp, _S_full_size<_Tp>>{
-	      static_cast<_Tp>(__args)...};
-	  });
+	  __mem, make_index_sequence<_Np>(),
+	  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
+	  });
     }
 
     // _S_masked_load {{{2
@@ -1507,9 +1521,10 @@ template <typename _Abi, typename>
     _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
		    const _Up* __mem) noexcept
     {
-      _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), [&](auto __i) {
-	__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
-      });
+      _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
+	});
       return __merge;
     }
@@ -1523,7 +1538,7 @@ template <typename _Abi, typename>
       constexpr size_t __max_store_size
	= _SuperImpl::template _S_max_store_size<_Up>;
       if constexpr (sizeof(_Up) > 8)
-	__execute_n_times<_Np>([&](auto __i) constexpr {
+	__execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __mem[__i] = __v[__i];
	});
       else if constexpr (is_same_v<_Up, _Tp>)
@@ -1540,9 +1555,10 @@ template <typename _Abi, typename>
	  using _V = __vector_type_t<_Up, __vsize>;
	  const array<_V, __stores> __converted
	    = __convert_all<_V, __stores>(__v);
-	  __execute_n_times<__full_stores>([&](auto __i) constexpr {
-	    _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
-	  });
+	  __execute_n_times<__full_stores>(
+	    [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
+	    });
	  if constexpr (__full_stores < __stores)
	    _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
					     * sizeof(_Up)>(
@@ -1557,7 +1573,8 @@ template <typename _Abi, typename>
		    _MaskMember<_Tp> __k)
     {
       _BitOps::_S_bit_iteration(
-	_MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
+	_MaskImpl::_S_to_bits(__k),
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __mem[__i] = __v[__i];
	});
     }
@@ -1579,7 +1596,7 @@ template <typename _Abi, typename>
			  _Up> || (is_integral_v<_Tp> && is_integral_v<_Up>
				     && sizeof(_Tp) == sizeof(_Up)))
	{
	  // bitwise or no conversion, reinterpret:
-	  const _MaskMember<_Up> __kk = [&]() {
+	  const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	    if constexpr (__is_bitmask_v<decltype(__k)>)
	      return _MaskMember<_Up>(__k._M_data);
	    else
@@ -1618,7 +1635,7 @@ template <typename _Abi, typename>
	  constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
	  const array<_UV, _NAllStores> __converted
	    = __convert_all<_UV, _NAllStores>(__v);
-	  __execute_n_times<_NFullStores>([&](auto __i) {
+	  __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	    _SuperImpl::_S_masked_store_nocvt(
	      _UW(__converted[__i]), __mem + __i * _UW_size,
	      _UAbi::_MaskImpl::template _S_convert<
@@ -1637,10 +1654,10 @@ template <typename _Abi, typename>
	    }
	}
       else
-	_BitOps::_S_bit_iteration(
-	  _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
-	    __mem[__i] = static_cast<_Up>(__v[__i]);
-	  });
+	_BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    __mem[__i] = static_cast<_Up>(__v[__i]);
+	  });
     }
 
     // _S_complement {{{2
@@ -1932,7 +1949,9 @@ template <typename _Abi, typename>
       static _Tp _S_##__name(const _Tp& __x, const _More&... __more)         \
       {                                                                      \
	return __generate_vector<_Tp>(                                        \
-	  [&](auto __i) { return __name(__x[__i], __more[__i]...); });        \
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                  \
+	    return __name(__x[__i], __more[__i]...);                          \
+	  });                                                                 \
       }
 
 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name)                          \
@@ -1941,23 +1960,25 @@ template <typename _Abi, typename>
				const _More&... __more)                       \
       {                                                                      \
	return __generate_vector<_Tp>(                                        \
-	  [&](auto __i) { return __name(__x[__i], __more[__i]...); });        \
-      }
-
-#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                 \
-  template <typename _Tp, typename... _More>                                 \
-    static auto _S_##__name(const _Tp& __x, const _More&... __more)          \
-    {                                                                        \
-      return __fixed_size_storage_t<_RetTp,                                  \
				     _VectorTraits<_Tp>::_S_partial_width>::  \
-	_S_generate([&](auto __meta) constexpr {                             \
-	  return __meta._S_generator(                                        \
-	    [&](auto __i) {                                                  \
-	      return __name(__x[__meta._S_offset + __i],                     \
			     __more[__meta._S_offset + __i]...);              \
-	    },                                                               \
-	    static_cast<_RetTp*>(nullptr));                                  \
-	});                                                                  \
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                  \
+	    return __name(__x[__i], __more[__i]...);                          \
+	  });                                                                 \
+      }
+
+#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                 \
+  template <typename _Tp, typename... _More>                                 \
+    static auto _S_##__name(const _Tp& __x, const _More&... __more)          \
+    {                                                                        \
+      return __fixed_size_storage_t<_RetTp,                                  \
				     _VectorTraits<_Tp>::_S_partial_width>::  \
+	_S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+	  return __meta._S_generator(                                        \
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                \
+	      return __name(__x[__meta._S_offset + __i],                     \
			     __more[__meta._S_offset + __i]...);              \
+	    },                                                               \
+	    static_cast<_RetTp*>(nullptr));                                  \
+	});                                                                  \
     }
 
     _GLIBCXX_SIMD_MATH_FALLBACK(acos)
@@ -2010,7 +2031,7 @@ template <typename _Abi, typename>
       _S_remquo(const _Tp __x, const _Tp __y,
		 __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
       {
-	return __generate_vector<_Tp>([&](auto __i) {
+	return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  int __tmp;
	  auto __r = remquo(__x[__i], __y[__i], &__tmp);
	  __z->_M_set(__i, __tmp);
@@ -2423,7 +2444,7 @@ template <typename _Abi, typename>
 #endif // _GLIBCXX_SIMD_X86INTRIN
     else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
       return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
-	       [](auto... __l) {
+	       [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		 return __make_wrapper<int>(__l...);
	       })};
     else
@@ -2554,13 +2575,13 @@ struct _MaskImplBuiltinMixin
     _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
     {
       static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
-      return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
-	auto __i) constexpr {
-	if constexpr (__i < _Np)
-	  return __x[__i] ? ~_Up() : _Up();
-	else
-	  return _Up();
-      });
+      return __generate_vector<__vector_type_t<_Up, _ToN>>(
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if constexpr (__i < _Np)
+	    return __x[__i] ? ~_Up() : _Up();
+	  else
+	    return _Up();
+	});
     }
 
   template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
@@ -2601,13 +2622,13 @@ struct _MaskImplBuiltinMixin
			       -1, -1, -1, -1, -1>(__y);
	  else
	  */
	{
-	  return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
-	    auto __i) constexpr {
-	    if constexpr (__i < _Np)
-	      return _Up(__x[__i.value]);
-	    else
-	      return _Up();
-	  });
+	  return __generate_vector<__vector_type_t<_Up, _ToN>>(
+	    [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      if constexpr (__i < _Np)
+		return _Up(__x[__i.value]);
+	      else
+		return _Up();
+	    });
	}
     }
   }
@@ -2625,7 +2646,9 @@ struct _MaskImplBuiltinMixin
	= __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
       _ULLong __r = 0;
       __execute_n_times<_Np>(
-	[&](auto __i) { __r |= _ULLong(__bools[__i.value]) << __i; });
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  __r |= _ULLong(__bools[__i.value]) << __i;
+	});
       return __r;
     }
@@ -2677,7 +2700,10 @@ template <typename _Abi, typename>
	  return __bools > 0;
	}
       else
-	return __generate_vector<_I, _S_size<_Tp>>([&](auto __i) constexpr {
-	  return __mem[__i] ? ~_I() : _I();
-	});
+	return __generate_vector<_I, _S_size<_Tp>>(
+	  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __mem[__i] ? ~_I() : _I();
+	  });
     }
 
     // }}}
@@ -2752,7 +2776,7 @@ template <typename _Abi, typename>
	  // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
	  auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
	  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
-				    [&](auto __i) {
+				    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
				      __tmp._M_set(__i, -__mem[__i]);
				    });
	  __merge = __wrapper_bitcast<_Tp>(__tmp);
@@ -2764,7 +2788,7 @@ template <typename _Abi, typename>
     _GLIBCXX_SIMD_INTRINSIC static void
     _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
     {
-      __execute_n_times<_Np>([&](auto __i) constexpr {
+      __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	__mem[__i] = __v[__i];
       });
     }
@@ -2775,10 +2799,10 @@ template <typename _Abi, typename>
     _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
		     const _SimdWrapper<_Tp, _Np> __k) noexcept
     {
-      _BitOps::_S_bit_iteration(
-	_SuperImpl::_S_to_bits(__k), [&](auto __i) constexpr {
-	  __mem[__i] = __v[__i];
-	});
+      _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
+	[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  __mem[__i] = __v[__i];
+	});
     }
 
     // _S_from_bitmask{{{2
@@ -2845,7 +2869,7 @@ template <typename _Abi, typename>
	{
	  __k = __generate_from_n_evaluations<_Np,
					      __vector_type_t<_Tp, _Np>>(
-	    [&](auto __j) {
+	    [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	      if (__i == static_cast<int>(__j))
		return _Tp(-__x);
	      else
@@ -2890,7 +2914,8 @@ template <typename _Abi, typename>
     {
       return __call_with_subscripts(
	__data(__k), make_index_sequence<_S_size<_Tp>>(),
-	[](const auto... __ent) constexpr { return (... && !(__ent == 0)); });
+	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{ return (... && !(__ent == 0)); });
     }
 
     // }}}
@@ -2901,7 +2926,8 @@ template <typename _Abi, typename>
     {
       return __call_with_subscripts(
	__data(__k), make_index_sequence<_S_size<_Tp>>(),
-	[](const auto... __ent) constexpr { return (... || !(__ent == 0)); });
+	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{ return (... || !(__ent == 0)); });
     }
 
     // }}}
@@ -2912,7 +2938,8 @@ template <typename _Abi, typename>
     {
       return __call_with_subscripts(
	__data(__k), make_index_sequence<_S_size<_Tp>>(),
-	[](const auto... __ent) constexpr { return (... && (__ent == 0)); });
+	[](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{ return (... && (__ent == 0)); });
     }
 
     // }}}
diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h
index f6e8569..cfca421 100644
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@@ -121,7 +121,7 @@ template <typename _From, typename _To, int _Np>
   {
     return __call_with_subscripts(
       __x, make_index_sequence<_Np>(),
-      [](auto... __values) constexpr->_Ret {
+      [](auto... __values) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Ret {
	return __make_simd_tuple<_To, decltype((void) __values,
					       simd_abi::scalar())...>(
	  static_cast<_To>(__values)...);
@@ -233,7 +233,9 @@ template <typename _From, typename _To, int _Np>
	static_assert(_Ret::_FirstAbi::template _S_is_partial<_To>);
	return _Ret{__generate_from_n_evaluations<
	  _Np, typename _VectorTraits<typename _Ret::_FirstType>::type>(
-	  [&](auto __i) { return static_cast<_To>(__x[__i]); })};
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return static_cast<_To>(__x[__i]);
+	  })};
       }
     else
       {
@@ -241,7 +243,7 @@ template <typename _From, typename _To, int _Np>
	constexpr auto __n
	  = __div_roundup(_Ret::_S_first_size, _Arg::_S_first_size);
	return __call_with_n_evaluations<__n>(
-	  [&__x](auto... __uncvted) {
+	  [&__x](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	    // assuming _Arg Abi tags for all __i are _Arg::_FirstAbi
	    _SimdConverter<_From, typename _Arg::_FirstAbi, _To,
			   typename _Ret::_FirstAbi>
@@ -255,8 +257,9 @@ template <typename _From, typename _To, int _Np>
		  _From, simd_abi::fixed_size<_Np - _Ret::_S_first_size>, _To,
		  simd_abi::fixed_size<_Np - _Ret::_S_first_size>>()(
		  __simd_tuple_pop_front<_Ret::_S_first_size>(__x))};
-	  },
-	  [&__x](auto __i) { return __get_tuple_at<__i>(__x); });
+	  }, [&__x](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __get_tuple_at<__i>(__x);
+	  });
       }
   }
 };
@@ -322,13 +325,14 @@ template <typename _From, int _Np, typename _To, typename _Ap>
       return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
     else if constexpr (_Arg::_S_is_homogeneous)
       return __call_with_n_evaluations<_Arg::_S_tuple_size>(
-	[](auto... __members) {
+	[](auto... __members) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  if constexpr ((is_convertible_v<decltype(__members), _To> && ...))
	    return __vector_type_t<_To, _Np>{static_cast<_To>(__members)...};
	  else
	    return __vector_convert<__vector_type_t<_To, _Np>>(__members...);
-	},
-	[&](auto __i) { return __get_tuple_at<__i>(__x); });
+	}, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __get_tuple_at<__i>(__x);
+	});
     else if constexpr (__fixed_size_storage_t<_To, _Np>::_S_tuple_size == 1)
       {
	_SimdConverter<_From, simd_abi::fixed_size<_Np>, _To,
@@ -340,7 +344,7 @@ template <typename _From, int _Np, typename _To, typename _Ap>
       {
	const _SimdWrapper<_From, _Np> __xv
	  = __generate_from_n_evaluations<_Np, __vector_type_t<_From, _Np>>(
-	    [&](auto __i) { return __x[__i]; });
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
	return __vector_convert<__vector_type_t<_To, _Np>>(__xv);
       }
   }
diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h
index de8d018..bd4e380 100644
--- a/libstdc++-v3/include/experimental/bits/simd_detail.h
+++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
@@ -262,6 +262,7 @@
 #define _GLIBCXX_SIMD_INTRINSIC                                              \
   [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
 #define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))
 
 #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
 #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
@@ -294,6 +295,8 @@
 #ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE
 #undef _GLIBCXX_SIMD_ALWAYS_INLINE
 #define _GLIBCXX_SIMD_ALWAYS_INLINE inline
+#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
 #undef _GLIBCXX_SIMD_INTRINSIC
 #define _GLIBCXX_SIMD_INTRINSIC inline
 #endif
diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
index 7bb248c..eb71a7b 100644
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -434,14 +434,15 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
       if constexpr (is_same_v<_SimdTuple, __remove_cvref_t<_Tup>>)
	return __tup.first;
       else if (__builtin_is_constant_evaluated())
-	return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate([&](
-	  auto __meta) constexpr {
-	  return __meta._S_generator(
-	    [&](auto __i) constexpr { return __tup[__i]; },
-	    static_cast<_TupT*>(nullptr));
+	return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate(
+	  [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	    return __meta._S_generator(
+	      [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		return __tup[__i];
+	      }, static_cast<_TupT*>(nullptr));
	  });
       else
-	return [&]() {
+	return [&]() { // not always_inline; allow the compiler to decide
	  __fixed_size_storage_t<_TupT, _S_first_size> __r;
	  __builtin_memcpy(__r._M_as_charptr(), __tup._M_as_charptr(),
			   sizeof(__r));
@@ -515,12 +516,11 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
			 negation<is_const<remove_reference_t<_More>>>>) )
	{
	  // need to write back at least one of __more after calling __fun
-	  auto&& __first = [&](auto... __args) constexpr
-	  {
+	  auto&& __first = [&](auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	    auto __r = __fun(__tuple_element_meta<_Tp, _Abi0, 0>(), first,
			     __args...);
	    [[maybe_unused]] auto&& __ignore_me = {(
-	      [](auto&& __dst, const auto& __src) {
+	      [](auto&& __dst, const auto& __src) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		if constexpr (is_assignable_v<decltype(__dst),
					      decltype(__dst)>)
		  {
@@ -530,8 +530,7 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
	      }(static_cast<_More&&>(__more), __args), 0)...};
	    return __r;
-	  }
-	  (_M_extract_argument(__more)...);
+	  }(_M_extract_argument(__more)...);
	  if constexpr (_S_tuple_size == 1)
	    return {__first};
	  else
@@ -776,18 +775,18 @@ template <typename _Tp, size_t _Np, typename _V, size_t _NV, typename... _VX>
	sizeof...(_VX) == 0,
	"An array of scalars must be the last argument to __to_simd_tuple");
       return __call_with_subscripts(
-	__from,
-	make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
-	  return __simd_tuple_concat(
-	    _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
-	});
+	__from, make_index_sequence<_NV>(),
+	[&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __simd_tuple_concat(
+	    _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
+	});
     }
   else
     return __call_with_subscripts(
-      __from,
-      make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
-	return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
-      });
+      __from, make_index_sequence<_NV>(),
+      [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
+      });
 }
 
 template <size_t, typename _Tp>
@@ -841,7 +840,7 @@ template <typename _Tp, typename _A0, typename _A1, typename... _Abis,
		     || _A0::template _S_is_partial<_Tp>)
    return {__generate_from_n_evaluations<_R::_S_first_size,
					  typename _R::_FirstType>(
-	     [&](auto __i) { return __x[__i]; }),
+	     [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }),
	   __optimize_simd_tuple(
	     __simd_tuple_pop_front<_R::_S_first_size>(__x))};
  else if constexpr (is_same_v<_A0, _A1>
@@ -994,10 +993,11 @@ template <int _Index, int _Total, int _Combine, typename _Tp, typename _A0,
       return __as_vector(simd<_Tp, _RetAbi>(element_ptr, element_aligned));
 #else
       [[maybe_unused]] constexpr size_t __offset = __values_to_skip;
-      return __as_vector(simd<_Tp, _RetAbi>([&](auto __i) constexpr {
-	constexpr _SizeConstant<__i + __offset> __k;
-	return __x[__k];
-      }));
+      return __as_vector(simd<_Tp, _RetAbi>(
+	       [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		 constexpr _SizeConstant<__i + __offset> __k;
+		 return __x[__k];
+	       }));
 #endif
     }
@@ -1286,9 +1286,10 @@ template <int _Np, typename>
   template <typename _Tp>
     static constexpr inline _SimdMember<_Tp>
     _S_broadcast(_Tp __x) noexcept
    {
-      return _SimdMember<_Tp>::_S_generate([&](auto __meta) constexpr {
-	return __meta._S_broadcast(__x);
-      });
+      return _SimdMember<_Tp>::_S_generate(
+	[&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __meta._S_broadcast(__x);
+	});
    }
 
    // _S_generator {{{2
@@ -1296,14 +1297,15 @@ template <int _Np, typename>
    static constexpr inline _SimdMember<_Tp>
    _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
    {
-      return _SimdMember<_Tp>::_S_generate([&__gen](auto __meta) constexpr {
-	return __meta._S_generator(
-	  [&](auto __i) constexpr {
-	    return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
			     : 0;
-	  },
-	  _TypeTag<_Tp>());
-      });
+      return _SimdMember<_Tp>::_S_generate(
+	[&__gen](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __meta._S_generator(
+	    [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	      return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
			       : 0;
+	    },
+	    _TypeTag<_Tp>());
+	});
    }
 
    // _S_load {{{2
@@ -1311,9 +1313,10 @@ template <int _Np, typename>
    static inline _SimdMember<_Tp>
    _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
    {
-      return _SimdMember<_Tp>::_S_generate([&](auto __meta) {
-	return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
-      });
+      return _SimdMember<_Tp>::_S_generate(
+	[&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
+	});
    }
 
    // _S_masked_load {{{2
@@ -1323,7 +1326,7 @@ template <int _Np, typename>
		   const _MaskMember __bits, const _Up* __mem) noexcept
    {
      auto __merge = __old;
-      __for_each(__merge, [&](auto __meta, auto& __native) {
+      __for_each(__merge, [&](auto __meta, auto& __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	if (__meta._S_submask(__bits).any())
 #pragma GCC diagnostic push
	  // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1344,7 +1347,7 @@ template <int _Np, typename>
    static inline void
    _S_store(const _SimdMember<_Tp>& __v, _Up* __mem, _TypeTag<_Tp>) noexcept
    {
-      __for_each(__v, [&](auto __meta, auto __native) {
+      __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	__meta._S_store(__native, &__mem[__meta._S_offset], _TypeTag<_Tp>());
      });
    }
@@ -1355,7 +1358,7 @@ template <int _Np, typename>
		    _Up* __mem,
		    const _MaskMember __bits) noexcept
    {
-      __for_each(__v, [&](auto __meta, auto __native) {
+      __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	if (__meta._S_submask(__bits).any())
 #pragma GCC diagnostic push
	  // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1376,7 +1379,7 @@ template <int _Np, typename>
    {
      _MaskMember __bits = 0;
      __for_each(
-	__x, [&__bits](auto __meta, auto __native) constexpr {
+	__x, [&__bits](auto __meta, auto __native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __bits
	    |= __meta._S_mask_to_shifted_ullong(__meta._S_negate(__native));
	});
@@ -1414,7 +1417,7 @@ template <int _Np, typename>
	{
	  const auto& __x2 = __call_with_n_evaluations<
	    __div_roundup(_Tup::_S_tuple_size, 2)>(
-	    [](auto __first_simd, auto... __remaining) {
+	    [](auto __first_simd, auto... __remaining) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	      if constexpr (sizeof...(__remaining) == 0)
		return __first_simd;
	      else
@@ -1428,7 +1431,7 @@ template <int _Np, typename>
		    __make_simd_tuple(__first_simd, __remaining...));
		}
	    },
-	    [&](auto __i) {
+	    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	      auto __left = __tup.template _M_simd_at<2 * __i>();
	      if constexpr (2 * __i + 1 == _Tup::_S_tuple_size)
		return __left;
@@ -1444,7 +1447,9 @@ template <int _Np, typename>
		  _GLIBCXX_SIMD_USE_CONSTEXPR_API
		  typename _LT::mask_type __k(
		    __private_init,
-		    [](auto __j) constexpr { return __j < _RT::size(); });
+		    [](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		      return __j < _RT::size();
+		    });
		  _LT __ext_right = __left;
		  where(__k, __ext_right)
		    = __proposed::resizing_simd_cast<_LT>(__right);
@@ -1464,7 +1469,7 @@ template <int _Np, typename>
	   const _SimdTuple<_Tp, _As...>& __b)
    {
      return __a._M_apply_per_chunk(
-	[](auto __impl, auto __aa, auto __bb) constexpr {
+	[](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  return __impl._S_min(__aa, __bb);
	},
	__b);
@@ -1476,7 +1481,7 @@ template <int _Np, typename>
	   const _SimdTuple<_Tp, _As...>& __b)
    {
      return __a._M_apply_per_chunk(
-	[](auto __impl, auto __aa, auto __bb) constexpr {
+	[](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  return __impl._S_max(__aa, __bb);
	},
	__b);
@@ -1487,9 +1492,10 @@ template <int _Np, typename>
    static inline constexpr _SimdTuple<_Tp, _As...>
    _S_complement(const _SimdTuple<_Tp, _As...>& __x) noexcept
    {
-      return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
-	return __impl._S_complement(__xx);
-      });
+      return __x._M_apply_per_chunk(
+	[](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __impl._S_complement(__xx);
+	});
    }
 
    // _S_unary_minus {{{2
@@ -1497,23 +1503,24 @@ template <int _Np, typename>
    static inline constexpr _SimdTuple<_Tp, _As...>
    _S_unary_minus(const _SimdTuple<_Tp, _As...>& __x) noexcept
    {
-      return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
-	return __impl._S_unary_minus(__xx);
-      });
+      return __x._M_apply_per_chunk(
+	[](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __impl._S_unary_minus(__xx);
+	});
    }
 
    // arithmetic operators {{{2
-#define _GLIBCXX_SIMD_FIXED_OP(name_, op_)                                   \
-  template <typename _Tp, typename... _As>                                   \
-  static inline constexpr _SimdTuple<_Tp, _As...> name_(                     \
-    const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)  \
-  {                                                                          \
-    return __x._M_apply_per_chunk(                                           \
-      [](auto __impl, auto __xx, auto __yy) constexpr {                      \
-	return __impl.name_(__xx, __yy);                                     \
-      },                                                                     \
-      __y);                                                                  \
+#define _GLIBCXX_SIMD_FIXED_OP(name_, op_)                                   \
+  template <typename _Tp, typename... _As>                                   \
+  static inline constexpr _SimdTuple<_Tp, _As...> name_(                     \
+    const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)  \
+  {                                                                          \
+    return __x._M_apply_per_chunk(                                           \
+      [](auto __impl, auto __xx, auto __yy) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+	return __impl.name_(__xx, __yy);                                     \
+      },                                                                     \
+      __y);                                                                  \
   }
 
    _GLIBCXX_SIMD_FIXED_OP(_S_plus, +)
@@ -1532,18 +1539,20 @@ template <int _Np, typename>
    static inline constexpr _SimdTuple<_Tp, _As...>
    _S_bit_shift_left(const _SimdTuple<_Tp, _As...>& __x, int __y)
    {
-      return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
-	return __impl._S_bit_shift_left(__xx, __y);
-      });
+      return __x._M_apply_per_chunk(
+	[__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __impl._S_bit_shift_left(__xx, __y);
+	});
    }
 
  template <typename _Tp, typename... _As>
    static inline constexpr _SimdTuple<_Tp, _As...>
    _S_bit_shift_right(const _SimdTuple<_Tp, _As...>& __x, int __y)
    {
-      return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
-	return __impl._S_bit_shift_right(__xx, __y);
-      });
+      return __x._M_apply_per_chunk(
+	[__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __impl._S_bit_shift_right(__xx, __y);
+	});
    }
 
    // math {{{2
@@ -1557,35 +1566,40 @@ template <int _Np, typename>
      {                                                                       \
	if constexpr (is_same_v<_Tp, _RetTp>)                                 \
	  return __x._M_apply_per_chunk(                                      \
-	    [](auto __impl, auto __xx) constexpr {                           \
-	      using _V = typename decltype(__impl)::simd_type;               \
-	      return __data(__name(_V(__private_init, __xx)));               \
-	    });                                                               \
+	    [](auto __impl, auto __xx)                                       \
+	      constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                   \
+	    {                                                                \
+	      using _V = typename decltype(__impl)::simd_type;               \
+	      return __data(__name(_V(__private_init, __xx)));               \
+	    });                                                               \
	else                                                                  \
	  return __optimize_simd_tuple(                                       \
-	    __x.template _M_apply_r<_RetTp>([](auto __impl, auto __xx) {     \
-	      return __impl._S_##__name(__xx);                               \
-	    }));                                                              \
+	    __x.template _M_apply_r<_RetTp>(                                 \
+	      [](auto __impl, auto __xx)                                     \
		_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                            \
+	      { return __impl._S_##__name(__xx); }));                        \
      }                                                                       \
    else if constexpr (                                                       \
      is_same_v<                                                              \
	_Tp,                                                                  \
	_RetTp> && (... && is_same_v<_SimdTuple<_Tp, _As...>, _More>) )       \
      return __x._M_apply_per_chunk(                                          \
-	[](auto __impl, auto __xx, auto... __pack) constexpr {               \
-	  using _V = typename decltype(__impl)::simd_type;                   \
-	  return __data(__name(_V(__private_init, __xx),                     \
			       _V(__private_init, __pack)...));               \
-	},                                                                    \
-	__more...);                                                           \
+	[](auto __impl, auto __xx, auto... __pack)                           \
	  constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                        \
+	{                                                                    \
+	  using _V = typename decltype(__impl)::simd_type;                   \
+	  return __data(__name(_V(__private_init, __xx),                     \
			       _V(__private_init, __pack)...));               \
+	}, __more...);                                                        \
    else if constexpr (is_same_v<_Tp, _RetTp>)                                \
      return __x._M_apply_per_chunk(                                          \
-	[](auto __impl, auto __xx, auto... __pack) constexpr {               \
-	  using _V = typename decltype(__impl)::simd_type;                   \
-	  return __data(__name(_V(__private_init, __xx),                     \
			       __autocvt_to_simd(__pack)...));                \
-	},                                                                    \
-	__more...);                                                           \
+	[](auto __impl, auto __xx, auto... __pack)                           \
	  constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                        \
+	{                                                                    \
+	  using _V = typename decltype(__impl)::simd_type;                   \
+	  return __data(__name(_V(__private_init, __xx),                     \
			       __autocvt_to_simd(__pack)...));                \
+	}, __more...);                                                        \
    else                                                                      \
      __assert_unreachable<_Tp>();                                            \
  }
@@ -1657,10 +1671,10 @@ template <int _Np, typename>
	      __fixed_size_storage_t<int, _SimdTuple<_Tp, _Abis...>::_S_size()>* __z)
    {
      return __x._M_apply_per_chunk(
-	[](auto __impl, const auto __xx, const auto __yy, auto& __zz) {
-	  return __impl._S_remquo(__xx, __yy, &__zz);
-	},
-	__y, *__z);
+	[](auto __impl, const auto __xx, const auto __yy, auto& __zz)
	  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{ return __impl._S_remquo(__xx, __yy, &__zz); },
+	__y, *__z);
    }
 
  template <typename _Tp, typename... _As>
@@ -1669,12 +1683,10 @@ template <int _Np, typename>
	     __fixed_size_storage_t<int, _Np>& __exp) noexcept
    {
      return __x._M_apply_per_chunk(
-	[](auto __impl, const auto& __a, auto& __b) {
-	  return __data(
-	    frexp(typename decltype(__impl)::simd_type(__private_init, __a),
		  __autocvt_to_simd(__b)));
-	},
-	__exp);
+	[](auto __impl, const auto& __a, auto& __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  return __data(frexp(typename decltype(__impl)::simd_type(__private_init, __a),
			      __autocvt_to_simd(__b)));
+	}, __exp);
    }
 
 #define _GLIBCXX_SIMD_TEST_ON_TUPLE_(name_)                                  \
@@ -1700,7 +1712,7 @@ template <int _Np, typename>
    _S_increment(_SimdTuple<_Ts...>& __x)
    {
      __for_each(
-	__x, [](auto __meta, auto& native) constexpr {
+	__x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __meta._S_increment(native);
	});
    }
@@ -1710,7 +1722,7 @@ template <int _Np, typename>
    _S_decrement(_SimdTuple<_Ts...>& __x)
    {
      __for_each(
-	__x, [](auto __meta, auto& native) constexpr {
+	__x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __meta._S_decrement(native);
	});
    }
@@ -1722,11 +1734,10 @@ template <int _Np, typename>
    __cmp(const _SimdTuple<_Tp, _As...>& __x,                                 \
	  const _SimdTuple<_Tp, _As...>& __y)                                 \
    {                                                                         \
-      return _M_test(                                                        \
-	[](auto __impl, auto __xx, auto __yy) constexpr {                    \
-	  return __impl.__cmp(__xx, __yy);                                   \
-	},                                                                    \
-	__x, __y);                                                            \
+      return _M_test([](auto __impl, auto __xx, auto __yy)                   \
		       constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA           \
		     { return __impl.__cmp(__xx, __yy); },                    \
+		     __x, __y);                                               \
    }
 
    _GLIBCXX_SIMD_CMP_OPERATIONS(_S_equal_to)
@@ -1753,12 +1764,13 @@ template <int _Np, typename>
    _S_masked_assign(const _MaskMember __bits, _SimdTuple<_Tp, _As...>& __lhs,
		     const __type_identity_t<_SimdTuple<_Tp, _As...>>& __rhs)
    {
-      __for_each(
-	__lhs, __rhs,
-	[&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
-	  __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
				  __native_rhs);
-	});
+      __for_each(__lhs, __rhs,
+	[&](auto __meta, auto& __native_lhs, auto __native_rhs)
	  constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{
+	  __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
				  __native_rhs);
+	});
    }
 
  // Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1769,7 +1781,7 @@ template <int _Np, typename>
		     const __type_identity_t<_Tp> __rhs)
    {
      __for_each(
-	__lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+	__lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
				  __rhs);
	});
@@ -1782,12 +1794,13 @@ template <int _Np, typename>
		      const _SimdTuple<_Tp, _As...>& __rhs,
		      _Op __op)
    {
-      __for_each(
-	__lhs, __rhs,
-	[&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
-	  __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
					    __native_lhs, __native_rhs, __op);
-	});
+      __for_each(__lhs, __rhs,
+	[&](auto __meta, auto& __native_lhs, auto __native_rhs)
	  constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+	{
+	  __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
					    __native_lhs, __native_rhs, __op);
+	});
    }
 
  // Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1798,7 +1811,7 @@ template <int _Np, typename>
		      const _Tp& __rhs, _Op __op)
    {
      __for_each(
-	__lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+	__lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	  __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
					    __native_lhs, __rhs, __op);
	});
@@ -1899,7 +1912,7 @@ template <int _Np, typename>
      // _Np _UShort, _UInt, _ULLong, float, and double can be more efficient.
      _ULLong __r = 0;
      using _Vs = __fixed_size_storage_t<_UChar, _Np>;
-      __for_each(_Vs{}, [&](auto __meta, auto) {
+      __for_each(_Vs{}, [&](auto __meta, auto) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	__r |= __meta._S_mask_to_shifted_ullong(
	  __meta._S_mask_impl._S_load(&__mem[__meta._S_offset],
				      _SizeConstant<__meta._S_size()>()));
@@ -1912,9 +1925,10 @@ template <int _Np, typename>
		   _MaskMember __mask,
		   const bool* __mem) noexcept
    {
-      _BitOps::_S_bit_iteration(__mask.to_ullong(), [&](auto __i) {
-	__merge.set(__i, __mem[__i]);
-      });
+      _BitOps::_S_bit_iteration(__mask.to_ullong(),
+	[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  __merge.set(__i, __mem[__i]);
+	});
      return __merge;
    }
@@ -1932,7 +1946,8 @@ template <int _Np, typename>
    static inline void
    _S_masked_store(const _MaskMember __v, bool* __mem,
		    const _MaskMember __k) noexcept
    {
-      _BitOps::_S_bit_iteration(__k, [&](auto __i) { __mem[__i] = __v[__i]; });
+      _BitOps::_S_bit_iteration(
+	__k, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; });
    }
 
    // logical and bitwise operators {{{2
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index b008139..61f9ad2 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -788,7 +788,7 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
 
    // __exponent(__x) returns the exponent value (bias removed) as
    // simd<_Up> with integral _Up
-    auto&& __exponent = [](const _V& __v) {
+    auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
      using namespace std::experimental::__proposed;
      using _IV = rebind_simd_t<
	conditional_t<sizeof(_Tp) == sizeof(_LLong), _LLong, int>, _V>;
@@ -931,7 +931,7 @@ template <typename _R, typename _ToApply, typename _Tp, typename... _Tps>
    {
      return {__private_init,
	      __data(__arg0)._M_apply_per_chunk(
-		[&](auto __impl, const auto&...
__inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { using _V = typename decltype(__impl)::simd_type; return __data(__apply(_V(__private_init, __inner)...)); }, @@ -1092,8 +1092,9 @@ _GLIBCXX_SIMD_CVTING2(hypot) if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1) { return __fixed_size_apply<simd<_Tp, _Abi>>( - [](auto __a, auto __b, auto __c) { return hypot(__a, __b, __c); }, - __x, __y, __z); + [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return hypot(__a, __b, __c); + }, __x, __y, __z); } else { @@ -1380,9 +1381,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); + }); } template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> @@ -1391,9 +1392,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); + }); } _GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp) @@ -1414,8 +1415,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> hermite(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::hermite(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::hermite(__n[__i], __x[__i]); + }); } template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> @@ -1423,8 +1425,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::laguerre(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::laguerre(__n[__i], __x[__i]); + }); } template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> @@ -1432,8 +1435,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::legendre(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::legendre(__n[__i], __x[__i]); + }); } _GLIBCXX_SIMD_MATH_CALL_(riemann_zeta) @@ -1443,8 +1447,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> sph_bessel(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::sph_bessel(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::sph_bessel(__n[__i], __x[__i]); + }); } template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> @@ -1453,9 +1458,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> const 
fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, const simd<_Tp, _Abi>& theta) { - return simd<_Tp, _Abi>([&](auto __i) { - return std::assoc_legendre(__l[__i], __m[__i], theta[__i]); - }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::assoc_legendre(__l[__i], __m[__i], theta[__i]); + }); } template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> @@ -1463,8 +1468,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> sph_neumann(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, const simd<_Tp, _Abi>& __x) { - return simd<_Tp, _Abi>( - [&](auto __i) { return std::sph_neumann(__n[__i], __x[__i]); }); + return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return std::sph_neumann(__n[__i], __x[__i]); + }); } // }}} diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h index 0945092..c809addd 100644 --- a/libstdc++-v3/include/experimental/bits/simd_neon.h +++ b/libstdc++-v3/include/experimental/bits/simd_neon.h @@ -61,7 +61,7 @@ template <typename _Abi, typename> _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, const _Up* __mem) noexcept { - __execute_n_times<_Np>([&](auto __i) { + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__k[__i] != 0) __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); }); @@ -75,7 +75,7 @@ template <typename _Abi, typename> _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) { - __execute_n_times<_Np>([&](auto __i) { + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if (__k[__i] != 0) __mem[__i] = __v[__i]; }); @@ -286,7 +286,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>( __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0); }); @@ -306,7 +306,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -322,7 +322,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -346,7 +346,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 1 << __i : 0); }); __asint &= __bitsel; @@ -361,7 +361,7 @@ struct _MaskImplNeonMixin { constexpr auto __bitsel = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_I>(__i < _Np ? 
1 << __i : 0); }); __asint &= __bitsel; diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h index 3570246..c8dde61 100644 --- a/libstdc++-v3/include/experimental/bits/simd_x86.h +++ b/libstdc++-v3/include/experimental/bits/simd_x86.h @@ -537,16 +537,17 @@ struct _CommonImplX86 : _CommonImplBuiltin _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) { if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL - _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr { - if constexpr (_Np <= 16) - return _mm_movm_epi8(__x._M_to_bits()); - else if constexpr (_Np <= 32) - return _mm256_movm_epi8(__x._M_to_bits()); - else if constexpr (_Np <= 64) - return _mm512_movm_epi8(__x._M_to_bits()); - else - __assert_unreachable<_SizeConstant<_Np>>(); - }()), + _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( + [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (_Np <= 16) + return _mm_movm_epi8(__x._M_to_bits()); + else if constexpr (_Np <= 32) + return _mm256_movm_epi8(__x._M_to_bits()); + else if constexpr (_Np <= 64) + return _mm512_movm_epi8(__x._M_to_bits()); + else + __assert_unreachable<_SizeConstant<_Np>>(); + }()), __mem); else if constexpr (__have_bmi2) { @@ -554,7 +555,7 @@ struct _CommonImplX86 : _CommonImplBuiltin _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); else __execute_n_times<__div_roundup(_Np, sizeof(size_t))>( - [&](auto __i) { + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { constexpr size_t __offset = __i * sizeof(size_t); constexpr int __todo = std::min(sizeof(size_t), _Np - __offset); if constexpr (__todo == 1) @@ -575,7 +576,7 @@ struct _CommonImplX86 : _CommonImplBuiltin }); } else if constexpr (__have_sse2 && _Np > 7) - __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) { + __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { constexpr int __offset = __i * 16; constexpr int __todo = std::min(16, int(_Np) - __offset); const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); @@ -765,9 +766,10 @@ struct _CommonImplX86 : _CommonImplBuiltin static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); if (__k._M_is_constprop() && __at0._M_is_constprop() && __at1._M_is_constprop()) - return __generate_from_n_evaluations<_Np, - __vector_type_t<_Tp, _Np>>([&]( - auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; }); + return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __k[__i] ? __at1[__i] : __at0[__i]; + }); else if constexpr (sizeof(__at0) == 64 || (__have_avx512vl && sizeof(__at0) >= 16)) return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); @@ -994,9 +996,8 @@ template <typename _Abi, typename> } else _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), - [&](auto __i) { - __merge._M_set(__i, static_cast<_Tp>( - __mem[__i])); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); }); } /* Very uncertain, that the following improves anything. Needs @@ -1417,11 +1418,12 @@ template <typename _Abi, typename> const auto __yf = __convert_all<_FloatV, __n_floatv>( _Abi::__make_padding_nonzero(__as_vector(__y))); return __call_with_n_evaluations<__n_floatv>( - [](auto... __quotients) { + [](auto... 
__quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __vector_convert<_R>(__quotients...); }, - [&__xf, - &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> { + [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + -> _SimdWrapper<_Float, __n_intermediate> + { #if __RECIPROCAL_MATH__ // If -freciprocal-math is active, using the `/` operator is // incorrect because it may be translated to an imprecise @@ -1980,7 +1982,7 @@ template <typename _Abi, typename> { auto __mask = __vector_bitcast<_UChar>( __vector_bitcast<_UShort>(__iy) << 5); - auto __maskl = [&]() { + auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); }; auto __xh = __vector_bitcast<short>(__ix); @@ -2067,19 +2069,20 @@ template <typename _Abi, typename> } //}}} else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ { - [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) { - if constexpr (sizeof(__a) == 16) - return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), - 0xaa); - else if constexpr (sizeof(__a) == 32) - return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), - 0xaa); - else if constexpr (sizeof(__a) == 64) - return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), - __to_intrin(__b)); - else - __assert_unreachable<decltype(__a)>(); - }; + [[maybe_unused]] auto __blend_0xaa + = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if constexpr (sizeof(__a) == 16) + return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), + 0xaa); + else if constexpr (sizeof(__a) == 32) + return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), + 0xaa); + else if constexpr (sizeof(__a) == 64) + return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), + __to_intrin(__b)); + else + __assert_unreachable<decltype(__a)>(); + }; if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) return __intrin_bitcast<_V>(is_signed_v<_Up> ? _mm_srav_epi16(__ix, __iy) @@ -2136,9 +2139,10 @@ template <typename _Abi, typename> { auto __k = __vector_bitcast<_UShort>(__iy) << 11; auto __x128 = __vector_bitcast<_Up>(__ix); - auto __mask = [](__vector_type16_t<_UShort> __kk) { - return __vector_bitcast<short>(__kk) < 0; - }; + auto __mask + = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __vector_bitcast<short>(__kk) < 0; + }; // do __x128 = 0 where __y[4] is set __x128 = __mask(__k) ? decltype(__x128)() : __x128; // do __x128 =>> 8 where __y[3] is set @@ -2178,7 +2182,7 @@ template <typename _Abi, typename> } else { - auto __shift = [](auto __a, auto __b) { + auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (is_signed_v<_Up>) return _mm_sra_epi32(__a, __b); else @@ -3495,7 +3499,7 @@ struct _MaskImplX86Mixin return _S_to_maskvector<_Up, _ToN>(__k); else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( - [&](auto __i) -> _Up { return -__x[__i.value]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); else if constexpr (sizeof(_Up) == 1) { if constexpr (sizeof(_UI) == 16) @@ -3740,9 +3744,9 @@ struct _MaskImplX86Mixin else if constexpr (__bits_per_element >= _ToN) { constexpr auto __bitmask - = __generate_vector<_V>([](auto __i) constexpr->_UpUInt { - return __i < _ToN ? 1ull << __i : 0; - }); + = __generate_vector<_V>([](auto __i) + constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt + { return __i < _ToN ? 
1ull << __i : 0; }); const auto __bits = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; if constexpr (__bits_per_element > _ToN) @@ -3753,11 +3757,11 @@ struct _MaskImplX86Mixin else { const _V __tmp - = __generate_vector<_V>([&](auto __i) constexpr { + = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_UpUInt>( __k >> (__bits_per_element * (__i / __bits_per_element))); }) - & __generate_vector<_V>([](auto __i) constexpr { + & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return static_cast<_UpUInt>(1ull << (__i % __bits_per_element)); }); // mask bit index @@ -3793,7 +3797,7 @@ struct _MaskImplX86Mixin const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); return __generate_from_n_evaluations<std::min(_ToN, _Np), __vector_type_t<_Up, _ToN>>( - [&](auto __i) -> _Up { return __y[__i.value]; }); + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); } using _To = __vector_type_t<_Up, _ToN>; [[maybe_unused]] constexpr size_t _FromN = _Np; @@ -4128,8 +4132,11 @@ struct _MaskImplX86Mixin { const auto __bools = -__x._M_data; const _ULLong __k = __call_with_n_evaluations<_Np>( - [](auto... __bits) { return (__bits | ...); }, - [&](auto __i) { return _ULLong(__bools[+__i]) << __i; }); + [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return (__bits | ...); + }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _ULLong(__bools[+__i]) << __i; + }); if (__builtin_is_constant_evaluated() || __builtin_constant_p(__k)) return __k; @@ -4285,13 +4292,14 @@ template <typename _Abi, typename> static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); if constexpr (__have_avx512bw) { - const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) { - if constexpr (__is_avx512_abi<_Abi>()) - return __bits; - else - return _S_to_maskvector<_Tp>( - _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); - }; + const auto __to_vec_or_bits + = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { + if constexpr (__is_avx512_abi<_Abi>()) + return __bits; + else + return _S_to_maskvector<_Tp>( + _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); + }; if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) { @@ -4478,7 +4486,7 @@ template <typename _Abi, typename> } else { - _BitOps::_S_bit_iteration(__mask, [&](auto __i) { + _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __merge._M_set(__i, __mem[__i]); }); return __merge; @@ -4557,7 +4565,7 @@ template <typename _Abi, typename> { if constexpr (__have_avx512bw_vl) _CommonImplX86::_S_store<_Np>( - __vector_bitcast<char>([](auto __data) { + __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { if constexpr (_Np <= 16) return _mm_maskz_set1_epi8(__data, 1); else if constexpr (_Np <= 32) |
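
Note on the pattern: every hunk in this patch places _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA between a lambda's declarator (after any constexpr specifier, before a trailing return type or the body) and relies on GCC accepting a GNU attribute in that position. Below is a minimal, self-contained sketch of the technique, not taken from the libstdc++ sources: it assumes the macro expands to GCC's always_inline attribute, as its name suggests, and the names ALWAYS_INLINE_LAMBDA and execute_n_times are hypothetical stand-ins that only mirror the internal __execute_n_times helper in spirit.

  // Sketch only. ALWAYS_INLINE_LAMBDA and execute_n_times are hypothetical
  // stand-ins; the assumed macro expansion is the GNU always_inline attribute.
  #include <array>
  #include <cstddef>
  #include <type_traits>
  #include <utility>

  #define ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))

  // Invokes __f(integral_constant<size_t, 0>{}), ...,
  // __f(integral_constant<size_t, _Np - 1>{}) via a fold expression.
  template <typename _Fp, std::size_t... _Is>
    constexpr void
    __execute_impl(_Fp&& __f, std::index_sequence<_Is...>)
    { (__f(std::integral_constant<std::size_t, _Is>()), ...); }

  template <std::size_t _Np, typename _Fp>
    constexpr void
    execute_n_times(_Fp&& __f)
    { __execute_impl(static_cast<_Fp&&>(__f), std::make_index_sequence<_Np>()); }

  int
  main()
  {
    std::array<int, 4> __a = {1, 2, 3, 4};
    const std::array<int, 4> __b = {5, 6, 7, 8};
    // The lambda is pure glue for expanding a pack; the attribute after the
    // parameter list applies to the call operator, so the call should be
    // flattened into main() even in unoptimized builds.
    execute_n_times<4>([&](auto __i) ALWAYS_INLINE_LAMBDA {
      __a[__i] += __b[__i];
    });
    return __a[0]; // 6
  }

Forcing inlining this way matters mostly for -O0 and debug builds, where small helper lambdas like these would otherwise be emitted as real out-of-line calls; with optimization enabled the compiler would usually inline them anyway.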