author     Matthias Kretz <m.kretz@gsi.de>    2023-01-14 17:07:59 +0100
committer  Matthias Kretz <m.kretz@gsi.de>    2023-05-23 10:11:39 +0200
commit     d3217028725f06f4eb67a4c80b12e1a3219d3502 (patch)
tree       a69ed894197e3021cd1dd672fba71c8e9051f540 /libstdc++-v3
parent     a460f10796ed21900c9f85e702de6bde5134f3c7 (diff)
libstdc++: Annotate most lambdas with always_inline
All of the annotated lambdas are simply a necessary means for implementing
these functions and should never result in an actual function call. Many of
these lambdas would go away if C++ had better language support for packs.

Signed-off-by: Matthias Kretz <m.kretz@gsi.de>

libstdc++-v3/ChangeLog:

	PR libstdc++/108030
	* include/experimental/bits/simd_detail.h: Define
	_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA.
	* include/experimental/bits/simd.h: Annotate lambdas with
	_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA.
	* include/experimental/bits/simd_builtin.h: Ditto.
	* include/experimental/bits/simd_converter.h: Ditto.
	* include/experimental/bits/simd_fixed_size.h: Ditto.
	* include/experimental/bits/simd_math.h: Ditto.
	* include/experimental/bits/simd_neon.h: Ditto.
	* include/experimental/bits/simd_x86.h: Ditto.

(cherry picked from commit 53b55701aed6896f456cdec7997ac6bbef1d6074)
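For illustration only (not part of the patch): a minimal, self-contained sketch of
the annotation pattern the patch applies throughout. ALWAYS_INLINE_LAMBDA,
execute_n_times and execute_impl below are simplified stand-ins for
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA and __execute_n_times, not the library's
actual definitions; as the simd_detail.h hunk further down shows, the real macro
expands to __attribute__((__always_inline__)) placed after the lambda's parameter
list, so the closure's call operator is inlined even without optimization.

// Illustration only -- simplified stand-ins, not the library's definitions.
#include <cstddef>
#include <type_traits>
#include <utility>

#define ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))

// Invokes f(integral_constant<size_t, 0>{}), ..., f(integral_constant<size_t, N-1>{}),
// mirroring how __execute_n_times feeds a compile-time index to the lambda.
template <std::size_t... Is, typename Fp>
constexpr void
execute_impl(std::index_sequence<Is...>, Fp&& f)
{ (f(std::integral_constant<std::size_t, Is>()), ...); }

template <std::size_t N, typename Fp>
constexpr void
execute_n_times(Fp&& f)
{ execute_impl(std::make_index_sequence<N>(), static_cast<Fp&&>(f)); }

int
main()
{
  int acc[4] = {1, 2, 4, 8};
  int rhs[4] = {16, 32, 64, 128};
  // The lambda is pure glue around the element index; the attribute after the
  // parameter list forces its call operator inline even at -O0, so no
  // out-of-line closure call survives in the generated code.
  execute_n_times<4>([&](auto i) ALWAYS_INLINE_LAMBDA { acc[i] |= rhs[i]; });
  return acc[0]; // 17
}

In the patch itself the attribute simply follows constexpr where the lambda already
had it, e.g. [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { ... }.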
Diffstat (limited to 'libstdc++-v3')
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd.h            | 234
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_builtin.h    | 351
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_converter.h  |  22
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_detail.h     |   3
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_fixed_size.h | 265
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_math.h       |  52
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_neon.h       |  14
-rw-r--r--  libstdc++-v3/include/experimental/bits/simd_x86.h        | 122
8 files changed, 572 insertions(+), 491 deletions(-)
diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 7c5a32f..8fcd45c 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -609,28 +609,34 @@ template <size_t _Bytes>
operator&(_Ip __rhs) const
{
return __generate_from_n_evaluations<_Np, _Ip>(
- [&](auto __i) { return __rhs._M_data[__i] & _M_data[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __rhs._M_data[__i] & _M_data[__i];
+ });
}
_GLIBCXX_SIMD_INTRINSIC constexpr _Ip
operator|(_Ip __rhs) const
{
return __generate_from_n_evaluations<_Np, _Ip>(
- [&](auto __i) { return __rhs._M_data[__i] | _M_data[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __rhs._M_data[__i] | _M_data[__i];
+ });
}
_GLIBCXX_SIMD_INTRINSIC constexpr _Ip
operator^(_Ip __rhs) const
{
return __generate_from_n_evaluations<_Np, _Ip>(
- [&](auto __i) { return __rhs._M_data[__i] ^ _M_data[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __rhs._M_data[__i] ^ _M_data[__i];
+ });
}
_GLIBCXX_SIMD_INTRINSIC constexpr _Ip
operator~() const
{
return __generate_from_n_evaluations<_Np, _Ip>(
- [&](auto __i) { return ~_M_data[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return ~_M_data[__i]; });
}
};
return _Ip{};
@@ -1391,7 +1397,7 @@ template <size_t _Np, bool _Sanitized>
operator^=(const _BitMask& __b) & noexcept
{
__execute_n_times<_S_array_size>(
- [&](auto __i) { _M_bits[__i] ^= __b._M_bits[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] ^= __b._M_bits[__i]; });
return *this;
}
@@ -1399,7 +1405,7 @@ template <size_t _Np, bool _Sanitized>
operator|=(const _BitMask& __b) & noexcept
{
__execute_n_times<_S_array_size>(
- [&](auto __i) { _M_bits[__i] |= __b._M_bits[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] |= __b._M_bits[__i]; });
return *this;
}
@@ -1407,7 +1413,7 @@ template <size_t _Np, bool _Sanitized>
operator&=(const _BitMask& __b) & noexcept
{
__execute_n_times<_S_array_size>(
- [&](auto __i) { _M_bits[__i] &= __b._M_bits[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] &= __b._M_bits[__i]; });
return *this;
}
@@ -2206,7 +2212,7 @@ template <int _Offset,
#endif
constexpr int _O = _Offset * __return_width;
return __call_with_subscripts<__return_width, _O>(
- __x, [](auto... __entries) {
+ __x, [](auto... __entries) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return reinterpret_cast<_R>(_Up{__entries...});
});
}
@@ -2608,7 +2614,7 @@ template <typename _Tp, size_t _Width>
_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(initializer_list<_Tp> __init)
: _Base(__generate_from_n_evaluations<_Width, _BuiltinType>(
- [&](auto __i) { return __init.begin()[__i.value]; })) {}
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __init.begin()[__i.value]; })) {}
_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper() = default;
_GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(const _SimdWrapper&)
@@ -2633,10 +2639,9 @@ template <typename _Tp, size_t _Width>
_GLIBCXX_SIMD_INTRINSIC constexpr
operator _SimdTuple<_Tp, _As...>() const
{
- const auto& dd = _M_data; // workaround for GCC7 ICE
- return __generate_from_n_evaluations<sizeof...(_As),
- _SimdTuple<_Tp, _As...>>([&](
- auto __i) constexpr { return dd[int(__i)]; });
+ return __generate_from_n_evaluations<sizeof...(_As), _SimdTuple<_Tp, _As...>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return _M_data[int(__i)]; });
}
_GLIBCXX_SIMD_INTRINSIC constexpr operator const _BuiltinType&() const
@@ -3193,21 +3198,19 @@ template <typename _Tp, int _Np>
{ return __x; }
template <typename _Tp, typename _Ap>
- _GLIBCXX_SIMD_INTRINSIC auto
+ _GLIBCXX_SIMD_INTRINSIC fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>
to_fixed_size(const simd<_Tp, _Ap>& __x)
{
- return simd<_Tp, simd_abi::fixed_size<simd_size_v<_Tp, _Ap>>>([&__x](
- auto __i) constexpr { return __x[__i]; });
+ using _Rp = fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>;
+ return _Rp([&__x](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
}
template <typename _Tp, typename _Ap>
- _GLIBCXX_SIMD_INTRINSIC auto
+ _GLIBCXX_SIMD_INTRINSIC fixed_size_simd_mask<_Tp, simd_size_v<_Tp, _Ap>>
to_fixed_size(const simd_mask<_Tp, _Ap>& __x)
{
- constexpr int _Np = simd_mask<_Tp, _Ap>::size();
- fixed_size_simd_mask<_Tp, _Np> __r;
- __execute_n_times<_Np>([&](auto __i) constexpr { __r[__i] = __x[__i]; });
- return __r;
+ return {__private_init,
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }};
}
// to_native {{{2
@@ -3226,7 +3229,9 @@ template <typename _Tp, size_t _Np>
enable_if_t<(_Np == native_simd_mask<_Tp>::size()), native_simd_mask<_Tp>>
to_native(const fixed_size_simd_mask<_Tp, _Np>& __x)
{
- return native_simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; });
+ return native_simd_mask<_Tp>(
+ __private_init,
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
}
// to_compatible {{{2
@@ -3243,7 +3248,10 @@ template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC
enable_if_t<(_Np == simd_mask<_Tp>::size()), simd_mask<_Tp>>
to_compatible(const simd_mask<_Tp, simd_abi::fixed_size<_Np>>& __x)
- { return simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; }); }
+ {
+ return simd_mask<_Tp>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
+ }
// masked assignment [simd_mask.where] {{{1
@@ -3401,9 +3409,9 @@ template <typename _M, typename _Tp>
_Impl::template _S_masked_cassign( \
__data(_M_k), __data(_M_value), \
__to_value_type_or_member_type<_Tp>(static_cast<_Up&&>(__x)), \
- [](auto __impl, auto __lhs, auto __rhs) constexpr { \
- return __impl.__name(__lhs, __rhs); \
- }); \
+ [](auto __impl, auto __lhs, auto __rhs) \
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { return __impl.__name(__lhs, __rhs); }); \
} \
static_assert(true)
_GLIBCXX_SIMD_OP_(+, _S_plus);
@@ -3900,12 +3908,11 @@ template <typename _V, typename _Ap,
}
else if (__x._M_is_constprop())
{
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- return _V([&](auto __j) constexpr {
- return __x[__i * _V::size() + __j];
- });
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return __x[__i * _V::size() + __j]; });
+ });
}
else if constexpr (
__is_fixed_size_abi_v<_Ap>
@@ -3918,41 +3925,40 @@ template <typename _V, typename _Ap,
#ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
const __may_alias<_Tp>* const __element_ptr
= reinterpret_cast<const __may_alias<_Tp>*>(&__data(__x));
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- return _V(__element_ptr + __i * _V::size(), vector_aligned);
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return _V(__element_ptr + __i * _V::size(), vector_aligned); });
#else
const auto& __xx = __data(__x);
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- [[maybe_unused]] constexpr size_t __offset
- = decltype(__i)::value * _V::size();
- return _V([&](auto __j) constexpr {
- constexpr _SizeConstant<__j + __offset> __k;
- return __xx[__k];
- });
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ [[maybe_unused]] constexpr size_t __offset
+ = decltype(__i)::value * _V::size();
+ return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr _SizeConstant<__j + __offset> __k;
+ return __xx[__k];
+ });
+ });
#endif
}
else if constexpr (is_same_v<typename _V::abi_type, simd_abi::scalar>)
{
// normally memcpy should work here as well
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr { return __x[__i]; });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
}
else
{
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
- return _V([&](auto __j) constexpr {
- return __x[__i * _V::size() + __j];
- });
- else
- return _V(__private_init,
- __extract_part<decltype(__i)::value, _Parts>(__data(__x)));
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
+ return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __x[__i * _V::size() + __j];
+ });
+ else
+ return _V(__private_init,
+ __extract_part<decltype(__i)::value, _Parts>(__data(__x)));
+ });
}
}
@@ -3976,22 +3982,22 @@ template <typename _V, typename _Ap,
else if constexpr (_V::size() <= __CHAR_BIT__ * sizeof(_ULLong))
{
const bitset __bits = __x.__to_bitset();
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- constexpr size_t __offset = __i * _V::size();
- return _V(__bitset_init, (__bits >> __offset).to_ullong());
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr size_t __offset = __i * _V::size();
+ return _V(__bitset_init, (__bits >> __offset).to_ullong());
+ });
}
else
{
- return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
- auto __i) constexpr {
- constexpr size_t __offset = __i * _V::size();
- return _V(
- __private_init, [&](auto __j) constexpr {
- return __x[__j + __offset];
- });
- });
+ return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr size_t __offset = __i * _V::size();
+ return _V(__private_init,
+ [&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __x[__j + __offset];
+ });
+ });
}
}
@@ -4009,12 +4015,14 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
using _V = __deduced_simd<_Tp, _N0>;
if (__x._M_is_constprop())
- return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
- auto __i) constexpr {
- using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
- constexpr size_t __offset = _SL::_S_before(__i);
- return _Vi([&](auto __j) constexpr { return __x[__offset + __j]; });
- });
+ return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+ constexpr size_t __offset = _SL::_S_before(__i);
+ return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __x[__offset + __j];
+ });
+ });
else if constexpr (_Np == _N0)
{
static_assert(sizeof...(_Sizes) == 1);
@@ -4081,28 +4089,28 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
#ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
const __may_alias<_Tp>* const __element_ptr
= reinterpret_cast<const __may_alias<_Tp>*>(&__x);
- return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
- auto __i) constexpr {
- using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
- constexpr size_t __offset = _SL::_S_before(__i);
- constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
- constexpr size_t __a
- = __base_align - ((__offset * sizeof(_Tp)) % __base_align);
- constexpr size_t __b = ((__a - 1) & __a) ^ __a;
- constexpr size_t __alignment = __b == 0 ? __a : __b;
- return _Vi(__element_ptr + __offset, overaligned<__alignment>);
- });
+ return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+ constexpr size_t __offset = _SL::_S_before(__i);
+ constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
+ constexpr size_t __a
+ = __base_align - ((__offset * sizeof(_Tp)) % __base_align);
+ constexpr size_t __b = ((__a - 1) & __a) ^ __a;
+ constexpr size_t __alignment = __b == 0 ? __a : __b;
+ return _Vi(__element_ptr + __offset, overaligned<__alignment>);
+ });
#else
- return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
- auto __i) constexpr {
- using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
- const auto& __xx = __data(__x);
- using _Offset = decltype(_SL::_S_before(__i));
- return _Vi([&](auto __j) constexpr {
- constexpr _SizeConstant<_Offset::value + __j> __k;
- return __xx[__k];
- });
- });
+ return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+ const auto& __xx = __data(__x);
+ using _Offset = decltype(_SL::_S_before(__i));
+ return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr _SizeConstant<_Offset::value + __j> __k;
+ return __xx[__k];
+ });
+ });
#endif
}
@@ -4144,8 +4152,9 @@ template <typename _Tp, typename... _As, typename = __detail::__odr_helper>
return simd_cast<_Rp>(__xs...);
else if ((... && __xs._M_is_constprop()))
return simd<_Tp,
- simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>([&](
- auto __i) constexpr { return __subscript_in_pack<__i>(__xs...); });
+ simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return __subscript_in_pack<__i>(__xs...); });
else
{
_Rp __r{};
@@ -4161,9 +4170,10 @@ template <typename _Tp, typename _Abi, size_t _Np>
_GLIBCXX_SIMD_CONSTEXPR __deduced_simd<_Tp, simd_size_v<_Tp, _Abi> * _Np>
concat(const array<simd<_Tp, _Abi>, _Np>& __x)
{
- return __call_with_subscripts<_Np>(__x, [](const auto&... __xs) {
- return concat(__xs...);
- });
+ return __call_with_subscripts<_Np>(
+ __x, [](const auto&... __xs) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return concat(__xs...);
+ });
}
// }}}
@@ -4696,7 +4706,7 @@ template <typename _Tp, typename _Abi>
simd_mask(_PrivateInit, _Fp&& __gen)
: _M_data()
{
- __execute_n_times<size()>([&](auto __i) constexpr {
+ __execute_n_times<size()>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
_Impl::_S_set(_M_data, __i, __gen(__i));
});
}
@@ -4882,7 +4892,9 @@ template <typename _Tp, typename _Abi>
if (__builtin_is_constant_evaluated() || __k._M_is_constprop())
{
const int __r = __call_with_subscripts<simd_size_v<_Tp, _Abi>>(
- __k, [](auto... __elements) { return ((__elements != 0) + ...); });
+ __k, [](auto... __elements) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return ((__elements != 0) + ...);
+ });
if (__builtin_is_constant_evaluated() || __builtin_constant_p(__r))
return __r;
}
@@ -4897,8 +4909,11 @@ template <typename _Tp, typename _Abi>
{
constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
const size_t _Idx = __call_with_n_evaluations<_Np>(
- [](auto... __indexes) { return std::min({__indexes...}); },
- [&](auto __i) { return __k[__i] ? +__i : _Np; });
+ [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::min({__indexes...});
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __k[__i] ? +__i : _Np;
+ });
if (_Idx >= _Np)
__invoke_ub("find_first_set(empty mask) is UB");
if (__builtin_constant_p(_Idx))
@@ -4915,8 +4930,11 @@ template <typename _Tp, typename _Abi>
{
constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
const int _Idx = __call_with_n_evaluations<_Np>(
- [](auto... __indexes) { return std::max({__indexes...}); },
- [&](auto __i) { return __k[__i] ? int(__i) : -1; });
+ [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::max({__indexes...});
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __k[__i] ? int(__i) : -1;
+ });
if (_Idx < 0)
__invoke_ub("find_first_set(empty mask) is UB");
if (__builtin_constant_p(_Idx))
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index 5901dee..119e0de 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -194,8 +194,11 @@ template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
using _Up = decltype(__w);
return __intrin_bitcast<_Tp>(
__call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
- [](auto... __chunks) { return _Up{__chunks...}; },
- [&](auto __i) { return __w[__shift / __chunksize + __i]; }));
+ [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return _Up{__chunks...};
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __w[__shift / __chunksize + __i];
+ }));
}
}
@@ -225,7 +228,9 @@ template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
// by _Total");
if (__x._M_is_constprop())
return __generate_from_n_evaluations<__return_size, _R>(
- [&](auto __i) { return __x[__values_to_skip + __i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __x[__values_to_skip + __i];
+ });
if constexpr (_Index == 0 && _Total == 1)
return __x;
else if constexpr (_Index == 0)
@@ -570,7 +575,9 @@ template <typename _To,
constexpr auto _Np
= _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
- [&](auto __i) { return static_cast<_To>(__v[__i + _Offset]); });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return static_cast<_To>(__v[__i + _Offset]);
+ });
}
else
{
@@ -611,13 +618,14 @@ template <typename _To,
return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
};
[[maybe_unused]] const auto __vi = __to_intrin(__v);
- auto&& __make_array = [](auto __x0, [[maybe_unused]] auto __x1) {
- if constexpr (_Np == 1)
- return _R{__intrin_bitcast<_To>(__x0)};
- else
- return _R{__intrin_bitcast<_To>(__x0),
- __intrin_bitcast<_To>(__x1)};
- };
+ auto&& __make_array
+ = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (_Np == 1)
+ return _R{__intrin_bitcast<_To>(__x0)};
+ else
+ return _R{__intrin_bitcast<_To>(__x0),
+ __intrin_bitcast<_To>(__x1)};
+ };
if constexpr (_Np == 0)
return _R{};
@@ -642,7 +650,7 @@ template <typename _To,
= __convert_all<__vector_type16_t<int>, _Np>(
__adjust(_SizeConstant<_Np * 4>(), __v));
return __generate_from_n_evaluations<_Np, _R>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __vector_convert<_To>(__as_wrapper(__ints[__i]));
});
}
@@ -687,36 +695,40 @@ template <typename _To,
__vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
__vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
if constexpr (sizeof(_ToT) == 4)
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- return __vector_convert<_To>(
- _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __vector_convert<_To>(
+ _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
+ });
else if constexpr (is_integral_v<_ToT>)
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
- const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
- return __vector_bitcast<_ToT>(
- __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
- : _mm_unpackhi_epi32(__sx32, __signbits));
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
+ const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
+ return __vector_bitcast<_ToT>(
+ __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
+ : _mm_unpackhi_epi32(__sx32, __signbits));
+ });
else
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
- return __vector_convert<_To>(
- __i % 2 == 0 ? __int4
- : _SimdWrapper<int, 4>(
- _mm_unpackhi_epi64(__to_intrin(__int4),
- __to_intrin(__int4))));
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
+ return __vector_convert<_To>(
+ __i % 2 == 0 ? __int4
+ : _SimdWrapper<int, 4>(
+ _mm_unpackhi_epi64(__to_intrin(__int4),
+ __to_intrin(__int4))));
+ });
}
else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
{
const auto __shorts = __convert_all<__vector_type16_t<
conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
__adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
+ });
}
else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
&& is_signed_v<_FromT> && is_integral_v<_ToT>)
@@ -736,9 +748,10 @@ template <typename _To,
__vector_bitcast<int>(
_mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
_mm_srai_epi32(__vv[1], 31)))};
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- return __vector_bitcast<_ToT>(__vvvv[__i]);
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __vector_bitcast<_ToT>(__vvvv[__i]);
+ });
}
else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
{
@@ -747,9 +760,10 @@ template <typename _To,
is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
unsigned int>>>(
__adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
- return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
- return __convert_all<_To>(__ints[__i / 2])[__i % 2];
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __convert_all<_To>(__ints[__i / 2])[__i % 2];
+ });
}
else
__assert_unreachable<_To>();
@@ -779,14 +793,14 @@ template <typename _To,
__extract_part<_Offset, _FromVT::_S_partial_width,
_ToVT::_S_full_size>(__v))};
else
- return __generate_from_n_evaluations<_Np, _R>([&](
- auto __i) constexpr {
- auto __part
- = __extract_part<__i * _ToVT::_S_full_size + _Offset,
- _FromVT::_S_partial_width,
- _ToVT::_S_full_size>(__v);
- return __vector_convert<_To>(__part);
- });
+ return __generate_from_n_evaluations<_Np, _R>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ auto __part
+ = __extract_part<__i * _ToVT::_S_full_size + _Offset,
+ _FromVT::_S_partial_width,
+ _ToVT::_S_full_size>(__v);
+ return __vector_convert<_To>(__part);
+ });
}
else if constexpr (_Offset == 0)
return array<_To, 1>{__vector_convert<_To>(__v)};
@@ -1017,8 +1031,9 @@ template <int _UsedBytes>
else
{
constexpr auto __size = _S_size<_Tp>;
- _GLIBCXX_SIMD_USE_CONSTEXPR auto __r = __generate_vector<_UV>(
- [](auto __i) constexpr { return __i < __size ? -1 : 0; });
+ _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
+ = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return __i < __size ? -1 : 0; });
return __r;
}
}
@@ -1208,7 +1223,7 @@ template <int _UsedBytes>
if constexpr (is_integral_v<typename _TVT::value_type>)
return __x
| __generate_vector<_Tp, _S_full_size<_Tp>>(
- [](auto __i) -> _Tp {
+ [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
if (__i < _Np)
return 0;
else
@@ -1348,26 +1363,27 @@ struct _CommonImplBuiltin
}
else
{
- __execute_n_times<__div_roundup(_Np, 4)>([&](auto __i) {
- constexpr int __offset = __i * 4;
- constexpr int __remaining = _Np - __offset;
- if constexpr (__remaining > 4 && __remaining <= 7)
- {
- const _ULLong __bool7
- = (__x.template _M_extract<__offset>()._M_to_bits()
- * 0x40810204081ULL)
- & 0x0101010101010101ULL;
- _S_store<__remaining>(__bool7, __mem + __offset);
- }
- else if constexpr (__remaining >= 4)
- {
- int __bits = __x.template _M_extract<__offset>()._M_to_bits();
- if constexpr (__remaining > 7)
- __bits &= 0xf;
- const int __bool4 = (__bits * 0x204081) & 0x01010101;
- _S_store<4>(__bool4, __mem + __offset);
- }
- });
+ __execute_n_times<__div_roundup(_Np, 4)>(
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr int __offset = __i * 4;
+ constexpr int __remaining = _Np - __offset;
+ if constexpr (__remaining > 4 && __remaining <= 7)
+ {
+ const _ULLong __bool7
+ = (__x.template _M_extract<__offset>()._M_to_bits()
+ * 0x40810204081ULL)
+ & 0x0101010101010101ULL;
+ _S_store<__remaining>(__bool7, __mem + __offset);
+ }
+ else if constexpr (__remaining >= 4)
+ {
+ int __bits = __x.template _M_extract<__offset>()._M_to_bits();
+ if constexpr (__remaining > 7)
+ __bits &= 0xf;
+ const int __bool4 = (__bits * 0x204081) & 0x01010101;
+ _S_store<4>(__bool4, __mem + __offset);
+ }
+ });
}
}
@@ -1434,13 +1450,13 @@ template <typename _Abi, typename>
inline static constexpr _SimdMember<_Tp> _S_generator(_Fp&& __gen,
_TypeTag<_Tp>)
{
- return __generate_vector<_Tp, _S_full_size<_Tp>>([&](
- auto __i) constexpr {
- if constexpr (__i < _S_size<_Tp>)
- return __gen(__i);
- else
- return 0;
- });
+ return __generate_vector<_Tp, _S_full_size<_Tp>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (__i < _S_size<_Tp>)
+ return __gen(__i);
+ else
+ return 0;
+ });
}
// _S_load {{{2
@@ -1455,10 +1471,10 @@ template <typename _Abi, typename>
: 16;
constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
if constexpr (sizeof(_Up) > 8)
- return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
- auto __i) constexpr {
- return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
- });
+ return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
+ });
else if constexpr (is_same_v<_Up, _Tp>)
return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
_Np * sizeof(_Tp)>(__mem);
@@ -1470,13 +1486,12 @@ template <typename _Abi, typename>
constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
constexpr size_t __elements_per_load = _Np / __n_loads;
return __call_with_n_evaluations<__n_loads>(
- [](auto... __uncvted) {
- return __convert<_SimdMember<_Tp>>(__uncvted...);
- },
- [&](auto __i) {
- return _CommonImpl::template _S_load<_Up, __elements_per_load>(
- __mem + __i * __elements_per_load);
- });
+ [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __convert<_SimdMember<_Tp>>(__uncvted...);
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return _CommonImpl::template _S_load<_Up, __elements_per_load>(
+ __mem + __i * __elements_per_load);
+ });
}
else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
&& __max_load_size > 16)
@@ -1485,20 +1500,19 @@ template <typename _Abi, typename>
= __bytes_to_load / (__max_load_size / 2);
constexpr size_t __elements_per_load = _Np / __n_loads;
return __call_with_n_evaluations<__n_loads>(
- [](auto... __uncvted) {
- return __convert<_SimdMember<_Tp>>(__uncvted...);
- },
- [&](auto __i) {
- return _CommonImpl::template _S_load<_Up, __elements_per_load>(
- __mem + __i * __elements_per_load);
- });
+ [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __convert<_SimdMember<_Tp>>(__uncvted...);
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return _CommonImpl::template _S_load<_Up, __elements_per_load>(
+ __mem + __i * __elements_per_load);
+ });
}
else // e.g. int[] -> <char, 9>
return __call_with_subscripts(
- __mem, make_index_sequence<_Np>(), [](auto... __args) {
- return __vector_type_t<_Tp, _S_full_size<_Tp>>{
- static_cast<_Tp>(__args)...};
- });
+ __mem, make_index_sequence<_Np>(),
+ [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
+ });
}
// _S_masked_load {{{2
@@ -1507,9 +1521,10 @@ template <typename _Abi, typename>
_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
const _Up* __mem) noexcept
{
- _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), [&](auto __i) {
- __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
- });
+ _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
+ });
return __merge;
}
@@ -1523,7 +1538,7 @@ template <typename _Abi, typename>
constexpr size_t __max_store_size
= _SuperImpl::template _S_max_store_size<_Up>;
if constexpr (sizeof(_Up) > 8)
- __execute_n_times<_Np>([&](auto __i) constexpr {
+ __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__mem[__i] = __v[__i];
});
else if constexpr (is_same_v<_Up, _Tp>)
@@ -1540,9 +1555,10 @@ template <typename _Abi, typename>
using _V = __vector_type_t<_Up, __vsize>;
const array<_V, __stores> __converted
= __convert_all<_V, __stores>(__v);
- __execute_n_times<__full_stores>([&](auto __i) constexpr {
- _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
- });
+ __execute_n_times<__full_stores>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
+ });
if constexpr (__full_stores < __stores)
_CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
* sizeof(_Up)>(
@@ -1557,7 +1573,8 @@ template <typename _Abi, typename>
_MaskMember<_Tp> __k)
{
_BitOps::_S_bit_iteration(
- _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
+ _MaskImpl::_S_to_bits(__k),
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__mem[__i] = __v[__i];
});
}
@@ -1579,7 +1596,7 @@ template <typename _Abi, typename>
_Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up)))
{
// bitwise or no conversion, reinterpret:
- const _MaskMember<_Up> __kk = [&]() {
+ const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr (__is_bitmask_v<decltype(__k)>)
return _MaskMember<_Up>(__k._M_data);
else
@@ -1618,7 +1635,7 @@ template <typename _Abi, typename>
constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
const array<_UV, _NAllStores> __converted
= __convert_all<_UV, _NAllStores>(__v);
- __execute_n_times<_NFullStores>([&](auto __i) {
+ __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
_SuperImpl::_S_masked_store_nocvt(
_UW(__converted[__i]), __mem + __i * _UW_size,
_UAbi::_MaskImpl::template _S_convert<
@@ -1637,10 +1654,10 @@ template <typename _Abi, typename>
}
}
else
- _BitOps::_S_bit_iteration(
- _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
- __mem[__i] = static_cast<_Up>(__v[__i]);
- });
+ _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __mem[__i] = static_cast<_Up>(__v[__i]);
+ });
}
// _S_complement {{{2
@@ -1932,7 +1949,9 @@ template <typename _Abi, typename>
static _Tp _S_##__name(const _Tp& __x, const _More&... __more) \
{ \
return __generate_vector<_Tp>( \
- [&](auto __i) { return __name(__x[__i], __more[__i]...); }); \
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+ return __name(__x[__i], __more[__i]...); \
+ }); \
}
#define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
@@ -1941,23 +1960,25 @@ template <typename _Abi, typename>
const _More&... __more) \
{ \
return __generate_vector<_Tp>( \
- [&](auto __i) { return __name(__x[__i], __more[__i]...); }); \
- }
-
-#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
- template <typename _Tp, typename... _More> \
- static auto _S_##__name(const _Tp& __x, const _More&... __more) \
- { \
- return __fixed_size_storage_t<_RetTp, \
- _VectorTraits<_Tp>::_S_partial_width>:: \
- _S_generate([&](auto __meta) constexpr { \
- return __meta._S_generator( \
- [&](auto __i) { \
- return __name(__x[__meta._S_offset + __i], \
- __more[__meta._S_offset + __i]...); \
- }, \
- static_cast<_RetTp*>(nullptr)); \
- }); \
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+ return __name(__x[__i], __more[__i]...); \
+ }); \
+ }
+
+#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
+ template <typename _Tp, typename... _More> \
+ static auto _S_##__name(const _Tp& __x, const _More&... __more) \
+ { \
+ return __fixed_size_storage_t<_RetTp, \
+ _VectorTraits<_Tp>::_S_partial_width>:: \
+ _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+ return __meta._S_generator( \
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+ return __name(__x[__meta._S_offset + __i], \
+ __more[__meta._S_offset + __i]...); \
+ }, \
+ static_cast<_RetTp*>(nullptr)); \
+ }); \
}
_GLIBCXX_SIMD_MATH_FALLBACK(acos)
@@ -2010,7 +2031,7 @@ template <typename _Abi, typename>
_S_remquo(const _Tp __x, const _Tp __y,
__fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
{
- return __generate_vector<_Tp>([&](auto __i) {
+ return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
int __tmp;
auto __r = remquo(__x[__i], __y[__i], &__tmp);
__z->_M_set(__i, __tmp);
@@ -2423,7 +2444,7 @@ template <typename _Abi, typename>
#endif // _GLIBCXX_SIMD_X86INTRIN
else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
- [](auto... __l) {
+ [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __make_wrapper<int>(__l...);
})};
else
@@ -2554,13 +2575,13 @@ struct _MaskImplBuiltinMixin
_S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
{
static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
- return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
- auto __i) constexpr {
- if constexpr (__i < _Np)
- return __x[__i] ? ~_Up() : _Up();
- else
- return _Up();
- });
+ return __generate_vector<__vector_type_t<_Up, _ToN>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (__i < _Np)
+ return __x[__i] ? ~_Up() : _Up();
+ else
+ return _Up();
+ });
}
template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
@@ -2601,13 +2622,13 @@ struct _MaskImplBuiltinMixin
-1, -1, -1, -1, -1>(__y); else
*/
{
- return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
- auto __i) constexpr {
- if constexpr (__i < _Np)
- return _Up(__x[__i.value]);
- else
- return _Up();
- });
+ return __generate_vector<__vector_type_t<_Up, _ToN>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (__i < _Np)
+ return _Up(__x[__i.value]);
+ else
+ return _Up();
+ });
}
}
}
@@ -2625,7 +2646,9 @@ struct _MaskImplBuiltinMixin
= __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
_ULLong __r = 0;
__execute_n_times<_Np>(
- [&](auto __i) { __r |= _ULLong(__bools[__i.value]) << __i; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __r |= _ULLong(__bools[__i.value]) << __i;
+ });
return __r;
}
@@ -2677,9 +2700,10 @@ template <typename _Abi, typename>
return __bools > 0;
}
else
- return __generate_vector<_I, _S_size<_Tp>>([&](auto __i) constexpr {
- return __mem[__i] ? ~_I() : _I();
- });
+ return __generate_vector<_I, _S_size<_Tp>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __mem[__i] ? ~_I() : _I();
+ });
}
// }}}
@@ -2752,7 +2776,7 @@ template <typename _Abi, typename>
// AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
_BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__tmp._M_set(__i, -__mem[__i]);
});
__merge = __wrapper_bitcast<_Tp>(__tmp);
@@ -2764,7 +2788,7 @@ template <typename _Abi, typename>
_GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
bool* __mem) noexcept
{
- __execute_n_times<_Np>([&](auto __i) constexpr {
+ __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__mem[__i] = __v[__i];
});
}
@@ -2775,10 +2799,10 @@ template <typename _Abi, typename>
_S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
const _SimdWrapper<_Tp, _Np> __k) noexcept
{
- _BitOps::_S_bit_iteration(
- _SuperImpl::_S_to_bits(__k), [&](auto __i) constexpr {
- __mem[__i] = __v[__i];
- });
+ _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __mem[__i] = __v[__i];
+ });
}
// _S_from_bitmask{{{2
@@ -2845,7 +2869,7 @@ template <typename _Abi, typename>
{
__k = __generate_from_n_evaluations<_Np,
__vector_type_t<_Tp, _Np>>(
- [&](auto __j) {
+ [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if (__i == static_cast<int>(__j))
return _Tp(-__x);
else
@@ -2890,7 +2914,8 @@ template <typename _Abi, typename>
{
return __call_with_subscripts(
__data(__k), make_index_sequence<_S_size<_Tp>>(),
- [](const auto... __ent) constexpr { return (... && !(__ent == 0)); });
+ [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return (... && !(__ent == 0)); });
}
// }}}
@@ -2901,7 +2926,8 @@ template <typename _Abi, typename>
{
return __call_with_subscripts(
__data(__k), make_index_sequence<_S_size<_Tp>>(),
- [](const auto... __ent) constexpr { return (... || !(__ent == 0)); });
+ [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return (... || !(__ent == 0)); });
}
// }}}
@@ -2912,7 +2938,8 @@ template <typename _Abi, typename>
{
return __call_with_subscripts(
__data(__k), make_index_sequence<_S_size<_Tp>>(),
- [](const auto... __ent) constexpr { return (... && (__ent == 0)); });
+ [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return (... && (__ent == 0)); });
}
// }}}
diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h
index f6e8569..cfca421 100644
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@@ -121,7 +121,7 @@ template <typename _From, typename _To, int _Np>
{
return __call_with_subscripts(
__x, make_index_sequence<_Np>(),
- [](auto... __values) constexpr->_Ret {
+ [](auto... __values) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Ret {
return __make_simd_tuple<_To, decltype((void) __values,
simd_abi::scalar())...>(
static_cast<_To>(__values)...);
@@ -233,7 +233,9 @@ template <typename _From, typename _To, int _Np>
static_assert(_Ret::_FirstAbi::template _S_is_partial<_To>);
return _Ret{__generate_from_n_evaluations<
_Np, typename _VectorTraits<typename _Ret::_FirstType>::type>(
- [&](auto __i) { return static_cast<_To>(__x[__i]); })};
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return static_cast<_To>(__x[__i]);
+ })};
}
else
{
@@ -241,7 +243,7 @@ template <typename _From, typename _To, int _Np>
constexpr auto __n
= __div_roundup(_Ret::_S_first_size, _Arg::_S_first_size);
return __call_with_n_evaluations<__n>(
- [&__x](auto... __uncvted) {
+ [&__x](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
// assuming _Arg Abi tags for all __i are _Arg::_FirstAbi
_SimdConverter<_From, typename _Arg::_FirstAbi, _To,
typename _Ret::_FirstAbi>
@@ -255,8 +257,9 @@ template <typename _From, typename _To, int _Np>
_From, simd_abi::fixed_size<_Np - _Ret::_S_first_size>, _To,
simd_abi::fixed_size<_Np - _Ret::_S_first_size>>()(
__simd_tuple_pop_front<_Ret::_S_first_size>(__x))};
- },
- [&__x](auto __i) { return __get_tuple_at<__i>(__x); });
+ }, [&__x](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __get_tuple_at<__i>(__x);
+ });
}
}
};
@@ -322,13 +325,14 @@ template <typename _From, int _Np, typename _To, typename _Ap>
return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
else if constexpr (_Arg::_S_is_homogeneous)
return __call_with_n_evaluations<_Arg::_S_tuple_size>(
- [](auto... __members) {
+ [](auto... __members) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr ((is_convertible_v<decltype(__members), _To> && ...))
return __vector_type_t<_To, _Np>{static_cast<_To>(__members)...};
else
return __vector_convert<__vector_type_t<_To, _Np>>(__members...);
- },
- [&](auto __i) { return __get_tuple_at<__i>(__x); });
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __get_tuple_at<__i>(__x);
+ });
else if constexpr (__fixed_size_storage_t<_To, _Np>::_S_tuple_size == 1)
{
_SimdConverter<_From, simd_abi::fixed_size<_Np>, _To,
@@ -340,7 +344,7 @@ template <typename _From, int _Np, typename _To, typename _Ap>
{
const _SimdWrapper<_From, _Np> __xv
= __generate_from_n_evaluations<_Np, __vector_type_t<_From, _Np>>(
- [&](auto __i) { return __x[__i]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
return __vector_convert<__vector_type_t<_To, _Np>>(__xv);
}
}
diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h
index de8d018..bd4e380 100644
--- a/libstdc++-v3/include/experimental/bits/simd_detail.h
+++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
@@ -262,6 +262,7 @@
#define _GLIBCXX_SIMD_INTRINSIC \
[[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
#define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))
#define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
#define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
@@ -294,6 +295,8 @@
#ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE
#undef _GLIBCXX_SIMD_ALWAYS_INLINE
#define _GLIBCXX_SIMD_ALWAYS_INLINE inline
+#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#undef _GLIBCXX_SIMD_INTRINSIC
#define _GLIBCXX_SIMD_INTRINSIC inline
#endif
diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
index 7bb248c..eb71a7b 100644
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -434,14 +434,15 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
if constexpr (is_same_v<_SimdTuple, __remove_cvref_t<_Tup>>)
return __tup.first;
else if (__builtin_is_constant_evaluated())
- return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate([&](
- auto __meta) constexpr {
- return __meta._S_generator(
- [&](auto __i) constexpr { return __tup[__i]; },
- static_cast<_TupT*>(nullptr));
+ return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate(
+ [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __meta._S_generator(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __tup[__i];
+ }, static_cast<_TupT*>(nullptr));
});
else
- return [&]() {
+ return [&]() { // not always_inline; allow the compiler to decide
__fixed_size_storage_t<_TupT, _S_first_size> __r;
__builtin_memcpy(__r._M_as_charptr(), __tup._M_as_charptr(),
sizeof(__r));
@@ -515,12 +516,11 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
negation<is_const<remove_reference_t<_More>>>>) )
{
// need to write back at least one of __more after calling __fun
- auto&& __first = [&](auto... __args) constexpr
- {
+ auto&& __first = [&](auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
auto __r = __fun(__tuple_element_meta<_Tp, _Abi0, 0>(), first,
__args...);
[[maybe_unused]] auto&& __ignore_me = {(
- [](auto&& __dst, const auto& __src) {
+ [](auto&& __dst, const auto& __src) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr (is_assignable_v<decltype(__dst),
decltype(__dst)>)
{
@@ -530,8 +530,7 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
}(static_cast<_More&&>(__more), __args),
0)...};
return __r;
- }
- (_M_extract_argument(__more)...);
+ }(_M_extract_argument(__more)...);
if constexpr (_S_tuple_size == 1)
return {__first};
else
@@ -776,18 +775,18 @@ template <typename _Tp, size_t _Np, typename _V, size_t _NV, typename... _VX>
sizeof...(_VX) == 0,
"An array of scalars must be the last argument to __to_simd_tuple");
return __call_with_subscripts(
- __from,
- make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
- return __simd_tuple_concat(
- _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
- });
+ __from, make_index_sequence<_NV>(),
+ [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __simd_tuple_concat(
+ _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
+ });
}
else
return __call_with_subscripts(
- __from,
- make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
- return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
- });
+ __from, make_index_sequence<_NV>(),
+ [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
+ });
}
template <size_t, typename _Tp>
@@ -841,7 +840,7 @@ template <typename _Tp, typename _A0, typename _A1, typename... _Abis,
|| _A0::template _S_is_partial<_Tp>)
return {__generate_from_n_evaluations<_R::_S_first_size,
typename _R::_FirstType>(
- [&](auto __i) { return __x[__i]; }),
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }),
__optimize_simd_tuple(
__simd_tuple_pop_front<_R::_S_first_size>(__x))};
else if constexpr (is_same_v<_A0, _A1>
@@ -994,10 +993,11 @@ template <int _Index, int _Total, int _Combine, typename _Tp, typename _A0,
return __as_vector(simd<_Tp, _RetAbi>(element_ptr, element_aligned));
#else
[[maybe_unused]] constexpr size_t __offset = __values_to_skip;
- return __as_vector(simd<_Tp, _RetAbi>([&](auto __i) constexpr {
- constexpr _SizeConstant<__i + __offset> __k;
- return __x[__k];
- }));
+ return __as_vector(simd<_Tp, _RetAbi>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ constexpr _SizeConstant<__i + __offset> __k;
+ return __x[__k];
+ }));
#endif
}
@@ -1286,9 +1286,10 @@ template <int _Np, typename>
template <typename _Tp>
static constexpr inline _SimdMember<_Tp> _S_broadcast(_Tp __x) noexcept
{
- return _SimdMember<_Tp>::_S_generate([&](auto __meta) constexpr {
- return __meta._S_broadcast(__x);
- });
+ return _SimdMember<_Tp>::_S_generate(
+ [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __meta._S_broadcast(__x);
+ });
}
// _S_generator {{{2
@@ -1296,14 +1297,15 @@ template <int _Np, typename>
static constexpr inline _SimdMember<_Tp> _S_generator(_Fp&& __gen,
_TypeTag<_Tp>)
{
- return _SimdMember<_Tp>::_S_generate([&__gen](auto __meta) constexpr {
- return __meta._S_generator(
- [&](auto __i) constexpr {
- return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
- : 0;
- },
- _TypeTag<_Tp>());
- });
+ return _SimdMember<_Tp>::_S_generate(
+ [&__gen](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __meta._S_generator(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
+ : 0;
+ },
+ _TypeTag<_Tp>());
+ });
}
// _S_load {{{2
@@ -1311,9 +1313,10 @@ template <int _Np, typename>
static inline _SimdMember<_Tp> _S_load(const _Up* __mem,
_TypeTag<_Tp>) noexcept
{
- return _SimdMember<_Tp>::_S_generate([&](auto __meta) {
- return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
- });
+ return _SimdMember<_Tp>::_S_generate(
+ [&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
+ });
}
// _S_masked_load {{{2
@@ -1323,7 +1326,7 @@ template <int _Np, typename>
const _MaskMember __bits, const _Up* __mem) noexcept
{
auto __merge = __old;
- __for_each(__merge, [&](auto __meta, auto& __native) {
+ __for_each(__merge, [&](auto __meta, auto& __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if (__meta._S_submask(__bits).any())
#pragma GCC diagnostic push
// __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1344,7 +1347,7 @@ template <int _Np, typename>
static inline void _S_store(const _SimdMember<_Tp>& __v, _Up* __mem,
_TypeTag<_Tp>) noexcept
{
- __for_each(__v, [&](auto __meta, auto __native) {
+ __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__meta._S_store(__native, &__mem[__meta._S_offset], _TypeTag<_Tp>());
});
}
@@ -1355,7 +1358,7 @@ template <int _Np, typename>
_Up* __mem,
const _MaskMember __bits) noexcept
{
- __for_each(__v, [&](auto __meta, auto __native) {
+ __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if (__meta._S_submask(__bits).any())
#pragma GCC diagnostic push
// __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1376,7 +1379,7 @@ template <int _Np, typename>
{
_MaskMember __bits = 0;
__for_each(
- __x, [&__bits](auto __meta, auto __native) constexpr {
+ __x, [&__bits](auto __meta, auto __native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__bits
|= __meta._S_mask_to_shifted_ullong(__meta._S_negate(__native));
});
@@ -1414,7 +1417,7 @@ template <int _Np, typename>
{
const auto& __x2 = __call_with_n_evaluations<
__div_roundup(_Tup::_S_tuple_size, 2)>(
- [](auto __first_simd, auto... __remaining) {
+ [](auto __first_simd, auto... __remaining) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr (sizeof...(__remaining) == 0)
return __first_simd;
else
@@ -1428,7 +1431,7 @@ template <int _Np, typename>
__make_simd_tuple(__first_simd, __remaining...));
}
},
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
auto __left = __tup.template _M_simd_at<2 * __i>();
if constexpr (2 * __i + 1 == _Tup::_S_tuple_size)
return __left;
@@ -1444,7 +1447,9 @@ template <int _Np, typename>
_GLIBCXX_SIMD_USE_CONSTEXPR_API
typename _LT::mask_type __k(
__private_init,
- [](auto __j) constexpr { return __j < _RT::size(); });
+ [](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __j < _RT::size();
+ });
_LT __ext_right = __left;
where(__k, __ext_right)
= __proposed::resizing_simd_cast<_LT>(__right);
@@ -1464,7 +1469,7 @@ template <int _Np, typename>
const _SimdTuple<_Tp, _As...>& __b)
{
return __a._M_apply_per_chunk(
- [](auto __impl, auto __aa, auto __bb) constexpr {
+ [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __impl._S_min(__aa, __bb);
},
__b);
@@ -1476,7 +1481,7 @@ template <int _Np, typename>
const _SimdTuple<_Tp, _As...>& __b)
{
return __a._M_apply_per_chunk(
- [](auto __impl, auto __aa, auto __bb) constexpr {
+ [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __impl._S_max(__aa, __bb);
},
__b);
@@ -1487,9 +1492,10 @@ template <int _Np, typename>
static inline constexpr _SimdTuple<_Tp, _As...>
_S_complement(const _SimdTuple<_Tp, _As...>& __x) noexcept
{
- return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
- return __impl._S_complement(__xx);
- });
+ return __x._M_apply_per_chunk(
+ [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __impl._S_complement(__xx);
+ });
}
// _S_unary_minus {{{2
@@ -1497,23 +1503,24 @@ template <int _Np, typename>
static inline constexpr _SimdTuple<_Tp, _As...>
_S_unary_minus(const _SimdTuple<_Tp, _As...>& __x) noexcept
{
- return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
- return __impl._S_unary_minus(__xx);
- });
+ return __x._M_apply_per_chunk(
+ [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __impl._S_unary_minus(__xx);
+ });
}
// arithmetic operators {{{2
-#define _GLIBCXX_SIMD_FIXED_OP(name_, op_) \
- template <typename _Tp, typename... _As> \
- static inline constexpr _SimdTuple<_Tp, _As...> name_( \
- const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)\
- { \
- return __x._M_apply_per_chunk( \
- [](auto __impl, auto __xx, auto __yy) constexpr { \
- return __impl.name_(__xx, __yy); \
- }, \
- __y); \
+#define _GLIBCXX_SIMD_FIXED_OP(name_, op_) \
+ template <typename _Tp, typename... _As> \
+ static inline constexpr _SimdTuple<_Tp, _As...> name_( \
+ const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y) \
+ { \
+ return __x._M_apply_per_chunk( \
+ [](auto __impl, auto __xx, auto __yy) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+ return __impl.name_(__xx, __yy); \
+ }, \
+ __y); \
}
_GLIBCXX_SIMD_FIXED_OP(_S_plus, +)
@@ -1532,18 +1539,20 @@ template <int _Np, typename>
static inline constexpr _SimdTuple<_Tp, _As...>
_S_bit_shift_left(const _SimdTuple<_Tp, _As...>& __x, int __y)
{
- return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
- return __impl._S_bit_shift_left(__xx, __y);
- });
+ return __x._M_apply_per_chunk(
+ [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __impl._S_bit_shift_left(__xx, __y);
+ });
}
template <typename _Tp, typename... _As>
static inline constexpr _SimdTuple<_Tp, _As...>
_S_bit_shift_right(const _SimdTuple<_Tp, _As...>& __x, int __y)
{
- return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
- return __impl._S_bit_shift_right(__xx, __y);
- });
+ return __x._M_apply_per_chunk(
+ [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __impl._S_bit_shift_right(__xx, __y);
+ });
}
// math {{{2
@@ -1557,35 +1566,40 @@ template <int _Np, typename>
{ \
if constexpr (is_same_v<_Tp, _RetTp>) \
return __x._M_apply_per_chunk( \
- [](auto __impl, auto __xx) constexpr { \
- using _V = typename decltype(__impl)::simd_type; \
- return __data(__name(_V(__private_init, __xx))); \
- }); \
+ [](auto __impl, auto __xx) \
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { \
+ using _V = typename decltype(__impl)::simd_type; \
+ return __data(__name(_V(__private_init, __xx))); \
+ }); \
else \
return __optimize_simd_tuple( \
- __x.template _M_apply_r<_RetTp>([](auto __impl, auto __xx) { \
- return __impl._S_##__name(__xx); \
- })); \
+ __x.template _M_apply_r<_RetTp>( \
+ [](auto __impl, auto __xx) \
+ _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { return __impl._S_##__name(__xx); })); \
} \
else if constexpr ( \
is_same_v< \
_Tp, \
_RetTp> && (... && is_same_v<_SimdTuple<_Tp, _As...>, _More>) ) \
return __x._M_apply_per_chunk( \
- [](auto __impl, auto __xx, auto... __pack) constexpr { \
- using _V = typename decltype(__impl)::simd_type; \
- return __data(__name(_V(__private_init, __xx), \
- _V(__private_init, __pack)...)); \
- }, \
- __more...); \
+ [](auto __impl, auto __xx, auto... __pack) \
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { \
+ using _V = typename decltype(__impl)::simd_type; \
+ return __data(__name(_V(__private_init, __xx), \
+ _V(__private_init, __pack)...)); \
+ }, __more...); \
else if constexpr (is_same_v<_Tp, _RetTp>) \
return __x._M_apply_per_chunk( \
- [](auto __impl, auto __xx, auto... __pack) constexpr { \
- using _V = typename decltype(__impl)::simd_type; \
- return __data(__name(_V(__private_init, __xx), \
- __autocvt_to_simd(__pack)...)); \
- }, \
- __more...); \
+ [](auto __impl, auto __xx, auto... __pack) \
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { \
+ using _V = typename decltype(__impl)::simd_type; \
+ return __data(__name(_V(__private_init, __xx), \
+ __autocvt_to_simd(__pack)...)); \
+ }, __more...); \
else \
__assert_unreachable<_Tp>(); \
}
@@ -1657,10 +1671,10 @@ template <int _Np, typename>
__fixed_size_storage_t<int, _SimdTuple<_Tp, _Abis...>::_S_size()>* __z)
{
return __x._M_apply_per_chunk(
- [](auto __impl, const auto __xx, const auto __yy, auto& __zz) {
- return __impl._S_remquo(__xx, __yy, &__zz);
- },
- __y, *__z);
+ [](auto __impl, const auto __xx, const auto __yy, auto& __zz)
+ _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ { return __impl._S_remquo(__xx, __yy, &__zz); },
+ __y, *__z);
}
template <typename _Tp, typename... _As>
@@ -1669,12 +1683,10 @@ template <int _Np, typename>
__fixed_size_storage_t<int, _Np>& __exp) noexcept
{
return __x._M_apply_per_chunk(
- [](auto __impl, const auto& __a, auto& __b) {
- return __data(
- frexp(typename decltype(__impl)::simd_type(__private_init, __a),
- __autocvt_to_simd(__b)));
- },
- __exp);
+ [](auto __impl, const auto& __a, auto& __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __data(frexp(typename decltype(__impl)::simd_type(__private_init, __a),
+ __autocvt_to_simd(__b)));
+ }, __exp);
}
#define _GLIBCXX_SIMD_TEST_ON_TUPLE_(name_) \
@@ -1700,7 +1712,7 @@ template <int _Np, typename>
_S_increment(_SimdTuple<_Ts...>& __x)
{
__for_each(
- __x, [](auto __meta, auto& native) constexpr {
+ __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__meta._S_increment(native);
});
}
@@ -1710,7 +1722,7 @@ template <int _Np, typename>
_S_decrement(_SimdTuple<_Ts...>& __x)
{
__for_each(
- __x, [](auto __meta, auto& native) constexpr {
+ __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__meta._S_decrement(native);
});
}
@@ -1722,11 +1734,10 @@ template <int _Np, typename>
__cmp(const _SimdTuple<_Tp, _As...>& __x, \
const _SimdTuple<_Tp, _As...>& __y) \
{ \
- return _M_test( \
- [](auto __impl, auto __xx, auto __yy) constexpr { \
- return __impl.__cmp(__xx, __yy); \
- }, \
- __x, __y); \
+ return _M_test([](auto __impl, auto __xx, auto __yy) \
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA \
+ { return __impl.__cmp(__xx, __yy); }, \
+ __x, __y); \
}
_GLIBCXX_SIMD_CMP_OPERATIONS(_S_equal_to)
@@ -1753,12 +1764,13 @@ template <int _Np, typename>
_S_masked_assign(const _MaskMember __bits, _SimdTuple<_Tp, _As...>& __lhs,
const __type_identity_t<_SimdTuple<_Tp, _As...>>& __rhs)
{
- __for_each(
- __lhs, __rhs,
- [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
- __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
- __native_rhs);
- });
+ __for_each(__lhs, __rhs,
+ [&](auto __meta, auto& __native_lhs, auto __native_rhs)
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ {
+ __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
+ __native_rhs);
+ });
}
// Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1769,7 +1781,7 @@ template <int _Np, typename>
const __type_identity_t<_Tp> __rhs)
{
__for_each(
- __lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+ __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
__rhs);
});
@@ -1782,12 +1794,13 @@ template <int _Np, typename>
const _SimdTuple<_Tp, _As...>& __rhs,
_Op __op)
{
- __for_each(
- __lhs, __rhs,
- [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
- __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
- __native_lhs, __native_rhs, __op);
- });
+ __for_each(__lhs, __rhs,
+ [&](auto __meta, auto& __native_lhs, auto __native_rhs)
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ {
+ __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
+ __native_lhs, __native_rhs, __op);
+ });
}
// Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1798,7 +1811,7 @@ template <int _Np, typename>
const _Tp& __rhs, _Op __op)
{
__for_each(
- __lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+ __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
__native_lhs, __rhs, __op);
});
@@ -1899,7 +1912,7 @@ template <int _Np, typename>
// _Np _UShort, _UInt, _ULLong, float, and double can be more efficient.
_ULLong __r = 0;
using _Vs = __fixed_size_storage_t<_UChar, _Np>;
- __for_each(_Vs{}, [&](auto __meta, auto) {
+ __for_each(_Vs{}, [&](auto __meta, auto) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__r |= __meta._S_mask_to_shifted_ullong(
__meta._S_mask_impl._S_load(&__mem[__meta._S_offset],
_SizeConstant<__meta._S_size()>()));
@@ -1912,9 +1925,10 @@ template <int _Np, typename>
_MaskMember __mask,
const bool* __mem) noexcept
{
- _BitOps::_S_bit_iteration(__mask.to_ullong(), [&](auto __i) {
- __merge.set(__i, __mem[__i]);
- });
+ _BitOps::_S_bit_iteration(__mask.to_ullong(),
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __merge.set(__i, __mem[__i]);
+ });
return __merge;
}
@@ -1932,7 +1946,8 @@ template <int _Np, typename>
static inline void _S_masked_store(const _MaskMember __v, bool* __mem,
const _MaskMember __k) noexcept
{
- _BitOps::_S_bit_iteration(__k, [&](auto __i) { __mem[__i] = __v[__i]; });
+ _BitOps::_S_bit_iteration(
+ __k, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; });
}
// logical and bitwise operators {{{2
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index b008139..61f9ad2 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -788,7 +788,7 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
// __exponent(__x) returns the exponent value (bias removed) as
// simd<_Up> with integral _Up
- auto&& __exponent = [](const _V& __v) {
+ auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
using namespace std::experimental::__proposed;
using _IV = rebind_simd_t<
conditional_t<sizeof(_Tp) == sizeof(_LLong), _LLong, int>, _V>;
@@ -931,7 +931,7 @@ template <typename _R, typename _ToApply, typename _Tp, typename... _Tps>
{
return {__private_init,
__data(__arg0)._M_apply_per_chunk(
- [&](auto __impl, const auto&... __inner) {
+ [&](auto __impl, const auto&... __inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
using _V = typename decltype(__impl)::simd_type;
return __data(__apply(_V(__private_init, __inner)...));
},
@@ -1092,8 +1092,9 @@ _GLIBCXX_SIMD_CVTING2(hypot)
if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1)
{
return __fixed_size_apply<simd<_Tp, _Abi>>(
- [](auto __a, auto __b, auto __c) { return hypot(__a, __b, __c); },
- __x, __y, __z);
+ [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return hypot(__a, __b, __c);
+ }, __x, __y, __z);
}
else
{
@@ -1380,9 +1381,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>([&](auto __i) {
- return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]);
- });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]);
+ });
}
template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1391,9 +1392,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>([&](auto __i) {
- return std::assoc_legendre(__n[__i], __m[__i], __x[__i]);
- });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::assoc_legendre(__n[__i], __m[__i], __x[__i]);
+ });
}
_GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp)
@@ -1414,8 +1415,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
hermite(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>(
- [&](auto __i) { return std::hermite(__n[__i], __x[__i]); });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::hermite(__n[__i], __x[__i]);
+ });
}
template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1423,8 +1425,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>(
- [&](auto __i) { return std::laguerre(__n[__i], __x[__i]); });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::laguerre(__n[__i], __x[__i]);
+ });
}
template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1432,8 +1435,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>(
- [&](auto __i) { return std::legendre(__n[__i], __x[__i]); });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::legendre(__n[__i], __x[__i]);
+ });
}
_GLIBCXX_SIMD_MATH_CALL_(riemann_zeta)
@@ -1443,8 +1447,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
sph_bessel(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>(
- [&](auto __i) { return std::sph_bessel(__n[__i], __x[__i]); });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::sph_bessel(__n[__i], __x[__i]);
+ });
}
template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1453,9 +1458,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
const simd<_Tp, _Abi>& theta)
{
- return simd<_Tp, _Abi>([&](auto __i) {
- return std::assoc_legendre(__l[__i], __m[__i], theta[__i]);
- });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::assoc_legendre(__l[__i], __m[__i], theta[__i]);
+ });
}
template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1463,8 +1468,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
sph_neumann(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
const simd<_Tp, _Abi>& __x)
{
- return simd<_Tp, _Abi>(
- [&](auto __i) { return std::sph_neumann(__n[__i], __x[__i]); });
+ return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return std::sph_neumann(__n[__i], __x[__i]);
+ });
}
// }}}
diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h
index 0945092..c809addd 100644
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@@ -61,7 +61,7 @@ template <typename _Abi, typename>
_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
const _Up* __mem) noexcept
{
- __execute_n_times<_Np>([&](auto __i) {
+ __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if (__k[__i] != 0)
__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
});
@@ -75,7 +75,7 @@ template <typename _Abi, typename>
_S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
_MaskMember<_Tp> __k)
{
- __execute_n_times<_Np>([&](auto __i) {
+ __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if (__k[__i] != 0)
__mem[__i] = __v[__i];
});
@@ -286,7 +286,7 @@ struct _MaskImplNeonMixin
{
constexpr auto __bitsel
= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(
__i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
});
@@ -306,7 +306,7 @@ struct _MaskImplNeonMixin
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
@@ -322,7 +322,7 @@ struct _MaskImplNeonMixin
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
@@ -346,7 +346,7 @@ struct _MaskImplNeonMixin
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
@@ -361,7 +361,7 @@ struct _MaskImplNeonMixin
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h
index 3570246..c8dde61 100644
--- a/libstdc++-v3/include/experimental/bits/simd_x86.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86.h
@@ -537,16 +537,17 @@ struct _CommonImplX86 : _CommonImplBuiltin
_S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
{
if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
- _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
- if constexpr (_Np <= 16)
- return _mm_movm_epi8(__x._M_to_bits());
- else if constexpr (_Np <= 32)
- return _mm256_movm_epi8(__x._M_to_bits());
- else if constexpr (_Np <= 64)
- return _mm512_movm_epi8(__x._M_to_bits());
- else
- __assert_unreachable<_SizeConstant<_Np>>();
- }()),
+ _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
+ [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (_Np <= 16)
+ return _mm_movm_epi8(__x._M_to_bits());
+ else if constexpr (_Np <= 32)
+ return _mm256_movm_epi8(__x._M_to_bits());
+ else if constexpr (_Np <= 64)
+ return _mm512_movm_epi8(__x._M_to_bits());
+ else
+ __assert_unreachable<_SizeConstant<_Np>>();
+ }()),
__mem);
else if constexpr (__have_bmi2)
{
@@ -554,7 +555,7 @@ struct _CommonImplX86 : _CommonImplBuiltin
_S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
else
__execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
- [&](auto __i) {
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
constexpr size_t __offset = __i * sizeof(size_t);
constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
if constexpr (__todo == 1)
@@ -575,7 +576,7 @@ struct _CommonImplX86 : _CommonImplBuiltin
});
}
else if constexpr (__have_sse2 && _Np > 7)
- __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
+ __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
constexpr int __offset = __i * 16;
constexpr int __todo = std::min(16, int(_Np) - __offset);
const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
@@ -765,9 +766,10 @@ struct _CommonImplX86 : _CommonImplBuiltin
static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
if (__k._M_is_constprop() && __at0._M_is_constprop()
&& __at1._M_is_constprop())
- return __generate_from_n_evaluations<_Np,
- __vector_type_t<_Tp, _Np>>([&](
- auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; });
+ return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
+ [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __k[__i] ? __at1[__i] : __at0[__i];
+ });
else if constexpr (sizeof(__at0) == 64
|| (__have_avx512vl && sizeof(__at0) >= 16))
return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
@@ -994,9 +996,8 @@ template <typename _Abi, typename>
}
else
_BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
- [&](auto __i) {
- __merge._M_set(__i, static_cast<_Tp>(
- __mem[__i]));
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
});
}
 /* Very uncertain that the following improves anything. Needs
@@ -1417,11 +1418,12 @@ template <typename _Abi, typename>
const auto __yf = __convert_all<_FloatV, __n_floatv>(
_Abi::__make_padding_nonzero(__as_vector(__y)));
return __call_with_n_evaluations<__n_floatv>(
- [](auto... __quotients) {
+ [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __vector_convert<_R>(__quotients...);
},
- [&__xf,
- &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
+ [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+ -> _SimdWrapper<_Float, __n_intermediate>
+ {
#if __RECIPROCAL_MATH__
// If -freciprocal-math is active, using the `/` operator is
// incorrect because it may be translated to an imprecise
@@ -1980,7 +1982,7 @@ template <typename _Abi, typename>
{
auto __mask = __vector_bitcast<_UChar>(
__vector_bitcast<_UShort>(__iy) << 5);
- auto __maskl = [&]() {
+ auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
};
auto __xh = __vector_bitcast<short>(__ix);
@@ -2067,19 +2069,20 @@ template <typename _Abi, typename>
} //}}}
else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
{
- [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) {
- if constexpr (sizeof(__a) == 16)
- return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
- 0xaa);
- else if constexpr (sizeof(__a) == 32)
- return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
- 0xaa);
- else if constexpr (sizeof(__a) == 64)
- return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
- __to_intrin(__b));
- else
- __assert_unreachable<decltype(__a)>();
- };
+ [[maybe_unused]] auto __blend_0xaa
+ = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ if constexpr (sizeof(__a) == 16)
+ return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
+ 0xaa);
+ else if constexpr (sizeof(__a) == 32)
+ return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
+ 0xaa);
+ else if constexpr (sizeof(__a) == 64)
+ return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
+ __to_intrin(__b));
+ else
+ __assert_unreachable<decltype(__a)>();
+ };
if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
return __intrin_bitcast<_V>(is_signed_v<_Up>
? _mm_srav_epi16(__ix, __iy)
@@ -2136,9 +2139,10 @@ template <typename _Abi, typename>
{
auto __k = __vector_bitcast<_UShort>(__iy) << 11;
auto __x128 = __vector_bitcast<_Up>(__ix);
- auto __mask = [](__vector_type16_t<_UShort> __kk) {
- return __vector_bitcast<short>(__kk) < 0;
- };
+ auto __mask
+ = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return __vector_bitcast<short>(__kk) < 0;
+ };
// do __x128 = 0 where __y[4] is set
__x128 = __mask(__k) ? decltype(__x128)() : __x128;
// do __x128 =>> 8 where __y[3] is set
@@ -2178,7 +2182,7 @@ template <typename _Abi, typename>
}
else
{
- auto __shift = [](auto __a, auto __b) {
+ auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr (is_signed_v<_Up>)
return _mm_sra_epi32(__a, __b);
else
@@ -3495,7 +3499,7 @@ struct _MaskImplX86Mixin
return _S_to_maskvector<_Up, _ToN>(__k);
else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
- [&](auto __i) -> _Up { return -__x[__i.value]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
else if constexpr (sizeof(_Up) == 1)
{
if constexpr (sizeof(_UI) == 16)
@@ -3740,9 +3744,9 @@ struct _MaskImplX86Mixin
else if constexpr (__bits_per_element >= _ToN)
{
constexpr auto __bitmask
- = __generate_vector<_V>([](auto __i) constexpr->_UpUInt {
- return __i < _ToN ? 1ull << __i : 0;
- });
+ = __generate_vector<_V>([](auto __i)
+ constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
+ { return __i < _ToN ? 1ull << __i : 0; });
const auto __bits
= __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
if constexpr (__bits_per_element > _ToN)
@@ -3753,11 +3757,11 @@ struct _MaskImplX86Mixin
else
{
const _V __tmp
- = __generate_vector<_V>([&](auto __i) constexpr {
+ = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_UpUInt>(
__k >> (__bits_per_element * (__i / __bits_per_element)));
})
- & __generate_vector<_V>([](auto __i) constexpr {
+ & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return static_cast<_UpUInt>(1ull
<< (__i % __bits_per_element));
}); // mask bit index
@@ -3793,7 +3797,7 @@ struct _MaskImplX86Mixin
const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
return __generate_from_n_evaluations<std::min(_ToN, _Np),
__vector_type_t<_Up, _ToN>>(
- [&](auto __i) -> _Up { return __y[__i.value]; });
+ [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
}
using _To = __vector_type_t<_Up, _ToN>;
[[maybe_unused]] constexpr size_t _FromN = _Np;
@@ -4128,8 +4132,11 @@ struct _MaskImplX86Mixin
{
const auto __bools = -__x._M_data;
const _ULLong __k = __call_with_n_evaluations<_Np>(
- [](auto... __bits) { return (__bits | ...); },
- [&](auto __i) { return _ULLong(__bools[+__i]) << __i; });
+ [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return (__bits | ...);
+ }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+ return _ULLong(__bools[+__i]) << __i;
+ });
if (__builtin_is_constant_evaluated()
|| __builtin_constant_p(__k))
return __k;
@@ -4285,13 +4292,14 @@ template <typename _Abi, typename>
static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
if constexpr (__have_avx512bw)
{
- const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) {
- if constexpr (__is_avx512_abi<_Abi>())
- return __bits;
- else
- return _S_to_maskvector<_Tp>(
- _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
- };
+ const auto __to_vec_or_bits
+ = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
+ if constexpr (__is_avx512_abi<_Abi>())
+ return __bits;
+ else
+ return _S_to_maskvector<_Tp>(
+ _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
+ };
if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
{
@@ -4478,7 +4486,7 @@ template <typename _Abi, typename>
}
else
{
- _BitOps::_S_bit_iteration(__mask, [&](auto __i) {
+ _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__merge._M_set(__i, __mem[__i]);
});
return __merge;
@@ -4557,7 +4565,7 @@ template <typename _Abi, typename>
{
if constexpr (__have_avx512bw_vl)
_CommonImplX86::_S_store<_Np>(
- __vector_bitcast<char>([](auto __data) {
+ __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
if constexpr (_Np <= 16)
return _mm_maskz_set1_epi8(__data, 1);
else if constexpr (_Np <= 32)