// Implementation of <simd> -*- C++ -*-

// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_SIMD_MASK_H #define _GLIBCXX_SIMD_MASK_H 1 #ifdef _GLIBCXX_SYSHDR #pragma GCC system_header #endif #if __cplusplus >= 202400L #include "simd_iterator.h" #include "vec_ops.h" #if _GLIBCXX_X86 #include "simd_x86.h" #endif #include #include // psabi warnings are bogus because the ABI of the internal types never leaks into user code #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpsabi" namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION namespace simd { template struct _SwapNeighbors { consteval unsigned operator()(unsigned __i, unsigned __size) const { if (__size % (2 * _Np) != 0) __builtin_abort(); // swap_neighbors permutation requires a multiple of 2N elements else if (std::has_single_bit(_Np)) return __i ^ _Np; else if (__i % (2 * _Np) >= _Np) return __i - _Np; else return __i + _Np; } }; template constexpr auto __bitset_split(const bitset<_Mp>& __b) { constexpr auto __bits_per_word = __CHAR_BIT__ * __SIZEOF_LONG__; if constexpr (_Np % __bits_per_word == 0) { struct _Tmp { bitset<_Np> _M_lo; bitset<_Mp - _Np> _M_hi; }; return __builtin_bit_cast(_Tmp, __b); } else { constexpr auto __bits_per_ullong = __CHAR_BIT__ * __SIZEOF_LONG_LONG__; static_assert(_Mp <= __bits_per_ullong); using _Lo = _Bitmask<_Np>; using _Hi = _Bitmask<_Mp - _Np>; struct _Tmp { _Lo _M_lo; _Hi _M_hi; }; return _Tmp {static_cast<_Lo>(__b.to_ullong()), static_cast<_Hi>(__b.to_ullong() >> _Np)}; } } static_assert(__bitset_split<64>(bitset<128>(1))._M_lo == bitset<64>(1)); static_assert(__bitset_split<64>(bitset<128>(1))._M_hi == bitset<64>(0)); // [simd.traits] // --- rebind --- template struct rebind {}; /** * Computes a member @c type `basic_vec<_Tp, Abi>`, where @c Abi is chosen such that the * number of elements is equal to `_Vp::size()` and features of the ABI tag (such as the * internal representation of masks, or storage order of complex components) are preserved. 
*/ template <__vectorizable _Tp, __simd_vec_type _Vp, _ArchTraits _Traits> //requires requires { typename __deduce_abi_t<_Tp, _Vp::size()>; } struct rebind<_Tp, _Vp, _Traits> { using type = __similar_vec<_Tp, _Vp::size(), typename _Vp::abi_type>; }; /** * As above, except for @c basic_mask. */ template <__vectorizable _Tp, __simd_mask_type _Mp, _ArchTraits _Traits> //requires requires { typename __deduce_abi_t<_Tp, _Mp::size()>; } struct rebind<_Tp, _Mp, _Traits> { using type = __similar_mask<_Tp, _Mp::size(), typename _Mp::abi_type>; }; template using rebind_t = typename rebind<_Tp, _Vp>::type; // --- resize --- template <__simd_size_type _Np, typename _Vp, _ArchTraits _Traits = {}> struct resize {}; template <__simd_size_type _Np, __simd_vec_type _Vp, _ArchTraits _Traits> requires (_Np >= 1) //requires requires { typename __deduce_abi_t; } struct resize<_Np, _Vp, _Traits> { using type = __similar_vec; }; template <__simd_size_type _Np, __simd_mask_type _Mp, _ArchTraits _Traits> requires (_Np >= 1) //requires requires { typename __deduce_abi_t; } struct resize<_Np, _Mp, _Traits> { using _A1 = decltype(__abi_rebind<__mask_element_size<_Mp>, _Np, typename _Mp::abi_type, true>()); static_assert(__abi_tag<_A1>); static_assert(_Mp::abi_type::_S_variant == _A1::_S_variant || __scalar_abi_tag<_A1> || __scalar_abi_tag); using type = basic_mask<__mask_element_size<_Mp>, _A1>; }; template <__simd_size_type _Np, typename _Vp> using resize_t = typename resize<_Np, _Vp>::type; // [simd.syn] inline constexpr __simd_size_type zero_element = numeric_limits::min(); inline constexpr __simd_size_type uninit_element = zero_element + 1; // [simd.permute.static] template<__simd_size_type _Np = 0, __simd_vec_or_mask_type _Vp, __index_permutation_function<_Vp> _IdxMap> [[__gnu__::__always_inline__]] constexpr resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp> permute(const _Vp& __v, _IdxMap&& __idxmap) { return resize_t<_Np == 0 ? 
_Vp::size() : _Np, _Vp>::_S_static_permute(__v, __idxmap); } // [simd.permute.dynamic] template<__simd_vec_or_mask_type _Vp, __simd_integral _Ip> [[__gnu__::__always_inline__]] constexpr resize_t<_Ip::size(), _Vp> permute(const _Vp& __v, const _Ip& __indices) { return __v[__indices]; } // [simd.creation] ---------------------------------------------------------- template<__simd_vec_type _Vp, typename _Ap> [[__gnu__::__always_inline__]] constexpr auto chunk(const basic_vec& __x) noexcept { return __x.template _M_chunk<_Vp>(); } template<__simd_mask_type _Mp, typename _Ap> [[__gnu__::__always_inline__]] constexpr auto chunk(const basic_mask<__mask_element_size<_Mp>, _Ap>& __x) noexcept { return __x.template _M_chunk<_Mp>(); } template<__simd_size_type _Np, typename _Tp, typename _Ap> [[__gnu__::__always_inline__]] constexpr auto chunk(const basic_vec<_Tp, _Ap>& __x) noexcept -> decltype(chunk>>(__x)) { return chunk>>(__x); } template<__simd_size_type _Np, size_t _Bytes, typename _Ap> [[__gnu__::__always_inline__]] constexpr auto chunk(const basic_mask<_Bytes, _Ap>& __x) noexcept -> decltype(chunk>>(__x)) { return chunk>>(__x); } // LWG???? (reported 2025-11-25) template constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>> cat(const basic_vec<_Tp, _A0>& __x0, const basic_vec<_Tp, _Abis>&... __xs) noexcept { return resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>> ::_S_concat(__x0, __xs...); } // LWG???? (reported 2025-11-25) template constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_mask<_Bytes, _A0>> cat(const basic_mask<_Bytes, _A0>& __x0, const basic_mask<_Bytes, _Abis>&... __xs) noexcept { return resize_t<(_A0::_S_size + ... 
+ _Abis::_S_size), basic_mask<_Bytes, _A0>> ::_S_concat(__x0, __xs...); } // implementation helper for chunk and cat consteval int __packs_to_skip_at_front(int __offset, initializer_list __sizes) { int __i = 0; int __n = 0; for (int __s : __sizes) { __n += __s; if (__n > __offset) return __i; ++__i; } __builtin_trap(); // called out of contract } consteval int __packs_to_skip_at_back(int __offset, int __max, initializer_list __sizes) { int __i = 0; int __n = -__offset; for (int __s : __sizes) { ++__i; __n += __s; if (__n >= __max) return int(__sizes.size()) - __i; } return 0; } // in principle, this overload allows conversions to _Dst - and it wouldn't be wrong - but the // general overload below is still a better candidate in overload resolution template [[__gnu__::__always_inline__]] constexpr _Dst __extract_simd_at(auto _Offset, const _Dst& __r, const auto&...) requires(_Offset.value == 0) { return __r; } template [[__gnu__::__always_inline__]] constexpr _Dst __extract_simd_at(auto _Offset, const _V0&, const _Dst& __r, const auto&...) requires(_Offset.value == _V0::size.value) { return __r; } template [[__gnu__::__always_inline__]] constexpr _Dst __extract_simd_at(auto _Offset, const _Vs&... 
__xs) { using _Adst = typename _Dst::abi_type; if constexpr (_Adst::_S_nreg >= 2) { using _Dst0 = remove_cvref_t()._M_get_low())>; using _Dst1 = remove_cvref_t()._M_get_high())>; return _Dst::_S_init(__extract_simd_at<_Dst0>(_Offset, __xs...), __extract_simd_at<_Dst1>(_Offset + _Dst0::size, __xs...)); } else { using _Ret = remove_cvref_t()._M_get())>; constexpr bool __use_bitmask = __simd_mask_type<_Dst> && _Adst::_S_is_bitmask; constexpr int __dst_full_size = __bit_ceil(unsigned(_Adst::_S_size)); constexpr int __nargs = sizeof...(__xs); using _Afirst = typename _Vs...[0]::abi_type; using _Alast = typename _Vs...[__nargs - 1]::abi_type; const auto& __x0 = __xs...[0]; const auto& __xlast = __xs...[__nargs - 1]; constexpr int __ninputs = (_Vs::size.value + ...); if constexpr (_Offset.value >= _Afirst::_S_size || __ninputs - _Offset.value - _Alast::_S_size >= _Adst::_S_size) { // can drop inputs at the front and/or back of the pack constexpr int __skip_front = __packs_to_skip_at_front(_Offset.value, {_Vs::size.value...}); constexpr int __skip_back = __packs_to_skip_at_back(_Offset.value, _Adst::_S_size, {_Vs::size.value...}); static_assert(__skip_front > 0 || __skip_back > 0); constexpr auto [...__skip] = _IotaArray<__skip_front>; constexpr auto [...__is] = _IotaArray<__nargs - __skip_front - __skip_back>; constexpr int __new_offset = _Offset.value - (0 + ... 
+ _Vs...[__skip]::size.value); return __extract_simd_at<_Dst>(cw<__new_offset>, __xs...[__is + __skip_front]...); } else if constexpr (_Adst::_S_size == 1) { // trivial conversion to one value_type return _Dst(__x0[_Offset.value]); } else if constexpr (_Afirst::_S_nreg >= 2 || _Alast::_S_nreg >= 2) { // flatten first and/or last multi-register argument constexpr bool __flatten_first = _Afirst::_S_nreg >= 2; constexpr bool __flatten_last = __nargs > 1 && _Alast::_S_nreg >= 2; constexpr auto [...__is] = _IotaArray<__nargs - __flatten_first - __flatten_last>; if constexpr (__flatten_first && __flatten_last) return __extract_simd_at<_Dst>( _Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]..., __xlast._M_get_low(), __xlast._M_get_high()); else if constexpr (__flatten_first) return __extract_simd_at<_Dst>( _Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]...); else return __extract_simd_at<_Dst>( _Offset, __xs...[__is]..., __xlast._M_get_low(), __xlast._M_get_high()); } else if constexpr (__simd_mask_type<_Dst> && ((_Adst::_S_variant != _Vs::abi_type::_S_variant && !__scalar_abi_tag) || ...)) { // convert ABI tag if incompatible return __extract_simd_at<_Dst>( _Offset, static_cast&>(__xs)...); } // at this point __xs should be as small as possible; there may be some corner cases left else if constexpr (__nargs == 1) { // simple and optimal if constexpr (__use_bitmask) return _Dst(_Ret(__x0._M_to_uint() >> _Offset.value)); else return _VecOps<_Ret>::_S_extract(__x0._M_concat_data(false), _Offset); } else if constexpr (__use_bitmask) { // fairly simple and optimal bit shifting solution static_assert(_Afirst::_S_nreg == 1); static_assert(_Offset.value < _Afirst::_S_size); int __offset = -_Offset.value; _Ret __r; template for (const auto& __x : {__xs...}) { if (__offset <= 0) __r = _Ret(__x._M_to_uint() >> -__offset); else if (__offset < _Adst::_S_size) __r |= _Ret(_Ret(__x._M_to_uint()) << __offset); __offset += __x.size.value; } return 
_Dst(__r); } else if constexpr (__nargs == 2 && _Offset == 0 && _Adst::_S_nreg == 1 && _Afirst::_S_size >= _Alast::_S_size && __has_single_bit(unsigned(_Afirst::_S_size))) { // simple __vec_concat if constexpr (_Afirst::_S_size == 1) // even simpler init from two values return _Ret{__x0._M_concat_data()[0], __xlast._M_concat_data()[0]}; else { const auto __v0 = __x0._M_concat_data(); const auto __v1 = __vec_zero_pad_to(__xlast._M_concat_data()); return __vec_concat(__v0, __v1); } } else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0 && _Afirst::_S_nreg == 1 && _Alast::_S_size == 1) { // optimize insertion of one element at the end _Ret __r = __vec_zero_pad_to(__x0._M_get()); __vec_set(__r, _Afirst::_S_size, __xlast._M_concat_data()[0]); return __r; } else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0 && _Afirst::_S_nreg == 1 && _Alast::_S_size == 2) { // optimize insertion of two elements at the end _Ret __r = __vec_zero_pad_to(__x0._M_concat_data()); const auto __x1 = __xlast._M_concat_data(); if constexpr (sizeof(__x1) <= sizeof(double) && (_Afirst::_S_size & 1) == 0) { // can use a single insert instruction using _Up = __conditional_t< is_floating_point_v<__vec_value_type<_Ret>>, __conditional_t, __integer_from>; auto __r2 = __vec_bit_cast<_Up>(__r); __vec_set(__r2, _Afirst::_S_size / 2, __vec_bit_cast<_Up>(__x1)[0]); __r = reinterpret_cast<_Ret>(__r2); } else { __vec_set(__r, _Afirst::_S_size, __x1[0]); __vec_set(__r, _Afirst::_S_size + 1, __x1[1]); } return __r; } else if constexpr (__nargs == 2 && _Afirst::_S_nreg == 1 && _Alast::_S_nreg == 1) { // optimize concat of two input vectors (e.g. 
using palignr) constexpr auto [...__is] = _IotaArray<__dst_full_size>; constexpr int __v2_offset = __width_of; return __builtin_shufflevector( __x0._M_concat_data(), __xlast._M_concat_data(), [](int __i) consteval { if (__i < _Afirst::_S_size) return __i; __i -= _Afirst::_S_size; if (__i < _Alast::_S_size) return __i + __v2_offset; else return -1; }(__is + _Offset.value)...); } else if (__is_const_known(__xs...) || __ninputs == _Adst::_S_size) { // hard to optimize for the compiler, but necessary in constant expressions return _VecOps<_Ret>::_S_extract( __vec_concat_sized<__xs.size.value...>(__xs._M_concat_data(false)...), _Offset); } else { // fallback to concatenation in memory => load the result alignas(_Ret) __vec_value_type<_Ret> __tmp[std::max(__ninputs, _Offset.value + __dst_full_size)] = {}; int __offset = 0; template for (const auto& __x : {__xs...}) { if constexpr (__simd_mask_type<_Dst>) (-__x)._M_store(__tmp + __offset); else __x._M_store(__tmp + __offset); __offset += __x.size.value; } _Ret __r; __builtin_memcpy(&__r, __tmp + _Offset.value, sizeof(_Ret)); return __r; } } } // [simd.mask] -------------------------------------------------------------- template class basic_mask { public: using value_type = bool; using abi_type = _Ap; #define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \ "of template arguments to basic_mask." 
basic_mask() = delete(_GLIBCXX_DELETE_SIMD); ~basic_mask() = delete(_GLIBCXX_DELETE_SIMD); basic_mask(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD); basic_mask& operator=(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD); #undef _GLIBCXX_DELETE_SIMD }; template class _MaskBase { using _Mp = basic_mask<_Bytes, _Ap>; protected: using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>; static_assert(destructible<_VecType> || _Bytes > sizeof(0ull)); public: using iterator = __iterator<_Mp>; using const_iterator = __iterator; constexpr iterator begin() noexcept { return {static_cast<_Mp&>(*this), 0}; } constexpr const_iterator begin() const noexcept { return cbegin(); } constexpr const_iterator cbegin() const noexcept { return {static_cast(*this), 0}; } constexpr default_sentinel_t end() const noexcept { return {}; } constexpr default_sentinel_t cend() const noexcept { return {}; } static constexpr auto size = __simd_size_c<_Ap::_S_size>; _MaskBase() = default; // LWG issue from 2026-03-04 / P4042R0 template requires (_Ap::_S_size != _UAbi::_S_size) explicit _MaskBase(const basic_mask<_UBytes, _UAbi>&) = delete("size mismatch"); template explicit _MaskBase(const basic_vec<_Up, _UAbi>&) = delete("use operator! 
or a comparison to convert a vec into a mask"); template requires (_Ap::_S_size != _UAbi::_S_size) operator basic_vec<_Up, _UAbi>() const = delete("size mismatch"); }; template requires (_Ap::_S_nreg == 1) class basic_mask<_Bytes, _Ap> : public _MaskBase<_Bytes, _Ap> { using _Base = _MaskBase<_Bytes, _Ap>; using _VecType = _Base::_VecType; template friend class basic_mask; template friend class basic_vec; static constexpr int _S_size = _Ap::_S_size; using _DataType = typename _Ap::template _MaskDataType<_Bytes>; static constexpr bool _S_has_bool_member = is_same_v<_DataType, bool>; static constexpr bool _S_is_scalar = _S_has_bool_member; static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask; static constexpr int _S_full_size = [] { if constexpr (_S_is_scalar) return _S_size; else if constexpr (_S_use_bitmask && _S_size < __CHAR_BIT__) return __CHAR_BIT__; else return __bit_ceil(unsigned(_S_size)); }(); static constexpr bool _S_is_partial = _S_size != _S_full_size; static constexpr _DataType _S_implicit_mask = [] { if constexpr (_S_is_scalar) return true; else if (!_S_is_partial) return _DataType(~_DataType()); else if constexpr (_S_use_bitmask) return _DataType((_DataType(1) << _S_size) - 1); else { constexpr auto [...__is] = _IotaArray<_S_full_size>; return _DataType{ (__is < _S_size ? -1 : 0)... }; } }(); // Actual padding bytes, not padding elements. // => _S_padding_bytes is 0 even if _S_is_partial is true. 
static constexpr size_t _S_padding_bytes = 0; _DataType _M_data; public: using value_type = bool; using abi_type = _Ap; using iterator = _Base::iterator; using const_iterator = _Base::const_iterator; // internal but public API ---------------------------------------------- [[__gnu__::__always_inline__]] static constexpr basic_mask _S_init(_DataType __x) { basic_mask __r; __r._M_data = __x; return __r; } [[__gnu__::__always_inline__]] static constexpr basic_mask _S_init(unsigned_integral auto __bits) { return basic_mask(__bits); } [[__gnu__::__always_inline__]] constexpr const _DataType& _M_get() const { return _M_data; } /** @internal * Bit-cast the given object @p __x to basic_mask. * * This is necessary for _S_nreg > 1 where the last element can be bool or when the sizeof * doesn't match because of different alignment requirements of the sub-masks. */ template [[__gnu__::__always_inline__]] static constexpr basic_mask _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x) { return __builtin_bit_cast(basic_mask, __x._M_concat_data()); } [[__gnu__::__always_inline__]] constexpr auto _M_concat_data(bool __do_sanitize = _S_is_partial) const { if constexpr (_S_is_scalar) return __vec_builtin_type<__integer_from<_Bytes>, 1>{__integer_from<_Bytes>(-_M_data)}; else { if constexpr (_S_is_partial) if (__do_sanitize) return _DataType(_M_data & _S_implicit_mask); return _M_data; } } /** @internal * Returns a mask where the first @p __n elements are true. All remaining elements are false. 
* * @pre @p __n > 0 && @p __n < _S_size */ template <_ArchTraits _Traits = {}> [[__gnu__::__always_inline__]] static constexpr basic_mask _S_partial_mask_of_n(int __n) { static_assert(!_S_is_scalar); if constexpr (!_S_use_bitmask) { using _Ip = __integer_from<_Bytes>; __glibcxx_simd_precondition(__n >= 0 && __n <= numeric_limits<_Ip>::max(), "_S_partial_mask_of_n without _S_use_bitmask requires " "positive __n that does not overflow."); constexpr _DataType __0123 = __builtin_bit_cast(_DataType, _IotaArray<_Ip(_S_full_size)>); return basic_mask(__0123 < _Ip(__n)); } else { __glibcxx_simd_precondition(__n >= 0 && __n <= 255, "The x86 BZHI instruction requires __n to " "only use bits 0:7"); #if __has_builtin(__builtin_ia32_bzhi_si) if constexpr (_S_size <= 32 && _Traits._M_have_bmi2()) return _S_init(_Bitmask<_S_size>( __builtin_ia32_bzhi_si(~0u >> (32 - _S_size), unsigned(__n)))); #endif #if __has_builtin(__builtin_ia32_bzhi_di) else if constexpr (_S_size <= 64 && _Traits._M_have_bmi2()) return _S_init(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n))); #endif if constexpr (_S_size <= 32) { __glibcxx_simd_precondition(__n < 32, "invalid shift"); return _S_init(_Bitmask<_S_size>((1u << unsigned(__n)) - 1)); } else if constexpr (_S_size <= 64) { __glibcxx_simd_precondition(__n < 64, "invalid shift"); return _S_init((1ull << unsigned(__n)) - 1); } else static_assert(false); } } [[__gnu__::__always_inline__]] constexpr basic_mask& _M_and_neighbors() { if constexpr (_S_use_bitmask) _M_data &= ((_M_data >> 1) & 0x5555'5555'5555'5555ull) | ((_M_data << 1) & ~0x5555'5555'5555'5555ull); else _M_data &= _VecOps<_DataType>::_S_swap_neighbors(_M_data); return *this; } [[__gnu__::__always_inline__]] constexpr basic_mask& _M_or_neighbors() { if constexpr (_S_use_bitmask) _M_data |= ((_M_data >> 1) & 0x5555'5555'5555'5555ull) | ((_M_data << 1) & ~0x5555'5555'5555'5555ull); else _M_data |= _VecOps<_DataType>::_S_swap_neighbors(_M_data); return *this; } template 
[[__gnu__::__always_inline__]] constexpr auto _M_chunk() const noexcept { constexpr int __n = _S_size / _Mp::_S_size; constexpr int __rem = _S_size % _Mp::_S_size; constexpr auto [...__is] = _IotaArray<__n>; if constexpr (__rem == 0) return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)...}; else { using _Rest = resize_t<__rem, _Mp>; return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)..., __extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, *this)); } } [[__gnu__::__always_inline__]] static constexpr const basic_mask& _S_concat(const basic_mask& __x0) noexcept { return __x0; } template requires (sizeof...(_As) > 1) [[__gnu__::__always_inline__]] static constexpr basic_mask _S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept { static_assert(_S_size == (_As::_S_size + ...)); return __extract_simd_at(cw<0>, __xs...); } // [simd.mask.overview] default constructor ----------------------------- basic_mask() = default; // [simd.mask.overview] conversion extensions --------------------------- [[__gnu__::__always_inline__]] constexpr basic_mask(_DataType __x) requires(!_S_is_scalar && !_S_use_bitmask) : _M_data(__x) {} [[__gnu__::__always_inline__]] constexpr operator _DataType() requires(!_S_is_scalar && !_S_use_bitmask) { return _M_data; } // [simd.mask.ctor] broadcast constructor ------------------------------- [[__gnu__::__always_inline__]] constexpr explicit basic_mask(same_as auto __x) noexcept // LWG 4382. : _M_data(__x ? 
_S_implicit_mask : _DataType()) {} // [simd.mask.ctor] conversion constructor ------------------------------ template requires (_S_size == _UAbi::_S_size) [[__gnu__::__always_inline__]] constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes)) basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept : _M_data([&] [[__gnu__::__always_inline__]] { using _UV = basic_mask<_UBytes, _UAbi>; // bool to bool if constexpr (_S_is_scalar) return __x[0]; // converting from an "array of bool" else if constexpr (_UV::_S_is_scalar) { constexpr auto [...__is] = _IotaArray<_S_size>; if constexpr (_S_use_bitmask) return ((_DataType(__x[__is]) << __is) | ...); else return _DataType{__vec_value_type<_DataType>(-__x[__is])...}; } // vec-/bit-mask to bit-mask | bit-mask to vec-mask else if constexpr (_S_use_bitmask || _UV::_S_use_bitmask) return basic_mask(__x.to_bitset())._M_data; // vec-mask to vec-mask else if constexpr (_Bytes == _UBytes) return _S_recursive_bit_cast(__x)._M_data; else { #if _GLIBCXX_X86 // TODO: turn this into a __vec_mask_cast overload in simd_x86.h if constexpr (_Bytes == 1 && _UBytes == 2) if (!__is_const_known(__x)) { if constexpr (_UAbi::_S_nreg == 1) return __x86_cvt_vecmask<_DataType>(__x._M_data); else if constexpr (_UAbi::_S_nreg == 2) { auto __lo = __x._M_data0._M_data; auto __hi = __vec_zero_pad_to( __x._M_data1._M_concat_data()); return __x86_cvt_vecmask<_DataType>(__lo, __hi); } } #endif return __vec_mask_cast<_DataType>(__x._M_concat_data()); } }()) {} using _Base::_MaskBase; // [simd.mask.ctor] generator constructor ------------------------------- template <__simd_generator_invokable _Fp> [[__gnu__::__always_inline__]] constexpr explicit basic_mask(_Fp&& __gen) : _M_data([&] [[__gnu__::__always_inline__]] { constexpr auto [...__is] = _IotaArray<_S_size>; if constexpr (_S_is_scalar) return __gen(__simd_size_c<0>); else if constexpr (_S_use_bitmask) return _DataType(((_DataType(__gen(__simd_size_c<__is>)) << __is) | ...)); else 
return _DataType{__vec_value_type<_DataType>( __gen(__simd_size_c<__is>) ? -1 : 0)...}; }()) {} // [simd.mask.ctor] bitset constructor ---------------------------------- [[__gnu__::__always_inline__]] constexpr basic_mask(const same_as> auto& __b) noexcept // LWG 4382. : basic_mask(static_cast<_Bitmask<_S_size>>(__b.to_ullong())) { // more than 64 elements in one register? not yet. static_assert(_S_size <= numeric_limits::digits); } // [simd.mask.ctor] uint constructor ------------------------------------ template requires (!same_as<_Tp, bool>) // LWG 4382. [[__gnu__::__always_inline__]] constexpr explicit basic_mask(_Tp __val) noexcept : _M_data([&] [[__gnu__::__always_inline__]] () { if constexpr (_S_use_bitmask) return __val; else if constexpr (_S_is_scalar) return bool(__val & 1); else if (__is_const_known(__val)) { constexpr auto [...__is] = _IotaArray<_S_size>; return _DataType {__vec_value_type<_DataType>((__val & (1ull << __is)) == 0 ? 0 : -1)...}; } else { using _Ip = typename _VecType::value_type; _VecType __v0 = _Ip(__val); constexpr int __bits_per_element = sizeof(_Ip) * __CHAR_BIT__; constexpr _VecType __pow2 = _VecType(cw<1>) << (__iota<_VecType> % cw<__bits_per_element>); if constexpr (_S_size < __bits_per_element) return ((__v0 & __pow2) > cw<0>)._M_concat_data(); else if constexpr (_S_size == __bits_per_element) return ((__v0 & __pow2) != cw<0>)._M_concat_data(); else { static_assert(_Bytes == 1); static_assert(sizeof(_Ip) == 1); _Bitmask<_S_size> __bits = __val; static_assert(sizeof(_VecType) % sizeof(__bits) == 0); if constexpr (sizeof(_DataType) == 32) { __vec_builtin_type<_UInt<8>, 4> __v1 = { 0xffu & (__bits >> (0 * __CHAR_BIT__)), 0xffu & (__bits >> (1 * __CHAR_BIT__)), 0xffu & (__bits >> (2 * __CHAR_BIT__)), 0xffu & (__bits >> (3 * __CHAR_BIT__)), }; __v1 *= 0x0101'0101'0101'0101ull; __v0 = __builtin_bit_cast(_VecType, __v1); return ((__v0 & __pow2) != cw<0>)._M_data; } else { using _V1 = vec<_Ip, sizeof(__bits)>; _V1 __v1 = 
__builtin_bit_cast(_V1, __bits); __v0 = _VecType::_S_static_permute(__v1, [](int __i) { return __i / __CHAR_BIT__; }); return ((__v0 & __pow2) != cw<0>)._M_data; } } } }()) {} //Effects: Initializes the first M elements to the corresponding bit values in val, where M is //the smaller of size() and the number of bits in the value representation //([basic.types.general]) of the type of val. If M is less than size(), the remaining elements //are initialized to zero. // [simd.mask.subscr] --------------------------------------------------- [[__gnu__::__always_inline__]] constexpr value_type operator[](__simd_size_type __i) const { __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds"); if constexpr (_S_is_scalar) return _M_data; else if constexpr (_S_use_bitmask) return bool((_M_data >> __i) & 1); else return _M_data[__i] & 1; } // [simd.mask.unary] ---------------------------------------------------- [[__gnu__::__always_inline__]] constexpr basic_mask operator!() const noexcept { if constexpr (_S_is_scalar) return _S_init(!_M_data); else return _S_init(~_M_data); } [[__gnu__::__always_inline__]] constexpr _VecType operator+() const noexcept requires destructible<_VecType> { return operator _VecType(); } constexpr _VecType operator+() const noexcept = delete; [[__gnu__::__always_inline__]] constexpr _VecType operator-() const noexcept requires destructible<_VecType> { using _Ip = typename _VecType::value_type; if constexpr (_S_is_scalar) return _Ip(-int(_M_data)); else if constexpr (_S_use_bitmask) return __select_impl(*this, _Ip(-1), _Ip()); else { static_assert(sizeof(_VecType) == sizeof(_M_data)); return __builtin_bit_cast(_VecType, _M_data); } } constexpr _VecType operator-() const noexcept = delete; [[__gnu__::__always_inline__]] constexpr _VecType operator~() const noexcept requires destructible<_VecType> { using _Ip = typename _VecType::value_type; if constexpr (_S_is_scalar) return _Ip(~int(_M_data)); else if constexpr 
(_S_use_bitmask) return __select_impl(*this, _Ip(-2), _Ip(-1)); else { static_assert(sizeof(_VecType) == sizeof(_M_data)); return __builtin_bit_cast(_VecType, _M_data) - _Ip(1); } } constexpr _VecType operator~() const noexcept = delete; // [simd.mask.conv] ----------------------------------------------------- template requires (_UAbi::_S_size == _S_size) [[__gnu__::__always_inline__]] constexpr explicit(sizeof(_Up) != _Bytes) operator basic_vec<_Up, _UAbi>() const noexcept { if constexpr (_S_is_scalar) return _Up(_M_data); else { using _UV = basic_vec<_Up, _UAbi>; return __select_impl(static_cast<_UV::mask_type>(*this), _Up(1), _UV()); } } using _Base::operator basic_vec; // [simd.mask.namedconv] ------------------------------------------------ [[__gnu__::__always_inline__]] constexpr bitset<_S_size> to_bitset() const noexcept { // more than 64 elements in one register? not yet. static_assert(_S_size <= numeric_limits::digits); return to_ullong(); } /** @internal * Return the mask as the smallest possible unsigned integer (up to 64 bits). * * @tparam _Offset Adjust the return type & value to start at bit @p _Offset. * @tparam _Use_2_for_1 Store the value of every second element into one bit of the result. 
* (precondition: each even/odd pair stores the same value) */ template [[__gnu__::__always_inline__]] constexpr _Bitmask<_S_size + _Offset> _M_to_uint() const { constexpr int __nbits = _S_size; static_assert(__nbits + _Offset <= numeric_limits::digits); // before shifting using _U0 = _Bitmask<__nbits>; // potentially wider type needed for shift by _Offset using _Ur = _Bitmask<__nbits + _Offset>; if constexpr (_S_is_scalar || _S_use_bitmask) { auto __bits = _M_data; if constexpr (_S_is_partial) __bits &= _S_implicit_mask; return _Ur(__bits) << _Offset; } else { #if _GLIBCXX_X86 if (!__is_const_known(*this)) { _U0 __uint; if constexpr (_Bytes != 2) // movmskb would duplicate each bit __uint = _U0(__x86_movmsk(_M_data)); else if constexpr (_Bytes == 2 && _Traits._M_have_bmi2()) __uint = __bit_extract_even<__nbits>(__x86_movmsk(_M_data)); else if constexpr (_Bytes == 2) return __similar_mask(*this).template _M_to_uint<_Offset>(); else static_assert(false); // TODO: with AVX512 use __builtin_ia32_cvt[bwdq]2mask(128|256|512) // TODO: Ask for compiler builtin to do the best of the above. 
This should also // combine with a preceding vector-mask compare to produce a bit-mask compare (on // AVX512) if constexpr (_S_is_partial) __uint &= (_U0(1) << _S_size) - 1; return _Ur(__uint) << _Offset; } #endif using _IV = _VecType; static_assert(destructible<_IV>); const typename _IV::mask_type& __k = [&] [[__gnu__::__always_inline__]] () { if constexpr (is_same_v) return *this; else return typename _IV::mask_type(*this); }(); constexpr int __n = _IV::size(); if constexpr (_Bytes * __CHAR_BIT__ >= __n) // '1 << __iota' cannot overflow { // reduce(select(k, powers_of_2, 0)) constexpr _IV __pow2 = _IV(cw<1>) << __iota<_IV>; return _Ur(_U0(__select_impl(__k, __pow2, _IV()) ._M_reduce(bit_or<>()))) << _Offset; } else if constexpr (__n % __CHAR_BIT__ != 0) { // recurse after splitting in two constexpr int __n_lo = __n - __n % __CHAR_BIT__; const auto [__lo, __hi] = chunk<__n_lo>(__k); _Ur __bits = __hi.template _M_to_uint<_Offset + __n_lo>(); return __bits | __lo.template _M_to_uint<_Offset>(); } else { // limit powers_of_2 to 1, 2, 4, ..., 128 constexpr _IV __pow2 = _IV(cw<1>) << (__iota<_IV> % _IV(cw<__CHAR_BIT__>)); _IV __x = __select_impl(__k, __pow2, _IV()); // partial reductions of 8 neighboring elements __x |= _IV::_S_static_permute(__x, _SwapNeighbors<4>()); __x |= _IV::_S_static_permute(__x, _SwapNeighbors<2>()); __x |= _IV::_S_static_permute(__x, _SwapNeighbors<1>()); // permute partial reduction results to the front __x = _IV::_S_static_permute(__x, [](int __i) { return __i * 8 < __n ? 
__i * 8 : uninit_element; }); // extract front as scalar unsigned _U0 __bits = __builtin_bit_cast( __similar_vec<_U0, __n * _Bytes / sizeof(_U0), _Ap>, __x)[0]; // mask off unused bits if constexpr (!__has_single_bit(unsigned(__nbits))) __bits &= (_U0(1) << __nbits) - 1; return _Ur(__bits) << _Offset; } } } [[__gnu__::__always_inline__]] constexpr unsigned long long to_ullong() const { return _M_to_uint(); } // [simd.mask.binary] --------------------------------------------------- [[__gnu__::__always_inline__]] friend constexpr basic_mask operator&&(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data & __y._M_data); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator||(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data | __y._M_data); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator&(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data & __y._M_data); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator|(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data | __y._M_data); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator^(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data ^ __y._M_data); } // [simd.mask.cassign] -------------------------------------------------- [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator&=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data &= __y._M_data; return __x; } [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator|=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data |= __y._M_data; return __x; } [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator^=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data ^= __y._M_data; return __x; } // [simd.mask.comparison] ----------------------------------------------- 
// The comparisons are expressed through the mask's own logical operators and yield a mask.
      // The ordering operators treat 'false' as less than 'true' (e.g. '<' is '!__x && __y').
      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator==(const basic_mask& __x, const basic_mask& __y) noexcept
      { return !(__x ^ __y); }

      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
      { return __x ^ __y; }

      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
      { return __x || !__y; }

      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
      { return !__x || __y; }

      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator>(const basic_mask& __x, const basic_mask& __y) noexcept
      { return __x && !__y; }

      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      operator<(const basic_mask& __x, const basic_mask& __y) noexcept
      { return !__x && __y; }

      // [simd.mask.cond] -----------------------------------------------------
      // Blend of two masks: where __k is set the element comes from __t, otherwise from __f.
      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
      {
        if constexpr (!_S_use_bitmask)
          {
#if _GLIBCXX_X86
            // This works around bad code-gen when the compiler can't see that __k is a
            // vector-mask. This pattern is recognized to match the x86 blend instructions, which
            // only consider the sign bit of the mask register. Also, without SSE4, if the
            // compiler knows that __k is a vector-mask, then the '< 0' is elided.
            // On x86 this return makes the generic return below unreachable.
            return __k._M_data < 0 ? __t._M_data : __f._M_data;
#endif
            return __k._M_data ? __t._M_data : __f._M_data;
          }
        else
          // bit-mask representation: take __t's bits where __k is set and __f's bits elsewhere
          return (__k._M_data & __t._M_data) | (~__k._M_data & __f._M_data);
      }

      // Blend of two scalars: equal values give a broadcast of that value; otherwise the result
      // is __k itself (when __t is true) or its negation.
      // NOTE(review): the argument list of 'same_as' looks like it was lost in this copy of the
      // file -- confirm against the upstream header.
      [[__gnu__::__always_inline__]]
      friend constexpr basic_mask
      __select_impl(const basic_mask& __k, same_as auto __t, same_as auto __f) noexcept
      {
        if (__t == __f)
          return basic_mask(__t);
        else
          return __t ? __k : !__k;
      }

      // Blend of two scalars of a vectorizable type whose size equals _Bytes; the result is a
      // vec<_T0, _S_size>. The non-scalar case broadcasts both values and converts __k to the
      // mask type of that vec before delegating.
      template <__vectorizable _T0, same_as<_T0> _T1>
        requires (sizeof(_T0) == _Bytes)
        [[__gnu__::__always_inline__]]
        friend constexpr vec<_T0, _S_size>
        __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
        {
          if constexpr (_S_is_scalar)
            return __k._M_data ? __t : __f;
          else
            {
              using _Vp = vec<_T0, _S_size>;
              using _Mp = typename _Vp::mask_type;
              return __select_impl(_Mp(__k), _Vp(__t), _Vp(__f));
            }
        }

      // [simd.mask.reductions] implementation --------------------------------
      // The three boolean reductions share one structure: scalar masks read _M_data directly,
      // bit-masks compare against _S_implicit_mask / 0 (masking first when the mask is partial),
      // and vector-masks use an x86 helper unless the value is a known constant, falling back
      // to _VecOps otherwise.

      [[__gnu__::__always_inline__]]
      constexpr bool
      _M_all_of() const noexcept
      {
        if constexpr (_S_is_scalar)
          return _M_data;
        else if constexpr (_S_use_bitmask)
          {
            if constexpr (_S_is_partial)
              // PR120925 (partial kortest pattern not recognized)
              return (_M_data & _S_implicit_mask) == _S_implicit_mask;
            else
              return _M_data == _S_implicit_mask;
          }
#if _GLIBCXX_X86
        else if (!__is_const_known(_M_data))
          return __x86_vecmask_all<_S_size>(_M_data);
#endif
        else
          return _VecOps<_DataType, _S_size>::_S_all_of(_M_data);
      }

      [[__gnu__::__always_inline__]]
      constexpr bool
      _M_any_of() const noexcept
      {
        if constexpr (_S_is_scalar)
          return _M_data;
        else if constexpr (_S_use_bitmask)
          {
            if constexpr (_S_is_partial)
              // PR120925 (partial kortest pattern not recognized)
              return (_M_data & _S_implicit_mask) != 0;
            else
              return _M_data != 0;
          }
#if _GLIBCXX_X86
        else if (!__is_const_known(_M_data))
          return __x86_vecmask_any<_S_size>(_M_data);
#endif
        else
          return _VecOps<_DataType, _S_size>::_S_any_of(_M_data);
      }

      [[__gnu__::__always_inline__]]
      constexpr bool
      _M_none_of() const noexcept
      {
        if constexpr (_S_is_scalar)
          return !_M_data;
        else if constexpr (_S_use_bitmask)
          {
            if constexpr (_S_is_partial)
              // PR120925 (partial kortest pattern not recognized)
              return (_M_data & _S_implicit_mask) == 0;
            else
              return _M_data == 0;
          }
#if _GLIBCXX_X86
        else if (!__is_const_known(_M_data))
          return __x86_vecmask_none<_S_size>(_M_data);
#endif
        else
          return _VecOps<_DataType, _S_size>::_S_none_of(_M_data);
      }

      // Number of 'true' elements: a population count of the integer form of the mask.
      // NOTE(review): the type argument of 'numeric_limits' is missing in this copy; it selects
      // between the 'popcount' and 'popcountll' builtins -- confirm against upstream.
      [[__gnu__::__always_inline__]]
      constexpr __simd_size_type
      _M_reduce_count() const noexcept
      {
        if constexpr (_S_is_scalar)
          return int(_M_data);
        else if constexpr (_S_size <= numeric_limits::digits)
          return __builtin_popcount(_M_to_uint());
        else
          return __builtin_popcountll(to_ullong());
      }

      // Index of the lowest 'true' element. Precondition: the mask is not empty.
      [[__gnu__::__always_inline__]]
      constexpr __simd_size_type
      _M_reduce_min_index() const
      {
        const auto __bits = _M_to_uint();
        __glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index.");
        if constexpr (_S_size == 1)
          return 0;
        else
          return __countr_zero(__bits);
      }

      // Index of the highest 'true' element. Precondition: the mask is not empty.
      [[__gnu__::__always_inline__]]
      constexpr __simd_size_type
      _M_reduce_max_index() const
      {
        const auto __bits = _M_to_uint();
        __glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index.");
        if constexpr (_S_size == 1)
          return 0;
        else
          return __highest_bit(__bits);
      }

      // True if the compiler can prove that __x._M_data is a compile-time constant.
      [[__gnu__::__always_inline__]]
      friend constexpr bool
      __is_const_known(const basic_mask& __x)
      { return __builtin_constant_p(__x._M_data); }
    };

  // Specialization for ABIs that use more than one register (_Ap::_S_nreg > 1). The mask is
  // stored as two smaller masks, _M_data0 (the first _N0 elements) and _M_data1 (the remaining
  // _N1 elements), and most operations recurse into the halves.
  // NOTE(review): the template parameter lists after the bare 'template' keywords appear to
  // have been lost in this copy of the file -- confirm against the upstream header.
  template
    requires (_Ap::_S_nreg > 1)
    class basic_mask<_Bytes, _Ap>
    : public _MaskBase<_Bytes, _Ap>
    {
      using _Base = _MaskBase<_Bytes, _Ap>;

      using _VecType = _Base::_VecType;

      template
        friend class basic_mask;

      template
        friend class basic_vec;

      static constexpr int _S_size = _Ap::_S_size;

      // _N0 is half of _S_size rounded up to a power of two; _N1 is the remainder.
      static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;

      static constexpr int _N1 = _S_size - _N0;

      // The register count is split the same way as the element count.
      static constexpr int _Nreg0 = __bit_ceil(unsigned(_Ap::_S_nreg)) / 2;

      static constexpr int _Nreg1 = _Ap::_S_nreg - _Nreg0;

      // explicitly request _Nreg0 rather than use __abi_rebind. This way _Float16 can use half
      // of native registers (since they convert to full float32 registers).
using _Abi0 = decltype(_Ap::template _S_resize<_N0, _Nreg0>());

      using _Abi1 = decltype(_Ap::template _S_resize<_N1, _Nreg1>());

      // Type of the low half (_N0 elements).
      using _Mask0 = basic_mask<_Bytes, _Abi0>;

      // the implementation (and users) depend on elements being contiguous in memory
      static_assert(_Mask0::_S_padding_bytes == 0 && !_Mask0::_S_is_partial);

      // Type of the high half (_N1 elements).
      using _Mask1 = basic_mask<_Bytes, _Abi1>;

      // Only the high half can be partial (the low half is asserted to be complete above).
      static constexpr bool _S_is_partial = _Mask1::_S_is_partial;

      // _Ap::_S_nreg determines how deep the recursion goes. E.g. basic_mask<4, _Abi<8, 4>> cannot
      // use basic_mask<4, _Abi<4, 1>> as _Mask0/1 types.
      static_assert(_Mask0::abi_type::_S_nreg + _Mask1::abi_type::_S_nreg == _Ap::_S_nreg);

      static constexpr bool _S_use_bitmask = _Mask0::_S_use_bitmask;

      static constexpr bool _S_is_scalar = _Mask0::_S_is_scalar;

      _Mask0 _M_data0;

      _Mask1 _M_data1;

      static constexpr bool _S_has_bool_member = _Mask1::_S_has_bool_member;

      // by construction _N0 >= _N1
      // => sizeof(_Mask0) >= sizeof(_Mask1)
      //    and __alignof__(_Mask0) >= __alignof__(_Mask1)
      // Padding is added when the alignments differ, plus whatever _Mask1 reports itself.
      static constexpr size_t _S_padding_bytes
        = (__alignof__(_Mask0) == __alignof__(_Mask1)
             ? 0 : __alignof__(_Mask0) - (sizeof(_Mask1) % __alignof__(_Mask0)))
            + _Mask1::_S_padding_bytes;

    public:
      using value_type = bool;

      using abi_type = _Ap;

      using iterator = _Base::iterator;

      using const_iterator = _Base::const_iterator;

      // Builds a mask from its low half __x and high half __y.
      [[__gnu__::__always_inline__]]
      static constexpr basic_mask
      _S_init(const _Mask0& __x, const _Mask1& __y)
      {
        basic_mask __r;
        __r._M_data0 = __x;
        __r._M_data1 = __y;
        return __r;
      }

      // Builds a mask from an unsigned integer by forwarding to the constructor.
      [[__gnu__::__always_inline__]]
      static constexpr basic_mask
      _S_init(unsigned_integral auto __bits)
      { return basic_mask(__bits); }

      // Builds a mask from a __trivial_pair holding one value per half. A member of unsigned
      // type is passed to that half's constructor; any other member is passed to that half's
      // _S_init. If the first member is unsigned the second one must be unsigned as well.
      // NOTE(review): the template parameter list (presumably declaring _U0 and _U1) is missing
      // in this copy of the file.
      template
        [[__gnu__::__always_inline__]]
        static constexpr basic_mask
        _S_init(const __trivial_pair<_U0, _U1>& __bits)
        {
          if constexpr (is_unsigned_v<_U0>)
            {
              static_assert(is_unsigned_v<_U1>);
              return _S_init(_Mask0(__bits._M_first), _Mask1(__bits._M_second));
            }
          else if constexpr (is_unsigned_v<_U1>)
            return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1(__bits._M_second));
          else
            return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1::_S_init(__bits._M_second));
        }

      // Read-only access to the low half.
      [[__gnu__::__always_inline__]]
      constexpr const _Mask0&
      _M_get_low() const
      { return _M_data0; }

      // Read-only access to the high half.
      [[__gnu__::__always_inline__]]
      constexpr const _Mask1&
      _M_get_high() const
      { return _M_data1; }

      // Reinterprets the bits of another two-part mask as this type. A whole-object bit cast is
      // only used when sizes allow it; if the source has a bool member or padding, or this type
      // is larger than the source, the low half is bit-cast on its own and the high half
      // recurses.
      template
        [[__gnu__::__always_inline__]]
        static constexpr basic_mask
        _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
        {
          using _Mp = basic_mask<_UBytes, _UAbi>;
          if constexpr (_Mp::_S_has_bool_member || sizeof(basic_mask) > sizeof(__x)
                          || _Mp::_S_padding_bytes != 0)
            return _S_init(__builtin_bit_cast(_Mask0, __x._M_data0),
                           _Mask1::_S_recursive_bit_cast(__x._M_data1));
          else if constexpr (sizeof(basic_mask) == sizeof(__x))
            return __builtin_bit_cast(basic_mask, __x);
          else
            { // e.g. on IvyBridge (different alignment => different sizeof)
              // wrap this type in a struct aligned like the source so that the sizes match
              struct _Tmp { alignas(_Mp) basic_mask _M_data; };
              return __builtin_bit_cast(_Tmp, __x)._M_data;
            }
        }

      // Joins the data of both halves into one object: a single bit-mask integer (high half
      // shifted left by _N0) or a concatenated vector (high half zero-padded first).
      // __do_sanitize is only forwarded to the high half.
      [[__gnu__::__always_inline__]]
      constexpr auto
      _M_concat_data(bool __do_sanitize = _S_is_partial) const
      {
        if constexpr (_S_use_bitmask)
          {
            static_assert(_S_size <= numeric_limits::digits, "cannot concat more than 64 bits");
            using _Up = _Bitmask<_S_size>;
            return _Up(_M_data0._M_concat_data()
                         | (_Up(_M_data1._M_concat_data(__do_sanitize)) << _N0));
          }
        else
          {
            auto __lo = _M_data0._M_concat_data();
            auto __hi = __vec_zero_pad_to(_M_data1._M_concat_data(__do_sanitize));
            return __vec_concat(__lo, __hi);
          }
      }

      // Returns a mask whose first __n elements are true and whose remaining elements are false.
      // NOTE(review): presumably callers guarantee 0 < __n < _S_size (see the '=> __n == 1'
      // remark below) -- confirm against the callers.
      template <_ArchTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        static constexpr basic_mask
        _S_partial_mask_of_n(int __n)
        {
#if __has_builtin(__builtin_ia32_bzhi_di)
          // with BMI2 a bit-mask of up to 64 bits is produced directly from an all-ones value
          if constexpr (_S_use_bitmask && _S_size <= 64 && _Traits._M_have_bmi2())
            return basic_mask(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n)));
#endif
          if constexpr (_N0 == 1)
            {
              static_assert(_S_size == 2); // => __n == 1
              return _S_init(_Mask0(true), _Mask1(false));
            }
          else if (__n < _N0)
            // the boundary lies inside the low half
            return _S_init(_Mask0::_S_partial_mask_of_n(__n), _Mask1(false));
          else if (__n == _N0 || _N1 == 1)
            // the boundary coincides with the split between the halves
            return _S_init(_Mask0(true), _Mask1(false));
          else if constexpr (_N1 != 1)
            // the boundary lies inside the high half
            return _S_init(_Mask0(true), _Mask1::_S_partial_mask_of_n(__n - _N0));
        }

      // Applies _M_and_neighbors() to both halves and returns *this.
      [[__gnu__::__always_inline__]]
      constexpr basic_mask&
      _M_and_neighbors()
      {
        _M_data0._M_and_neighbors();
        _M_data1._M_and_neighbors();
        return *this;
      }

      // Applies _M_or_neighbors() to both halves and returns *this.
      [[__gnu__::__always_inline__]]
      constexpr basic_mask&
      _M_or_neighbors()
      {
        _M_data0._M_or_neighbors();
        _M_data1._M_or_neighbors();
        return *this;
      }

      // Splits the mask into consecutive pieces of type _Mp. If _S_size is a multiple of
      // _Mp::_S_size the result is an array of _Mp; otherwise it is a tuple whose last element
      // is a smaller mask holding the remaining elements.
      template
        [[__gnu__::__always_inline__]]
        constexpr auto
        _M_chunk() const noexcept
        {
          constexpr int __n = _S_size / _Mp::_S_size;
          constexpr int __rem = _S_size % _Mp::_S_size;
          constexpr auto [...__is] = _IotaArray<__n>;
          if constexpr (__rem == 0)
            return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>,
                                                          _M_data0, _M_data1)...};
          else
            {
              using _Rest = resize_t<__rem, _Mp>;
              return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, _M_data0, _M_data1)...,
                           __extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, _M_data0, _M_data1));
            }
        }

      // Concatenation of a single mask of this type is the identity.
      [[__gnu__::__always_inline__]]
      static constexpr basic_mask
      _S_concat(const basic_mask& __x0) noexcept
      { return __x0; }

      // Concatenates two or more masks whose sizes add up to _S_size: the low half is extracted
      // at element 0 and the high half at element _N0 of the combined sequence.
      template
        requires (sizeof...(_As) >= 2)
        [[__gnu__::__always_inline__]]
        static constexpr basic_mask
        _S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept
        {
          static_assert(_S_size == (_As::_S_size + ...));
          return _S_init(__extract_simd_at<_Mask0>(cw<0>, __xs...),
                         __extract_simd_at<_Mask1>(cw<_N0>, __xs...));
        }

      // [simd.mask.overview] default constructor -----------------------------
      basic_mask() = default;

      // [simd.mask.overview] conversion extensions ---------------------------
      // TODO: any?

      // [simd.mask.ctor] broadcast constructor -------------------------------
      // Initializes both halves from the same value.
      [[__gnu__::__always_inline__]]
      constexpr explicit
      basic_mask(same_as auto __x) noexcept // LWG 4382.
      : _M_data0(__x), _M_data1(__x)
      {}

      // [simd.mask.ctor] conversion constructor ------------------------------
      // Converts from a mask with the same number of elements. A multi-register source
      // contributes its own halves; a single-register source is split with chunk<_N0>, or read
      // element-wise when a half has exactly one element.
      template
        requires (_S_size == _UAbi::_S_size)
        [[__gnu__::__always_inline__]]
        constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
        basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
        : _M_data0([&] {
            if constexpr (_UAbi::_S_nreg > 1)
              {
                return __x._M_data0;
              }
            else if constexpr (_N0 == 1)
              return _Mask0(__x[0]);
            else
              return get<0>(chunk<_N0>(__x));
          }()),
          _M_data1([&] {
            if constexpr (_UAbi::_S_nreg > 1)
              {
                return __x._M_data1;
              }
            else if constexpr (_N1 == 1)
              return _Mask1(__x[_N0]);
            else
              return get<1>(chunk<_N0>(__x));
          }())
        {}

      using _Base::_MaskBase;

      // [simd.mask.ctor] generator constructor -------------------------------
      // The low half uses __gen directly; the high half calls __gen with indices offset by _N0.
      template <__simd_generator_invokable _Fp>
        [[__gnu__::__always_inline__]]
        constexpr explicit
        basic_mask(_Fp&& __gen)
        : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
                             return __gen(__simd_size_c<__i + _N0>);
                           })
        {}

      // [simd.mask.ctor] bitset constructor ----------------------------------
      // Splits the bitset at bit _N0 and initializes each half from its part.
      [[__gnu__::__always_inline__]]
      constexpr
      basic_mask(const same_as> auto& __b) noexcept // LWG 4382.
      : _M_data0(__bitset_split<_N0>(__b)._M_lo), _M_data1(__bitset_split<_N0>(__b)._M_hi)
      {}

      // [simd.mask.ctor] uint constructor ------------------------------------------
      // The low half takes the low bits of __val; the high half takes __val >> _N0, or zero
      // when _Tp has no more than _N0 bits (which also avoids an over-wide shift).
      template
        requires (!same_as<_Tp, bool>) // LWG 4382.
        [[__gnu__::__always_inline__]]
        constexpr explicit
        basic_mask(_Tp __val) noexcept
        : _M_data0(static_cast<_Bitmask<_N0>>(__val)),
          _M_data1(sizeof(_Tp) * __CHAR_BIT__ > _N0
                     ? static_cast<_Bitmask<_N1>>(__val >> _N0) : _Bitmask<_N1>())
        {}

      // [simd.mask.subscr] ---------------------------------------------------
      // Returns element __i. Precondition: 0 <= __i < _S_size.
      [[__gnu__::__always_inline__]]
      constexpr value_type
      operator[](__simd_size_type __i) const
      {
        __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
        if (__is_const_known(__i))
          // constant index: select the half and index into it
          return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
        else if constexpr (_M_data1._S_has_bool_member)
          // in some cases the last element can be 'bool' instead of bit-/vector-mask;
          // e.g. when the mask is split into two halves where the latter uses
          // _ScalarAbi<1>, which is stored as 'bool'
          return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
        else if constexpr (abi_type::_S_is_bitmask)
          {
            // read the object as bytes and test bit __i
            using _AliasingByte [[__gnu__::__may_alias__]] = unsigned char;
            return bool((reinterpret_cast(this)
                           [__i / __CHAR_BIT__] >> (__i % __CHAR_BIT__)) & 1);
          }
        else
          {
            // read the object as an array of _Bytes-sized integers; non-zero means true
            using _AliasingInt [[__gnu__::__may_alias__]] = __integer_from<_Bytes>;
            return reinterpret_cast(this)[__i] != 0;
          }
      }

      // [simd.mask.unary] ----------------------------------------------------
      // Negation is applied to each half.
      [[__gnu__::__always_inline__]]
      constexpr basic_mask
      operator!() const noexcept
      { return _S_init(!_M_data0, !_M_data1); }

      // Unary +, - and ~ yield a _VecType concatenated from the results of both halves. Each is
      // available only if _VecType is destructible; otherwise the deleted overload is chosen.
      [[__gnu__::__always_inline__]]
      constexpr _VecType
      operator+() const noexcept requires destructible<_VecType>
      { return _VecType::_S_concat(+_M_data0, +_M_data1); }

      constexpr _VecType
      operator+() const noexcept = delete;

      [[__gnu__::__always_inline__]]
      constexpr _VecType
      operator-() const noexcept requires destructible<_VecType>
      { return _VecType::_S_concat(-_M_data0, -_M_data1); }

      constexpr _VecType
      operator-() const noexcept = delete;

      [[__gnu__::__always_inline__]]
      constexpr _VecType
      operator~() const noexcept requires destructible<_VecType>
      { return _VecType::_S_concat(~_M_data0, ~_M_data1); }

      constexpr _VecType
      operator~() const noexcept = delete;

      // [simd.mask.conv] -----------------------------------------------------
      // Conversion to a basic_vec with the same number of elements; explicit unless the element
      // size equals _Bytes. Each half is converted to the matching half type of the result.
      template
        requires (_UAbi::_S_size == _S_size)
        [[__gnu__::__always_inline__]]
        constexpr explicit(sizeof(_Up) != _Bytes)
        operator basic_vec<_Up, _UAbi>() const noexcept
        {
          using _Rp = basic_vec<_Up, _UAbi>;
          return _Rp::_S_init(static_cast<_Rp::_DataType0>(_M_data0),
                              static_cast<_Rp::_DataType1>(_M_data1));
        }

      using _Base::operator basic_vec;

      // [simd.mask.namedconv] ------------------------------------------------
[[__gnu__::__always_inline__]] constexpr bitset<_S_size> to_bitset() const noexcept { if constexpr (_S_size <= numeric_limits::digits) return to_ullong(); else { static_assert(_N0 % numeric_limits::digits == 0); struct _Tmp { bitset<_N0> _M_lo; bitset<_N1> _M_hi; } __tmp = {_M_data0.to_bitset(), _M_data1.to_bitset()}; return __builtin_bit_cast(bitset<_S_size>, __tmp); } } template [[__gnu__::__always_inline__]] constexpr auto _M_to_uint() const { constexpr int _N0x = _N0; if constexpr (_N0x >= numeric_limits::digits) { static_assert(_Offset == 0); return __trivial_pair { _M_data0.template _M_to_uint<0>(), _M_data1.template _M_to_uint<0>() }; } else { #if _GLIBCXX_X86 if constexpr (_Bytes == 2 && !_Traits._M_have_bmi2() && _Ap::_S_nreg == 2 && !_S_use_bitmask) return __similar_mask(*this).template _M_to_uint<_Offset>(); #endif auto __uint = _M_data1.template _M_to_uint<_N0x + _Offset>(); __uint |= _M_data0.template _M_to_uint<_Offset>(); return __uint; } } [[__gnu__::__always_inline__]] constexpr unsigned long long to_ullong() const { if constexpr (_S_size <= numeric_limits::digits) return _M_to_uint(); else { __glibcxx_simd_precondition(_M_data1.to_ullong() == 0, "to_ullong called on mask with 'true' elements at indices" "higher than representable in a ullong"); return _M_data0.to_ullong(); } } // [simd.mask.binary] [[__gnu__::__always_inline__]] friend constexpr basic_mask operator&&(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data0 && __y._M_data0, __x._M_data1 && __y._M_data1); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator||(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data0 || __y._M_data0, __x._M_data1 || __y._M_data1); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator&(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data0 & __y._M_data0, __x._M_data1 & __y._M_data1); } [[__gnu__::__always_inline__]] friend constexpr 
basic_mask operator|(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data0 | __y._M_data0, __x._M_data1 | __y._M_data1); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator^(const basic_mask& __x, const basic_mask& __y) noexcept { return _S_init(__x._M_data0 ^ __y._M_data0, __x._M_data1 ^ __y._M_data1); } // [simd.mask.cassign] [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator&=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data0 &= __y._M_data0; __x._M_data1 &= __y._M_data1; return __x; } [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator|=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data0 |= __y._M_data0; __x._M_data1 |= __y._M_data1; return __x; } [[__gnu__::__always_inline__]] friend constexpr basic_mask& operator^=(basic_mask& __x, const basic_mask& __y) noexcept { __x._M_data0 ^= __y._M_data0; __x._M_data1 ^= __y._M_data1; return __x; } // [simd.mask.comparison] ----------------------------------------------- [[__gnu__::__always_inline__]] friend constexpr basic_mask operator==(const basic_mask& __x, const basic_mask& __y) noexcept { return !(__x ^ __y); } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator!=(const basic_mask& __x, const basic_mask& __y) noexcept { return __x ^ __y; } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator>=(const basic_mask& __x, const basic_mask& __y) noexcept { return __x || !__y; } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator<=(const basic_mask& __x, const basic_mask& __y) noexcept { return !__x || __y; } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator>(const basic_mask& __x, const basic_mask& __y) noexcept { return __x && !__y; } [[__gnu__::__always_inline__]] friend constexpr basic_mask operator<(const basic_mask& __x, const basic_mask& __y) noexcept { return !__x && __y; } // [simd.mask.cond] 
----------------------------------------------------- [[__gnu__::__always_inline__]] friend constexpr basic_mask __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept { return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0), __select_impl(__k._M_data1, __t._M_data1, __f._M_data1)); } [[__gnu__::__always_inline__]] friend constexpr basic_mask __select_impl(const basic_mask& __k, same_as auto __t, same_as auto __f) noexcept { if (__t == __f) return basic_mask(__t); else return __t ? __k : !__k; } template <__vectorizable _T0, same_as<_T0> _T1> requires (sizeof(_T0) == _Bytes) [[__gnu__::__always_inline__]] friend constexpr vec<_T0, _S_size> __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept { using _Vp = vec<_T0, _S_size>; if constexpr (!is_same_v) return __select_impl(static_cast<_Vp::mask_type>(__k), __t, __f); else return _Vp::_S_init(__select_impl(__k._M_data0, __t, __f), __select_impl(__k._M_data1, __t, __f)); } template <_ArchTraits _Traits = {}> [[__gnu__::__always_inline__]] constexpr bool _M_all_of() const { if constexpr (_N0 == _N1) return (_M_data0 && _M_data1)._M_all_of(); else return _M_data0._M_all_of() && _M_data1._M_all_of(); } template <_ArchTraits _Traits = {}> [[__gnu__::__always_inline__]] constexpr bool _M_any_of() const { if constexpr (_N0 == _N1) return (_M_data0 || _M_data1)._M_any_of(); else return _M_data0._M_any_of() || _M_data1._M_any_of(); } template <_ArchTraits _Traits = {}> [[__gnu__::__always_inline__]] constexpr bool _M_none_of() const { if constexpr (_N0 == _N1) return (_M_data0 || _M_data1)._M_none_of(); else return _M_data0._M_none_of() && _M_data1._M_none_of(); } [[__gnu__::__always_inline__]] constexpr __simd_size_type _M_reduce_min_index() const { if constexpr (_S_size <= numeric_limits::digits) { const auto __bits = _M_to_uint(); __glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index."); if constexpr (_S_size == 1) return 0; 
else return __countr_zero(_M_to_uint()); } else if (_M_data0._M_none_of()) return _M_data1._M_reduce_min_index() + _N0; else return _M_data0._M_reduce_min_index(); } [[__gnu__::__always_inline__]] constexpr __simd_size_type _M_reduce_max_index() const { if constexpr (_S_size <= numeric_limits::digits) { const auto __bits = _M_to_uint(); __glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index."); if constexpr (_S_size == 1) return 0; else return __highest_bit(_M_to_uint()); } else if (_M_data1._M_none_of()) return _M_data0._M_reduce_max_index(); else return _M_data1._M_reduce_max_index() + _N0; } [[__gnu__::__always_inline__]] friend constexpr bool __is_const_known(const basic_mask& __x) { return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); } }; } // namespace simd _GLIBCXX_END_NAMESPACE_VERSION } // namespace std #pragma GCC diagnostic pop #endif // C++26 #endif // _GLIBCXX_SIMD_MASK_H