diff options
author | Luc Grosheintz <luc.grosheintz@gmail.com> | 2025-08-03 22:57:28 +0200 |
---|---|---|
committer | Tomasz Kamiński <tkaminsk@redhat.com> | 2025-08-21 10:22:11 +0200 |
commit | 2d3282663c1ab392801ca498daf1542e67a89ae2 (patch) | |
tree | eef5d38b3756ae2c49c7378727bcd8c2ef1ff084 | |
parent | 0197c3b15853de18d7bce386caed6f7fa792fb33 (diff) | |
download | gcc-2d3282663c1ab392801ca498daf1542e67a89ae2.zip gcc-2d3282663c1ab392801ca498daf1542e67a89ae2.tar.gz gcc-2d3282663c1ab392801ca498daf1542e67a89ae2.tar.bz2 |
libstdc++: Reduce indirection in extents::extent.
In both fully static and dynamic extents the comparison
static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.
An illustrative example is:
using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
int required_span_size(const typename Layout::mapping<E>& m)
{ return m.required_span_size(); }
Prior to this commit the generated code (on -O2) is:
2a0: b9 01 00 00 00 mov ecx,0x1
2a5: 31 d2 xor edx,edx
2a7: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2ae: 00 00 00 00
2b2: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
2b9: 00 00 00 00
2bd: 0f 1f 00 nop DWORD PTR [rax]
2c0: 48 8b 04 d5 00 00 00 mov rax,QWORD PTR [rdx*8+0x0]
2c7: 00
2c8: 48 83 f8 ff cmp rax,0xffffffffffffffff
2cc: 0f 84 00 00 00 00 je 2d2 <required_span_size_6d_static+0x32>
2d2: 83 e8 01 sub eax,0x1
2d5: 0f af 04 97 imul eax,DWORD PTR [rdi+rdx*4]
2d9: 48 83 c2 01 add rdx,0x1
2dd: 01 c1 add ecx,eax
2df: 48 83 fa 06 cmp rdx,0x6
2e3: 75 db jne 2c0 <required_span_size_6d_static+0x20>
2e5: 89 c8 mov eax,ecx
2e7: c3 ret
which is a scalar loop, and notably includes the check
308: 48 83 f8 ff cmp rax,0xffffffffffffffff
to assert that the static extent is indeed not -1. Note, that on -O3 the
optimizer eliminates the comparison; and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison also for -O2. With the optimization applied we
get:
2e0: f3 0f 6f 0f movdqu xmm1,XMMWORD PTR [rdi]
2e4: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
2eb: 00
2ec: 8b 57 10 mov edx,DWORD PTR [rdi+0x10]
2ef: 66 0f 6f c1 movdqa xmm0,xmm1
2f3: 66 0f 73 d1 20 psrlq xmm1,0x20
2f8: 66 0f f4 c2 pmuludq xmm0,xmm2
2fc: 66 0f 73 d2 20 psrlq xmm2,0x20
301: 8d 14 52 lea edx,[rdx+rdx*2]
304: 66 0f f4 ca pmuludq xmm1,xmm2
308: 66 0f 70 c0 08 pshufd xmm0,xmm0,0x8
30d: 66 0f 70 c9 08 pshufd xmm1,xmm1,0x8
312: 66 0f 62 c1 punpckldq xmm0,xmm1
316: 66 0f 6f c8 movdqa xmm1,xmm0
31a: 66 0f 73 d9 08 psrldq xmm1,0x8
31f: 66 0f fe c1 paddd xmm0,xmm1
323: 66 0f 6f c8 movdqa xmm1,xmm0
327: 66 0f 73 d9 04 psrldq xmm1,0x4
32c: 66 0f fe c1 paddd xmm0,xmm1
330: 66 0f 7e c0 movd eax,xmm0
334: 8d 54 90 01 lea edx,[rax+rdx*4+0x1]
338: 8b 47 14 mov eax,DWORD PTR [rdi+0x14]
33b: c1 e0 04 shl eax,0x4
33e: 01 d0 add eax,edx
340: c3 ret
Which shows eliminating the trivial comparison, unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.
A similar effect can be observed for fully dynamic extents.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__all_static): New function.
(__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
(__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
(__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
-rw-r--r-- | libstdc++-v3/include/std/mdspan | 35 |
1 files changed, 25 insertions, 10 deletions
diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan index a9168af..4b27111 100644 --- a/libstdc++-v3/include/std/mdspan +++ b/libstdc++-v3/include/std/mdspan @@ -50,6 +50,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION namespace __mdspan { consteval bool + __all_static(std::span<const size_t> __extents) + { + for(auto __ext : __extents) + if (__ext == dynamic_extent) + return false; + return true; + } + + consteval bool __all_dynamic(std::span<const size_t> __extents) { for(auto __ext : __extents) @@ -62,10 +71,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION class _StaticExtents { public: - static consteval bool - _S_is_dyn(size_t __ext) noexcept - { return __ext == dynamic_extent; } - static constexpr size_t _S_rank = _Extents.size(); // For __r in [0, _S_rank], _S_dynamic_index(__r) is the number @@ -85,7 +90,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (size_t __i = 0; __i < _S_rank; ++__i) { __ret[__i] = __dyn; - __dyn += _S_is_dyn(_Extents[__i]); + __dyn += (_Extents[__i] == dynamic_extent); } __ret[_S_rank] = __dyn; return __ret; @@ -103,7 +108,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { array<size_t, _S_rank_dynamic> __ret; for (size_t __i = 0, __r = 0; __i < _S_rank; ++__i) - if (_S_is_dyn(_Extents[__i])) + if (_Extents[__i] == dynamic_extent) __ret[__r++] = __i; return __ret; }(); @@ -148,6 +153,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION using _S_base::_S_dynamic_index_inv; using _S_base::_S_static_extent; + static constexpr bool + _S_is_dynamic(size_t __r) noexcept + { + if constexpr (__all_static(_Extents)) + return false; + else if constexpr (__all_dynamic(_Extents)) + return true; + else + return _Extents[__r] == dynamic_extent; + } + template<typename _OIndexType> static constexpr _IndexType _S_int_cast(const _OIndexType& __other) noexcept @@ -156,11 +172,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION constexpr _IndexType _M_extent(size_t __r) const noexcept { - auto __se = _Extents[__r]; - if (__se == dynamic_extent) + if (_S_is_dynamic(__r)) return _M_dyn_exts[_S_dynamic_index(__r)]; else - return __se; + return _S_static_extent(__r); } template<size_t _OtherRank, typename _GetOtherExtent> @@ -169,7 +184,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if constexpr (_OtherRank == _S_rank) for (size_t __i = 0; __i < _S_rank; ++__i) - if (_Extents[__i] != dynamic_extent + if (!_S_is_dynamic(__i) && !cmp_equal(_Extents[__i], _S_int_cast(__get_extent(__i)))) return false; return true; |