diff options
author | Luc Grosheintz <luc.grosheintz@gmail.com> | 2025-08-03 22:57:24 +0200 |
---|---|---|
committer | Tomasz Kamiński <tkaminsk@redhat.com> | 2025-08-21 10:16:22 +0200 |
commit | 3134742307f1133a449da9590e2294e9a7179226 (patch) | |
tree | 3d02a05a321caa798f5f3673c53eb7d54b8ebcc2 | |
parent | 997cd37809002840379488857291da9a8c650e3a (diff) | |
download | gcc-3134742307f1133a449da9590e2294e9a7179226.zip gcc-3134742307f1133a449da9590e2294e9a7179226.tar.gz gcc-3134742307f1133a449da9590e2294e9a7179226.tar.bz2 |
libstdc++: Precompute products of static extents.
Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.
The two partial products for r = 0, ..., n:
\prod_{i = 0}^r E[i] (fwd)
\prod_{i = r+1}^n E[i] (rev)
can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.
Three methods are directly affected by this optimization:
layout_left::mapping::stride
layout_right::mapping::stride
mdspan::size
We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.
Consider a generic case:
using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
{ return m.stride(r); }
The code generated prior to this commit:
4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8
4f7: 00
4f8: 48 83 c6 01 add rsi,0x1
4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff
503: ff ff
505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
50c: 00
50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523
522: 00
523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
528: 48 83 f8 38 cmp rax,0x38
52c: 74 72 je 5a0 <stride_right_E1+0xb0>
52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
538: b8 01 00 00 00 mov eax,0x1
53d: 0f 1f 00 nop DWORD PTR [rax]
540: 48 8b 0a mov rcx,QWORD PTR [rdx]
543: 49 89 c0 mov r8,rax
546: 4c 0f af c1 imul r8,rcx
54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
54e: 49 0f 45 c0 cmovne rax,r8
552: 48 83 c2 08 add rdx,0x8
556: 49 39 d1 cmp r9,rdx
559: 75 e5 jne 540 <stride_right_E1+0x50>
55b: 48 85 c0 test rax,rax
55e: 74 38 je 598 <stride_right_E1+0xa8>
560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
567: 00
568: 48 c1 e2 02 shl rdx,0x2
56c: 48 83 fa 10 cmp rdx,0x10
570: 74 1e je 590 <stride_right_E1+0xa0>
572: 48 8d 4f 10 lea rcx,[rdi+0x10]
576: 48 01 d7 add rdi,rdx
579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
583: 48 83 c7 04 add rdi,0x4
587: 48 0f af c2 imul rax,rdx
58b: 48 39 f9 cmp rcx,rdi
58e: 75 f0 jne 580 <stride_right_E1+0x90>
590: c3 ret
591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
598: c3 ret
599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
5a0: b8 01 00 00 00 mov eax,0x1
5a5: eb b9 jmp 560 <stride_right_E1+0x70>
5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
5ae: 00 00
which seems to be performing:
preparatory_work();
ret = 1
for(i = 0; i < rank; ++i)
tmp = ret * E[i]
if E[i] != -1
ret = tmp
for(i = 0; i < rank_dynamic; ++i)
ret *= D[i]
This commit reduces it down to:
270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
277: 00
278: 31 d2 xor edx,edx
27a: 48 85 c0 test rax,rax
27d: 74 33 je 2b2 <stride_right_E1+0x42>
27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
286: 00
287: 48 c1 e2 02 shl rdx,0x2
28b: 48 83 fa 10 cmp rdx,0x10
28f: 74 1f je 2b0 <stride_right_E1+0x40>
291: 48 8d 4f 10 lea rcx,[rdi+0x10]
295: 48 01 d7 add rdi,rdx
298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
29f: 00
2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
2a3: 48 83 c7 04 add rdi,0x4
2a7: 48 0f af c2 imul rax,rdx
2ab: 48 39 f9 cmp rcx,rdi
2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
2b0: 89 c2 mov edx,eax
2b2: 89 d0 mov eax,edx
2b4: c3 ret
Loosely speaking this does the following:
1. Load the starting position k in the array of dynamic extents; and
return if possible.
2. Load the partial product of static extents.
3. Computes the \prod_{i = k}^d D[i] where d is the number of
dynamic extents in a loop.
It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.
The analogous code is generated for layout_left.
Next, consider
using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
int size2(const std::mdspan<double, E2>& md)
{ return md.size(); }
on immediately preceding commit the generated code is
10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
17: 00
18: 49 89 f8 mov r8,rdi
1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
27: 00 00
29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
2e: ba 01 00 00 00 mov edx,0x1
33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
48: 00
49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
55: 00 00 00 00
59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
60: 48 8b 08 mov rcx,QWORD PTR [rax]
63: 48 89 d6 mov rsi,rdx
66: 48 0f af f1 imul rsi,rcx
6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
6e: 48 0f 45 d6 cmovne rdx,rsi
72: 48 83 c0 08 add rax,0x8
76: 48 39 c7 cmp rdi,rax
79: 75 e5 jne 60 <size2+0x50>
7b: 48 85 d2 test rdx,rdx
7e: 74 18 je 98 <size2+0x88>
80: 49 63 00 movsxd rax,DWORD PTR [r8]
83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
87: 48 0f af c1 imul rax,rcx
8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
90: 0f af c2 imul eax,edx
93: c3 ret
94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
98: 31 c0 xor eax,eax
9a: c3 ret
which is needlessly long. The current commit reduces it down to:
10: 48 63 07 movsxd rax,DWORD PTR [rdi]
13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
17: 48 0f af c2 imul rax,rdx
1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
1f: 69 c0 83 04 00 00 imul eax,eax,0x483
25: c3 ret
Which simply computes the product:
D[0] * D[1] * D[2] * const
where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.
The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.
libstdc++-v3/ChangeLog:
* include/std/mdspan (__mdspan::__static_prod): New function.
(__mdspan::__fwd_partial_prods): Constexpr array of partial
forward products.
(__mdspan::__fwd_partial_prods): Same for reverse partial
products.
(__mdspan::__static_extents_prod): Delete function.
(__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
include/std/mdspan (__mdspan::__fwd_prod): Compute as the
product of pre-computed static static and the product of dynamic
extents.
(__mdspan::__rev_prod): Ditto.
Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com>
Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
-rw-r--r-- | libstdc++-v3/include/std/mdspan | 77 |
1 files changed, 52 insertions, 25 deletions
diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan index 08e6b8f..dd53e60 100644 --- a/libstdc++-v3/include/std/mdspan +++ b/libstdc++-v3/include/std/mdspan @@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __static_extents() noexcept { return _Extents::_S_storage::_S_static_extents(); } + template<array _Extents> + consteval size_t + __static_prod(size_t __begin, size_t __end) + { + size_t __prod = 1; + for(size_t __i = __begin; __i < __end; ++__i) + { + auto __ext = _Extents[__i]; + __prod *= __ext == dynamic_extent ? size_t(1) : __ext; + } + return __prod; + } + + // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n (exclusive) + template<array _Extents> + constexpr auto __fwd_partial_prods = [] consteval + { + constexpr size_t __rank = _Extents.size(); + std::array<size_t, __rank + 1> __ret; + for(size_t __r = 0; __r < __rank + 1; ++__r) + __ret[__r] = __static_prod<_Extents>(0, __r); + return __ret; + }(); + + // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i] + template<array _Extents> + constexpr auto __rev_partial_prods = [] consteval + { + constexpr size_t __rank = _Extents.size(); + std::array<size_t, __rank> __ret; + for(size_t __r = 0; __r < __rank; ++__r) + __ret[__r] = __static_prod<_Extents>(__r + 1, __rank); + return __ret; + }(); + template<typename _Extents> constexpr span<const typename _Extents::index_type> __dynamic_extents(const _Extents& __exts, size_t __begin = 0, @@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - constexpr size_t - __static_extents_prod(const auto& __sta_exts) noexcept - { - size_t __ret = 1; - for (auto __factor : __sta_exts) - if (__factor != dynamic_extent) - __ret *= __factor; - return __ret; - } - template<typename _Extents> constexpr typename _Extents::index_type - __exts_prod(const _Extents& __exts, size_t __begin, size_t __end) noexcept + __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin, + size_t __end) noexcept { - using _IndexType = typename _Extents::index_type; - - size_t __ret = 1; - if constexpr (_Extents::rank_dynamic() != _Extents::rank()) - { - std::span<const size_t> __sta_exts = __static_extents<_Extents>(); - __ret = __static_extents_prod(__sta_exts.subspan(__begin, - __end - __begin)); - if (__ret == 0) - return 0; - } + if (__sta_prod == 0) + return 0; + size_t __ret = __sta_prod; if constexpr (_Extents::rank_dynamic() > 0) for (auto __factor : __dynamic_extents(__exts, __begin, __end)) __ret *= size_t(__factor); - return _IndexType(__ret); + return static_cast<typename _Extents::index_type>(__ret); } template<typename _Extents> constexpr typename _Extents::index_type __fwd_prod(const _Extents& __exts, size_t __r) noexcept - { return __exts_prod(__exts, 0, __r); } + { + constexpr auto& __sta_exts = __static_extents<_Extents>(); + size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r]; + return __extents_prod(__exts, __sta_prod, 0, __r); + } template<typename _Extents> constexpr typename _Extents::index_type __rev_prod(const _Extents& __exts, size_t __r) noexcept - { return __exts_prod(__exts, __r + 1, __exts.rank()); } + { + constexpr auto& __sta_exts = __static_extents<_Extents>(); + constexpr size_t __rank = _Extents::rank(); + size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r]; + return __extents_prod(__exts, __sta_prod, __r + 1, __rank); + } template<typename _Extents> constexpr typename _Extents::index_type |