aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuc Grosheintz <luc.grosheintz@gmail.com>2025-08-03 22:57:25 +0200
committerTomasz Kamiński <tkaminsk@redhat.com>2025-08-21 10:17:23 +0200
commitdb563993b69bab00e40ebed567aa7422d77ed016 (patch)
tree1161dd0fdc04c967ba22494f7e3e3c01326850fb
parent3134742307f1133a449da9590e2294e9a7179226 (diff)
downloadgcc-db563993b69bab00e40ebed567aa7422d77ed016.zip
gcc-db563993b69bab00e40ebed567aa7422d77ed016.tar.gz
gcc-db563993b69bab00e40ebed567aa7422d77ed016.tar.bz2
libstdc++: Improve low-rank layout_{left,right}::stride.
The methods layout_{left,right}::mapping::stride are defined as \prod_{i = 0}^r E[i] \prod_{i = r+1}^n E[i] This is computed as the product of a precomputed static product and the product of the required dynamic extents. Disassembly shows that even for low-rank extents, i.e. rank == 1 and rank == 2, with at least one dynamic extent, the generated code loads two values; and then runs the loop over at most one element, e.g. for stride_left_d5 defined below the generated code is: 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 227: 00 228: 31 d2 xor edx,edx 22a: 48 85 c0 test rax,rax 22d: 74 23 je 252 <stride_left_d5+0x32> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0] 236: 00 237: 48 c1 e1 02 shl rcx,0x2 23b: 74 13 je 250 <stride_left_d5+0x30> 23d: 48 01 f9 add rcx,rdi 240: 48 63 17 movsxd rdx,DWORD PTR [rdi] 243: 48 83 c7 04 add rdi,0x4 247: 48 0f af c2 imul rax,rdx 24b: 48 39 f9 cmp rcx,rdi 24e: 75 f0 jne 240 <stride_left_d5+0x20> 250: 89 c2 mov edx,eax 252: 89 d0 mov eax,edx 254: c3 ret If there's no dynamic extents, it simply loads the precomputed product of static extents. For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or extents.extent(k), with k == 0 for layout_left and k == 1 for layout_right. Consider, using Ed = std::extents<int, dyn>; int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r) { return m.stride(r); } using E3d = std::extents<int, 3, dyn>; int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r) { return m.stride(r); } using Ed5 = std::extents<int, dyn, 5>; int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r) { return m.stride(r); } The optimized code for these three cases is: 0000000000000060 <stride_left_d>: 60: b8 01 00 00 00 mov eax,0x1 65: c3 ret 0000000000000090 <stride_left_3d>: 90: 48 83 fe 01 cmp rsi,0x1 94: 19 c0 sbb eax,eax 96: 83 e0 fe and eax,0xfffffffe 99: 83 c0 03 add eax,0x3 9c: c3 ret 00000000000000a0 <stride_left_d5>: a0: b8 01 00 00 00 mov eax,0x1 a5: 48 85 f6 test rsi,rsi a8: 74 02 je ac <stride_left_d5+0xc> aa: 8b 07 mov eax,DWORD PTR [rdi] ac: c3 ret For rank == 1 it simply returns 1 (as expected). For rank == 2, it either implements a branchless formula, or conditionally loads one value. In all cases involving a dynamic extent this seems like it's always doing clearly less work, both in terms of computation and loads. In cases not involving a dynamic extent, it replaces loading one value with a branchless sequence of four instructions. This commit also refactors __size to no use any of the precomputed arrays. This prevents instantiating __{fwd,rev}_partial_prods for low-rank extents. This results in a further size reduction of a reference object file (described two commits prior) by 9% from 46.0kB to 41.9kB. In a prior commit we optimized __size to produce better object code by precomputing the static products. This refactor enables the optimizer to generate the same optimized code. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__fwd_prod): Optimize for rank <= 2. (__mdspan::__rev_prod): Ditto. (__mdspan::__size): Refactor to use a pre-computed product, not a partial product. Reviewed-by: Tomasz Kamiński <tkaminsk@redhat.com> Signed-off-by: Luc Grosheintz <luc.grosheintz@gmail.com>
-rw-r--r--libstdc++-v3/include/std/mdspan34
1 files changed, 28 insertions, 6 deletions
diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index dd53e60..3cbf922 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -419,29 +419,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return static_cast<typename _Extents::index_type>(__ret);
}
+ // Preconditions: _r < _Extents::rank()
template<typename _Extents>
constexpr typename _Extents::index_type
__fwd_prod(const _Extents& __exts, size_t __r) noexcept
{
+ constexpr size_t __rank = _Extents::rank();
constexpr auto& __sta_exts = __static_extents<_Extents>();
- size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
- return __extents_prod(__exts, __sta_prod, 0, __r);
+ if constexpr (__rank == 1)
+ return 1;
+ else if constexpr (__rank == 2)
+ return __r == 0 ? 1 : __exts.extent(0);
+ else
+ {
+ size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
+ return __extents_prod(__exts, __sta_prod, 0, __r);
+ }
}
+ // Preconditions: _r < _Extents::rank()
template<typename _Extents>
constexpr typename _Extents::index_type
__rev_prod(const _Extents& __exts, size_t __r) noexcept
{
- constexpr auto& __sta_exts = __static_extents<_Extents>();
constexpr size_t __rank = _Extents::rank();
- size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
- return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+ constexpr auto& __sta_exts = __static_extents<_Extents>();
+ if constexpr (__rank == 1)
+ return 1;
+ else if constexpr (__rank == 2)
+ return __r == 0 ? __exts.extent(1) : 1;
+ else
+ {
+ size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
+ return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+ }
}
template<typename _Extents>
constexpr typename _Extents::index_type
__size(const _Extents& __exts) noexcept
- { return __fwd_prod(__exts, __exts.rank()); }
+ {
+ constexpr auto& __sta_exts = __static_extents<_Extents>();
+ constexpr size_t __rank = _Extents::rank();
+ constexpr size_t __sta_prod = __static_prod<__sta_exts>(0, __rank);
+ return __extents_prod(__exts, __sta_prod, 0, __rank);
+ }
template<typename _IndexType, size_t... _Counts>
auto __build_dextents_type(integer_sequence<size_t, _Counts...>)