author | yaozhongxiao <yaozhongxiao@linux.alibaba.com> | 2021-02-03 15:49:30 +0000 |
---|---|---|
committer | Jonathan Wakely <jwakely@redhat.com> | 2021-02-03 15:49:30 +0000 |
commit | 598876574184e745defee4b36dc2408068b7a22e | |
tree | ec9e5db823dea825e16f10418c11a0bd8c697d43 /libstdc++-v3/include/experimental | |
parent | 3de9bd16c91c5fc050961db6887880b303b3a630 | |
libstdc++: Improve "find_first/last_set" for NEON
The find_first_set and find_last_set methods are not optimal for NEON:
they should be synthesized with a horizontal add (vaddv) instead, which
reduces the generated assembly. In the following case, vaddvq_s16
generates 2 instructions where the chain of vpaddq_s16 calls generates
4:
# vaddvq_s16
vaddvq_s16(__asint);
// addv h0, v1.8h
// smov w1, v0.h[0]
# vpaddq_s16
vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0]
// addp v1.8h,v1.8h,v2.8h
// addp v1.8h,v1.8h,v2.8h
// addp v1.8h,v1.8h,v2.8h
// smov w1, v1.h[0]
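To make the idiom concrete, here is a minimal standalone sketch of the
bitmask extraction this patch optimizes (the function name
mask_to_bits16 and its exact shape are illustrative, not libstdc++
internals): each lane of a NEON comparison mask is all-ones or
all-zero, so AND-ing with per-lane bit weights and summing the lanes
horizontally yields an integer bitmask.

// A hedged sketch, not libstdc++'s actual code: reduce an 8-lane
// comparison mask to an 8-bit integer bitmask.
#include <arm_neon.h>
#include <cstdint>

static inline std::uint16_t
mask_to_bits16(uint16x8_t __mask) // each lane is 0 or 0xffff
{
  // Per-lane bit weights: lane i contributes bit i when selected.
  const int16x8_t __bitsel = {1, 2, 4, 8, 16, 32, 64, 128};
  int16x8_t __asint = vandq_s16(vreinterpretq_s16_u16(__mask), __bitsel);
#ifdef __aarch64__
  // One horizontal add: compiles to addv + smov (2 instructions).
  return vaddvq_s16(__asint);
#else
  // AArch32 lacks vaddv; fall back to the pairwise-add (vpadd) chain.
  const int16x4_t __zero = vdup_n_s16(0);
  int16x4_t __sum = vpadd_s16(vget_low_s16(__asint), vget_high_s16(__asint));
  __sum = vpadd_s16(__sum, __zero);
  __sum = vpadd_s16(__sum, __zero);
  return vget_lane_s16(__sum, 0);
#endif
}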
libstdc++-v3/ChangeLog:
* include/experimental/bits/simd_neon.h: Replace repeated vpadd
calls with a single vaddv for aarch64.
Diffstat (limited to 'libstdc++-v3/include/experimental')
-rw-r--r-- | libstdc++-v3/include/experimental/bits/simd_neon.h | 17 ++++++++++++++--- |
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h
index 8bb2116..7f472e8 100644
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@@ -311,8 +311,7 @@ struct _MaskImplNeonMixin
 	  });
 	__asint &= __bitsel;
 #ifdef __aarch64__
-	return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
-			  __zero)[0];
+	return vaddvq_s16(__asint);
 #else
 	return vpadd_s16(
 		 vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
@@ -328,7 +327,7 @@ struct _MaskImplNeonMixin
 	  });
 	__asint &= __bitsel;
 #ifdef __aarch64__
-	return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
+	return vaddvq_s32(__asint);
 #else
 	return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
 			 __zero)[0];
@@ -351,8 +350,12 @@ struct _MaskImplNeonMixin
 		return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 	      });
 	    __asint &= __bitsel;
+#ifdef __aarch64__
+	    return vaddv_s8(__asint);
+#else
 	    return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
 			    __zero)[0];
+#endif
 	  }
 	else if constexpr (sizeof(_Tp) == 2)
 	  {
@@ -362,12 +365,20 @@ struct _MaskImplNeonMixin
 		return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 	      });
 	    __asint &= __bitsel;
+#ifdef __aarch64__
+	    return vaddv_s16(__asint);
+#else
 	    return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
+#endif
 	  }
 	else if constexpr (sizeof(_Tp) == 4)
 	  {
 	    __asint &= __make_vector<_I>(0x1, 0x2);
+#ifdef __aarch64__
+	    return vaddv_s32(__asint);
+#else
 	    return vpadd_s32(__asint, __zero)[0];
+#endif
 	  }
 	else
 	  __assert_unreachable<_Tp>();
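For context on why a compact integer bitmask matters here: once the
lane mask has been reduced to an integer, find_first_set and
find_last_set become single bit-scan operations. A hedged sketch
(helper names are illustrative, not libstdc++ internals):

#include <cstdint>

// Assumes __bits is the nonzero lane bitmask produced as above.
inline int find_first_set_from_bits(std::uint32_t __bits)
{ return __builtin_ctz(__bits); } // index of the lowest set bit

inline int find_last_set_from_bits(std::uint32_t __bits)
{ return 31 - __builtin_clz(__bits); } // index of the highest set bit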