diff options
author | Ricardo Jesus <rjj@nvidia.com> | 2024-10-14 14:28:02 +0100 |
---|---|---|
committer | Jonathan Wakely <redi@gcc.gnu.org> | 2024-10-24 15:01:23 +0100 |
commit | cc33531df065c20a52cb0c35637f096978128b46 (patch) | |
tree | 68315f262046581e145f78edc258275e6ae98e27 | |
parent | b02503d3fe79778ccc149b0abbc736f7972603da (diff) | |
download | gcc-cc33531df065c20a52cb0c35637f096978128b46.zip gcc-cc33531df065c20a52cb0c35637f096978128b46.tar.gz gcc-cc33531df065c20a52cb0c35637f096978128b46.tar.bz2 |
aarch64: libstdc++: Use shufflevector instead of shuffle in opt_random.h
This patch modifies the implementation of the vectorized Mersenne
Twister random number generator to use __builtin_shufflevector instead
of __builtin_shuffle. This makes it (almost) compatible with Clang.
To make the implementation fully compatible with Clang, Clang will need
to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
it currently does not. This looks like an oversight in Clang and so will
be addressed separately.
I see no codegen change with this patch.
Bootstrapped and tested on aarch64-none-linux-gnu.
libstdc++-v3/ChangeLog:
* config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
of __builtin_shuffle with __builtin_shufflevector.
(__aarch64_lsl_128): Move shift amount to a template parameter.
(__aarch64_lsr_128): Move shift amount to a template parameter.
(__aarch64_recursion): Update call sites of __aarch64_lsl_128
and __aarch64_lsr_128.
Signed-off-by: Ricardo Jesus <rjj@nvidia.com>
-rw-r--r-- | libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h | 28 |
1 file changed, 16 insertions, 12 deletions
diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
index 7f756d1..7eb816a 100644
--- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
+++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
@@ -35,13 +35,13 @@
 #ifdef __ARM_NEON
 
 #ifdef __ARM_BIG_ENDIAN
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
-    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
-     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
+    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
+    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
 #else
-# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
-    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
-     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
+# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
+    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
+    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
 #endif
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   namespace {
     // Logical Shift right 128-bits by c * 8 bits
 
-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsr_128 (__Uint8x16_t __a)
     {
       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0};
@@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
     // Logical Shift left 128-bits by c * 8 bits
 
-    __extension__ extern __inline __Uint32x4_t
+    __extension__
+    template<int __c>
+    extern __inline __Uint32x4_t
     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
+    __aarch64_lsl_128 (__Uint8x16_t __a)
     {
       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0};
@@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                                                __Uint32x4_t __e)
     {
       __Uint32x4_t __y = (__b >> __sr1);
-      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
+      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
 
       __Uint32x4_t __v = __d << __sl1;
 
       __z = __z ^ __a;
       __z = __z ^ __v;
 
-      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
+      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
 
       __y = __y & __e;
       __z = __z ^ __x;