author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-01-13 12:48:57 +0000
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-01-14 08:36:19 +0000
commit     48f8d1d48f2c7c2bc724dee979bcf56957f233cb (patch)
tree       d1996f8ec847cae706cacb82558ed745f9f8b713 /gcc
parent     52cd1cd1b67b10a6d58612bafaded6e8e3a303a1 (diff)
aarch64: Reimplement vmovn/vmovl intrinsics with builtins instead of __builtin_convertvector
Turns out __builtin_convertvector is not as good a fit for the widening
and narrowing intrinsics as I had hoped.  During the veclower phase we
lower most of it to bitfield operations and hope that DCE cleans it back
up into vector pack/unpack and extend operations.  I received reports
that in more complex cases GCC fails to do that, and we're left with
many vector extract operations that clutter the output.  I think
veclower can be improved on that front, but for GCC 10 I'd like to just
implement these intrinsics with good old RTL builtins rather than
inline asm.
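To make the effect concrete, here is a small illustrative example (not
part of the commit; the function name is made up).  With the intrinsics
backed by RTL builtins, each call should expand to a single instruction:

#include <arm_neon.h>

/* Illustrative sketch only.  With vmovl/vmovn implemented as RTL
   builtins, each intrinsic maps to one instruction rather than to
   per-lane bitfield operations that DCE may fail to clean up.  */
int8x8_t
round_trip (int8x8_t __x)
{
  int16x8_t __wide = vmovl_s8 (__x); /* expected: sxtl v0.8h, v0.8b */
  return vmovn_s16 (__wide);         /* expected: xtn  v0.8b, v0.8h */
}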
gcc/
* config/aarch64/aarch64-simd.md (aarch64_<su>xtl<mode>): Define.
(aarch64_xtn<mode>): Likewise.
* config/aarch64/aarch64-simd-builtins.def (sxtl, uxtl, xtn): Define
builtins.
* config/aarch64/arm_neon.h (vmovl_s8): Reimplement using builtin.
(vmovl_s16): Likewise.
(vmovl_s32): Likewise.
(vmovl_u8): Likewise.
(vmovl_u16): Likewise.
(vmovl_u32): Likewise.
(vmovn_s16): Likewise.
(vmovn_s32): Likewise.
(vmovn_s64): Likewise.
(vmovn_u16): Likewise.
(vmovn_u32): Likewise.
(vmovn_u64): Likewise.
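For illustration (not from the commit): each builtins.def entry above
produces a GCC-internal builtin named after the base name and the mode,
and the UNOPU qualifier on uxtl gives the unsigned variants a _uu
suffix, as seen in the arm_neon.h changes below.  A hedged sketch of
the mapping:

#include <arm_neon.h>

/* Sketch only: these internal builtins are what the intrinsics wrap
   after this patch; user code should keep calling vmovl_s8, vmovl_u8
   and friends rather than the builtins directly.  */
int16x8_t
widen_signed (int8x8_t __a)
{
  return __builtin_aarch64_sxtlv8hi (__a);    /* same as vmovl_s8 */
}

uint16x8_t
widen_unsigned (uint8x8_t __a)
{
  return __builtin_aarch64_uxtlv8hi_uu (__a); /* same as vmovl_u8 */
}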
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-simd-builtins.def |  7 +++++++
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md           | 14 ++++++++++++++
-rw-r--r--  gcc/config/aarch64/arm_neon.h                | 24 ++++++++++++------------
3 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 27e9026..f56e59c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -171,6 +171,13 @@
   BUILTIN_VQN (TERNOP, raddhn2, 0, NONE)
   BUILTIN_VQN (TERNOP, rsubhn2, 0, NONE)
 
+  /* Implemented by aarch64_<su>xtl<mode>.  */
+  BUILTIN_VQN (UNOP, sxtl, 0, NONE)
+  BUILTIN_VQN (UNOPU, uxtl, 0, NONE)
+
+  /* Implemented by aarch64_xtn<mode>.  */
+  BUILTIN_VQN (UNOP, xtn, 0, NONE)
+
   BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, ALL)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
   BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, ALL)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0827f0e..4b869de 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7301,6 +7301,20 @@
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
+(define_expand "aarch64_<su>xtl<mode>"
+  [(set (match_operand:VQN 0 "register_operand" "=w")
+	(ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  ""
+)
+
+(define_expand "aarch64_xtn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ> (match_operand:VQN 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  ""
+)
+
 ;; Truncate a 128-bit integer vector to a 64-bit vector.
 (define_insn "trunc<mode><Vnarrowq>2"
   [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6095c0d..46331ae 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8709,42 +8709,42 @@
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_s8 (int8x8_t __a)
 {
-  return __builtin_convertvector (__a, int16x8_t);
+  return __builtin_aarch64_sxtlv8hi (__a);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_s16 (int16x4_t __a)
 {
-  return __builtin_convertvector (__a, int32x4_t);
+  return __builtin_aarch64_sxtlv4si (__a);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_s32 (int32x2_t __a)
 {
-  return __builtin_convertvector (__a, int64x2_t);
+  return __builtin_aarch64_sxtlv2di (__a);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_u8 (uint8x8_t __a)
 {
-  return __builtin_convertvector (__a, uint16x8_t);
+  return __builtin_aarch64_uxtlv8hi_uu (__a);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_u16 (uint16x4_t __a)
 {
-  return __builtin_convertvector (__a, uint32x4_t);
+  return __builtin_aarch64_uxtlv4si_uu (__a);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovl_u32 (uint32x2_t __a)
 {
-  return __builtin_convertvector (__a, uint64x2_t);
+  return __builtin_aarch64_uxtlv2di_uu (__a);
 }
 
 __extension__ extern __inline int8x16_t
@@ -8796,42 +8796,42 @@
 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_s16 (int16x8_t __a)
 {
-  return __builtin_convertvector (__a, int8x8_t);
+  return __builtin_aarch64_xtnv8hi (__a);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_s32 (int32x4_t __a)
 {
-  return __builtin_convertvector (__a, int16x4_t);
+  return __builtin_aarch64_xtnv4si (__a);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_s64 (int64x2_t __a)
 {
-  return __builtin_convertvector (__a, int32x2_t);
+  return __builtin_aarch64_xtnv2di (__a);
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_u16 (uint16x8_t __a)
 {
-  return __builtin_convertvector (__a, uint8x8_t);
+  return (uint8x8_t) __builtin_aarch64_xtnv8hi ((int16x8_t) __a);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_u32 (uint32x4_t __a)
 {
-  return __builtin_convertvector (__a, uint16x4_t);
+  return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t) __a);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_u64 (uint64x2_t __a)
 {
-  return __builtin_convertvector (__a, uint32x2_t);
+  return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a);
 }
 
 #define vmull_high_lane_s16(a, b, c) \
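A note on the unsigned narrowing variants in the hunk above: xtn is
only registered for the signed modes (BUILTIN_VQN (UNOP, xtn, 0, NONE)),
so vmovn_u16/vmovn_u32/vmovn_u64 cast their argument to the
corresponding signed vector type, call the same builtin, and cast the
result back.  Truncation keeps the low bits of each lane regardless of
signedness, so this is bit-exact.  A minimal sketch of the pattern
(helper name is made up):

#include <arm_neon.h>

/* Sketch of the cast-through-signed pattern used by vmovn_u16 above.
   Lane truncation is sign-agnostic, so reusing the signed xtn builtin
   on reinterpreted operands yields the same bits.  */
uint8x8_t
narrow_u16 (uint16x8_t __a)
{
  return (uint8x8_t) __builtin_aarch64_xtnv8hi ((int16x8_t) __a);
}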