author     Jonathan Wright <jonathan.wright@arm.com>  2021-02-08 16:50:30 +0000
committer  Jonathan Wright <jonathan.wright@arm.com>  2021-04-28 21:10:58 +0100
commit     eb2b36024c94bc32465777927092cdbdf2d95204 (patch)
tree       99b0886667605aa8eb6819764ae4a8e05ce96167
parent     a53b8229e64c78256449005929e599b2eab83fbd (diff)
aarch64: Use RTL builtins for vpaddq intrinsics
Rewrite vpaddq Neon intrinsics to use RTL builtins rather than inline
assembly code, allowing for better scheduling and optimization.

gcc/ChangeLog:

2021-02-08  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Use VDQ_I iterator
	for aarch64_addp<mode> builtin macro generator.
	* config/aarch64/aarch64-simd.md: Use VDQ_I iterator in
	aarch64_addp<mode> RTL pattern.
	* config/aarch64/arm_neon.h (vpaddq_s8): Use RTL builtin instead
	of inline asm.
	(vpaddq_s16): Likewise.
	(vpaddq_s32): Likewise.
	(vpaddq_s64): Likewise.
	(vpaddq_u8): Likewise.
	(vpaddq_u16): Likewise.
	(vpaddq_u32): Likewise.
	(vpaddq_u64): Likewise.
-rw-r--r--  gcc/config/aarch64/aarch64-simd-builtins.def   2
-rw-r--r--  gcc/config/aarch64/aarch64-simd.md             8
-rw-r--r--  gcc/config/aarch64/arm_neon.h                 60
3 files changed, 17 insertions(+), 53 deletions(-)
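For context (not part of the commit): vpaddq performs a pairwise add across the concatenation of its two operands, and the rewrite from inline asm to the __builtin_aarch64_addp* builtins must preserve exactly those semantics while letting the compiler schedule and optimize the instruction like any other RTL operation. The sketch below exercises vpaddq_s32 from arm_neon.h; the file name and build command are illustrative assumptions, not taken from the GCC testsuite.

/* test_vpaddq.c - illustrative check of vpaddq_s32 semantics (assumed
   standalone example, not part of this commit or the GCC testsuite).
   Build, e.g.: aarch64-linux-gnu-gcc -O2 test_vpaddq.c -o test_vpaddq  */
#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  int32x4_t a = { 1, 2, 3, 4 };
  int32x4_t b = { 10, 20, 30, 40 };

  /* ADDP adds adjacent pairs across the concatenation of the operands,
     so the result is { a0+a1, a2+a3, b0+b1, b2+b3 } = { 3, 7, 30, 70 }.  */
  int32x4_t r = vpaddq_s32 (a, b);

  int32_t out[4];
  vst1q_s32 (out, r);
  printf ("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}

Whether the intrinsic expands to the old asm block or the new builtin, the printed result should be identical; the difference is that the builtin form is visible to the RTL optimizers and scheduler.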
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f79e716..92804e0 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -48,7 +48,7 @@
BUILTIN_VB (BINOP, pmul, 0, NONE)
BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
- BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
+ BUILTIN_VDQ_I (BINOP, addp, 0, NONE)
VAR1 (UNOP, addp, 0, NONE, di)
BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, NONE)
BUILTIN_VDQ_BHSI (UNOP, clz, 2, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5245cf0..60e11c6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6004,10 +6004,10 @@
;; addp
(define_insn "aarch64_addp<mode>"
- [(set (match_operand:VD_BHSI 0 "register_operand" "=w")
- (unspec:VD_BHSI
- [(match_operand:VD_BHSI 1 "register_operand" "w")
- (match_operand:VD_BHSI 2 "register_operand" "w")]
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+ (unspec:VDQ_I
+ [(match_operand:VDQ_I 1 "register_operand" "w")
+ (match_operand:VDQ_I 2 "register_operand" "w")]
UNSPEC_ADDP))]
"TARGET_SIMD"
"addp\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 5fb2b3d..52f3714 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8673,96 +8673,60 @@ __extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_s8 (int8x16_t __a, int8x16_t __b)
{
- int8x16_t __result;
- __asm__ ("addp %0.16b,%1.16b,%2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_addpv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_s16 (int16x8_t __a, int16x8_t __b)
{
- int16x8_t __result;
- __asm__ ("addp %0.8h,%1.8h,%2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_addpv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_s32 (int32x4_t __a, int32x4_t __b)
{
- int32x4_t __result;
- __asm__ ("addp %0.4s,%1.4s,%2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_addpv4si (__a, __b);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_s64 (int64x2_t __a, int64x2_t __b)
{
- int64x2_t __result;
- __asm__ ("addp %0.2d,%1.2d,%2.2d"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_addpv2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- uint8x16_t __result;
- __asm__ ("addp %0.16b,%1.16b,%2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint8x16_t) __builtin_aarch64_addpv16qi ((int8x16_t) __a,
+ (int8x16_t) __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint16x8_t __result;
- __asm__ ("addp %0.8h,%1.8h,%2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint16x8_t) __builtin_aarch64_addpv8hi ((int16x8_t) __a,
+ (int16x8_t) __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- uint32x4_t __result;
- __asm__ ("addp %0.4s,%1.4s,%2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint32x4_t) __builtin_aarch64_addpv4si ((int32x4_t) __a,
+ (int32x4_t) __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddq_u64 (uint64x2_t __a, uint64x2_t __b)
{
- uint64x2_t __result;
- __asm__ ("addp %0.2d,%1.2d,%2.2d"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint64x2_t) __builtin_aarch64_addpv2di ((int64x2_t) __a,
+ (int64x2_t) __b);
}
__extension__ extern __inline int16x4_t