aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jonathan Wright <jonathan.wright@arm.com> 2021-02-10 11:39:39 +0000
committer: Jonathan Wright <jonathan.wright@arm.com> 2021-04-28 21:11:58 +0100
commit: 6372b05e5b14f27ddce11c28654956c1ad715dac (patch)
tree: d7df4b289a8c77d137f3162c1ee4a9258a6ac6b1
parent: 8e7f6e03955244827a513777e4845c98e130319d (diff)
download: gcc-6372b05e5b14f27ddce11c28654956c1ad715dac.zip
gcc-6372b05e5b14f27ddce11c28654956c1ad715dac.tar.gz
gcc-6372b05e5b14f27ddce11c28654956c1ad715dac.tar.bz2
aarch64: Use RTL builtins for polynomial vsli[q]_n intrinsics
Rewrite vsli[q]_n_p* Neon intrinsics to use RTL builtins rather than
inline assembly code, allowing for better scheduling and optimization.

gcc/ChangeLog:

2021-02-10  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Use VALLP mode
	iterator for polynomial ssli_n builtin generator macro.
	* config/aarch64/arm_neon.h (vsli_n_p8): Use RTL builtin
	instead of inline asm.
	(vsli_n_p16): Likewise.
	(vsliq_n_p8): Likewise.
	(vsliq_n_p16): Likewise.
	* config/aarch64/iterators.md: Define VALLP mode iterator.
-rw-r--r-- gcc/config/aarch64/aarch64-simd-builtins.def 2
-rw-r--r-- gcc/config/aarch64/arm_neon.h 72
-rw-r--r-- gcc/config/aarch64/iterators.md 3
3 files changed, 28 insertions, 49 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 202f690..5349791 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -436,7 +436,7 @@
BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE)
BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE)
BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE)
- VAR2 (SHIFTINSERTP, ssli_n, 0, NONE, di, v2di)
+ BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE)
BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, NONE)
/* Implemented by aarch64_<sur>qshl<u>_n<mode>. */
BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, NONE)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 164c76d..38a3a3f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9050,57 +9050,33 @@ vshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
__builtin_aarch64_shrn2v2di ((int32x2_t) __a, (int64x2_t) __b, __c);
}
-#define vsli_n_p8(a, b, c) \
- __extension__ \
- ({ \
- poly8x8_t b_ = (b); \
- poly8x8_t a_ = (a); \
- poly8x8_t result; \
- __asm__ ("sli %0.8b,%2.8b,%3" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+ return __builtin_aarch64_ssli_nv8qi_ppps (__a, __b, __c);
+}
-#define vsli_n_p16(a, b, c) \
- __extension__ \
- ({ \
- poly16x4_t b_ = (b); \
- poly16x4_t a_ = (a); \
- poly16x4_t result; \
- __asm__ ("sli %0.4h,%2.4h,%3" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+ return __builtin_aarch64_ssli_nv4hi_ppps (__a, __b, __c);
+}
-#define vsliq_n_p8(a, b, c) \
- __extension__ \
- ({ \
- poly8x16_t b_ = (b); \
- poly8x16_t a_ = (a); \
- poly8x16_t result; \
- __asm__ ("sli %0.16b,%2.16b,%3" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+ return __builtin_aarch64_ssli_nv16qi_ppps (__a, __b, __c);
+}
-#define vsliq_n_p16(a, b, c) \
- __extension__ \
- ({ \
- poly16x8_t b_ = (b); \
- poly16x8_t a_ = (a); \
- poly16x8_t result; \
- __asm__ ("sli %0.8h,%2.8h,%3" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c);
+}
#define vsri_n_p8(a, b, c) \
__extension__ \
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 8a765ea..fe2c51c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -203,6 +203,9 @@
(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
+;; All Advanced SIMD polynomial modes and DI.
+(define_mode_iterator VALLP [V8QI V16QI V4HI V8HI V2DI DI])
+
;; Advanced SIMD modes for Integer reduction across lanes.
(define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI])