diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-02-12 12:13:27 +0000 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2021-04-28 21:12:46 +0100 |
commit | ffb112289452f58fbf00a4e57c0d7de930aca6b1 (patch) | |
tree | 6d304c794924e04c633ffdc0415c4ca10c43d002 /gcc | |
parent | 1d66367a71ef969235e10c77685f5ca4551bf519 (diff) | |
download | gcc-ffb112289452f58fbf00a4e57c0d7de930aca6b1.zip gcc-ffb112289452f58fbf00a4e57c0d7de930aca6b1.tar.gz gcc-ffb112289452f58fbf00a4e57c0d7de930aca6b1.tar.bz2 |
aarch64: Use RTL builtins for v[q]tbl intrinsics
Rewrite v[q]tbl Neon intrinsics to use RTL builtins rather than
inline assembly code, allowing for better scheduling and
optimization.
gcc/ChangeLog:
2021-02-12 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/aarch64-simd-builtins.def: Add tbl1 builtin
generator macros.
* config/aarch64/arm_neon.h (vqtbl1_p8): Use RTL builtin
instead of inline asm.
(vqtbl1_s8): Likewise.
(vqtbl1_u8): Likewise.
(vqtbl1q_p8): Likewise.
(vqtbl1q_s8): Likewise.
(vqtbl1q_u8): Likewise.
(vtbl1_s8): Likewise.
(vtbl1_u8): Likewise.
(vtbl1_p8): Likewise.
(vtbl2_s8): Likewise.
(vtbl2_u8): Likewise.
(vtbl2_p8): Likewise.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-simd-builtins.def | 4 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 109 |
2 files changed, 32 insertions, 81 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 86614e7..04b392b 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -696,6 +696,10 @@ VAR1 (BINOP, tbl3, 0, NONE, v8qi) VAR1 (BINOP, tbl3, 0, NONE, v16qi) + /* Implemented by aarch64_tbl1<mode>. */ + VAR2 (BINOP, tbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOPU, tbl1, 0, NONE, v8qi, v16qi) + /* Implemented by aarch64_qtbl3<mode>. */ VAR1 (BINOP, qtbl3, 0, NONE, v8qi) VAR1 (BINOP, qtbl3, 0, NONE, v16qi) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 3536052..0817129 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9579,74 +9579,46 @@ vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_p8 (poly8x16_t __a, uint8x8_t __b) +vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx) { - poly8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __tab, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_s8 (int8x16_t __a, uint8x8_t __b) +vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx) { - int8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi (__tab, (int8x8_t) __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_u8 (uint8x16_t __a, uint8x8_t __b) +vqtbl1_u8 (uint8x16_t __tab, uint8x8_t __idx) { - uint8x8_t __result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi_uuu (__tab, __idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_p8 (poly8x16_t __a, uint8x16_t __b) +vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx) { - poly8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return (poly8x16_t) __builtin_aarch64_tbl1v16qi ((int8x16_t) __tab, + (int8x16_t) __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_s8 (int8x16_t __a, uint8x16_t __b) +vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx) { - int8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v16qi (__tab, (int8x16_t) __idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_u8 (uint8x16_t __a, uint8x16_t __b) +vqtbl1q_u8 (uint8x16_t __tab, uint8x16_t __idx) { - uint8x16_t __result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v16qi_uuu (__tab, __idx); } __extension__ extern __inline int8x8_t @@ -9727,78 +9699,53 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) { - int8x8_t __result; - int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + int8x16_t __temp = vcombine_s8 (__tab, + vcreate_s8 (__AARCH64_UINT64_C (0x0))); + return __builtin_aarch64_tbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) { - uint8x8_t __result; - uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + uint8x16_t __temp = vcombine_u8 (__tab, + vcreate_u8 (__AARCH64_UINT64_C (0x0))); + return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { - poly8x8_t __result; - poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + poly8x16_t __temp = vcombine_p8 (__tab, + vcreate_p8 (__AARCH64_UINT64_C (0x0))); + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t __result; int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi (__temp, __idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t __result; uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return __builtin_aarch64_tbl1v8qi_uuu (__temp, __idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t __result; poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(__result) - : "w"(__temp), "w"(__idx) - : /* No clobbers */); - return __result; + return (poly8x8_t) __builtin_aarch64_tbl1v8qi ((int8x16_t) __temp, + (int8x8_t) __idx); } __extension__ extern __inline int8x8_t |