author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>   2021-02-05 08:14:07 +0000
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>   2021-02-05 08:14:07 +0000
commit     b6e7a7498732b83df61443c211b8d69454ad0b22 (patch)
tree       59812c9e666a19680aef716fccaff066146bff71 /gcc
parent     072f20c555907cce38a424da47b6c1baa8330169 (diff)
aarch64: Reimplement vget_low* intrinsics
We can do better on the vget_low* intrinsics.
Currently they reinterpret their argument into a V2DI vector and extract the low "lane",
reinterpreting that back into the shorter vector.
This is functionally correct and generates a sequence of subregs and a vec_select that, by itself,
gets optimised away eventually.
However it's suboptimal when we want to use the result in other SIMD operations.
Then the subreg-vec_select-subreg combo blocks many combine patterns.
This patch reimplements them to emit a proper low vec_select from the start.
It generates much cleaner RTL and allows for more aggressive combinations, particularly
with the patterns that Jonathan has been pushing lately.
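As a rough illustration (an assumed example, not taken from the patch or its testsuite), consider a caller that feeds the low halves straight into a widening multiply. With the new vec_select-based expansion the whole expression is free to combine into a single widening-multiply instruction instead of being fenced off by the subreg sequence:

    #include <arm_neon.h>

    /* Hypothetical example: widening-multiply the low halves of two
       128-bit vectors.  The cleaner RTL from vget_low_s16 gives combine
       a chance to fuse the select and the multiply into one SMULL.  */
    int32x4_t
    mull_low_halves (int16x8_t a, int16x8_t b)
    {
      return vmull_s16 (vget_low_s16 (a), vget_low_s16 (b));
    }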
gcc/ChangeLog:
* config/aarch64/aarch64-simd-builtins.def (get_low): Define builtin.
* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Define.
* config/aarch64/arm_neon.h (__GET_LOW): Delete.
(vget_low_f16): Reimplement using new builtin.
(vget_low_f32): Likewise.
(vget_low_f64): Likewise.
(vget_low_p8): Likewise.
(vget_low_p16): Likewise.
(vget_low_p64): Likewise.
(vget_low_s8): Likewise.
(vget_low_s16): Likewise.
(vget_low_s32): Likewise.
(vget_low_s64): Likewise.
(vget_low_u8): Likewise.
(vget_low_u16): Likewise.
(vget_low_u32): Likewise.
(vget_low_u64): Likewise.
Diffstat (limited to 'gcc')
-rw-r--r--   gcc/config/aarch64/aarch64-simd-builtins.def |  3
-rw-r--r--   gcc/config/aarch64/aarch64-simd.md            | 11
-rw-r--r--   gcc/config/aarch64/arm_neon.h                 | 35
3 files changed, 28 insertions, 21 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 29a7bbc..66420cf 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -55,6 +55,9 @@
   BUILTIN_VS (UNOP, ctz, 2, NONE)
   BUILTIN_VB (UNOP, popcount, 2, NONE)
 
+  /* Implemented by aarch64_get_low<mode>.  */
+  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
+
   /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
   BUILTIN_VSDQ_I (BINOP, sqshl, 0, NONE)
   BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 60eeddc..e730ff5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -297,6 +297,17 @@
   "TARGET_SIMD"
 )
 
+(define_expand "aarch64_get_low<mode>"
+  [(match_operand:<VHALF> 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
+  "TARGET_SIMD"
+  {
+    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
+    DONE;
+  }
+)
+
 (define_insn_and_split "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
         (vec_select:<VHALF>
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2d776ef..67c7f24 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6302,111 +6302,104 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
-#define __GET_LOW(__TYPE) \
-  uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);  \
-  uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
-  return vreinterpret_##__TYPE##_u64 (lo);
-
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f16 (float16x8_t __a)
 {
-  __GET_LOW (f16);
+  return __builtin_aarch64_get_lowv8hf (__a);
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f32 (float32x4_t __a)
 {
-  __GET_LOW (f32);
+  return __builtin_aarch64_get_lowv4sf (__a);
 }
 
 __extension__ extern __inline float64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f64 (float64x2_t __a)
 {
-  return (float64x1_t) {vgetq_lane_f64 (__a, 0)};
+  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
 }
 
 __extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p8 (poly8x16_t __a)
 {
-  __GET_LOW (p8);
+  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
 }
 
 __extension__ extern __inline poly16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p16 (poly16x8_t __a)
 {
-  __GET_LOW (p16);
+  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
 }
 
 __extension__ extern __inline poly64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p64 (poly64x2_t __a)
 {
-  __GET_LOW (p64);
+  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
 }
 
 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s8 (int8x16_t __a)
 {
-  __GET_LOW (s8);
+  return __builtin_aarch64_get_lowv16qi (__a);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s16 (int16x8_t __a)
 {
-  __GET_LOW (s16);
+  return __builtin_aarch64_get_lowv8hi (__a);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s32 (int32x4_t __a)
 {
-  __GET_LOW (s32);
+  return __builtin_aarch64_get_lowv4si (__a);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s64 (int64x2_t __a)
 {
-  __GET_LOW (s64);
+  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u8 (uint8x16_t __a)
 {
-  __GET_LOW (u8);
+  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u16 (uint16x8_t __a)
 {
-  __GET_LOW (u16);
+  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u32 (uint32x4_t __a)
 {
-  __GET_LOW (u32);
+  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u64 (uint64x2_t __a)
 {
-  return vcreate_u64 (vgetq_lane_u64 (__a, 0));
+  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
 }
 
-#undef __GET_LOW
-
 #define __GET_HIGH(__TYPE) \
   uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);  \
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \
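For reference, a minimal usage sketch of the reimplemented intrinsics (an assumed example, not part of the patch): the low 64 bits of a Q register are the D register of the same number, so once the vec_select is folded, taking the low half and operating on it should not need any extra moves:

    #include <arm_neon.h>

    /* Assumed example: add the low halves of two 128-bit vectors.
       Expected to lower to a single 64-bit ADD on the low halves,
       without any shuffling or register moves.  */
    uint32x2_t
    add_low_halves (uint32x4_t a, uint32x4_t b)
    {
      return vadd_u32 (vget_low_u32 (a), vget_low_u32 (b));
    }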