author		Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2021-01-29 10:57:44 +0000
committer	Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2021-01-29 13:49:19 +0000
commit		d5e0d1f1d2c430515eb727c4464d1f51b20d4c9c (patch)
tree		5249a7c69ee0a545712d610909baa96ec783e873
parent		cb995de62aa6484dba4f9807ee3c8d2959a40c46 (diff)
aarch64: Reimplement vabal* intrinsics using builtins
This patch reimplements the vabal intrinsics with builtins.  The RTL
pattern is cleaned up to emit the right .8b suffixes for the inputs
(though .16b is also accepted) and to iterate over the right modes.
The pattern's only other use is through the sadv16qi expander, which is
adjusted.  I've verified that the codegen for sadv16qi is not worse off.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def (sabal): Define builtin.
	(uabal): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_<sur>abal<mode>_4): Rename to...
	(aarch64_<sur>abal<mode>): ... This.
	(<sur>sadv16qi): Adjust use of the above.
	* config/aarch64/arm_neon.h (vabal_s8): Reimplement using builtin.
	(vabal_s16): Likewise.
	(vabal_s32): Likewise.
	(vabal_u8): Likewise.
	(vabal_u16): Likewise.
	(vabal_u32): Likewise.
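For reference, a minimal usage sketch (not part of the patch; function
names are illustrative): built with -O2 for an aarch64 target, each call
below should now expand through the new builtins to a single SABAL/UABAL
accumulating into the wide vector (modulo register allocation).

#include <arm_neon.h>

int16x8_t
acc_abs_diff_s8 (int16x8_t acc, int8x8_t a, int8x8_t b)
{
  /* Expected to emit: sabal v0.8h, v1.8b, v2.8b  */
  return vabal_s8 (acc, a, b);
}

uint32x4_t
acc_abs_diff_u16 (uint32x4_t acc, uint16x4_t a, uint16x4_t b)
{
  /* Expected to emit: uabal v0.4s, v1.4h, v2.4h  */
  return vabal_u16 (acc, a, b);
}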
-rw-r--r--	gcc/config/aarch64/aarch64-simd-builtins.def	 4
-rw-r--r--	gcc/config/aarch64/aarch64-simd.md		20
-rw-r--r--	gcc/config/aarch64/arm_neon.h			42
3 files changed, 21 insertions, 45 deletions
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 77ba043..4893607 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -164,6 +164,10 @@
BUILTIN_VDQV_S (BINOP, sadalp, 0, NONE)
BUILTIN_VDQV_S (BINOPU, uadalp, 0, NONE)
+ /* Implemented by aarch64_<sur>abal<mode>. */
+ BUILTIN_VD_BHSI (TERNOP, sabal, 0, NONE)
+ BUILTIN_VD_BHSI (TERNOPU, uabal, 0, NONE)
+
/* Implemented by aarch64_<sur><addsub>hn<mode>. */
BUILTIN_VQN (BINOP, addhn, 0, NONE)
BUILTIN_VQN (BINOP, subhn, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 91077f0..9390eb2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -828,14 +828,14 @@
[(set_attr "type" "neon_abd<q>")]
)
-(define_insn "aarch64_<sur>abal<mode>_4"
- [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
- (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
- (match_operand:VDQV_S 2 "register_operand" "w")
- (match_operand:<VDBLW> 3 "register_operand" "0")]
+(define_insn "aarch64_<sur>abal<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+ (unspec:<VWIDE> [(match_operand:VD_BHSI 2 "register_operand" "w")
+ (match_operand:VD_BHSI 3 "register_operand" "w")
+ (match_operand:<VWIDE> 1 "register_operand" "0")]
ABAL))]
"TARGET_SIMD"
- "<sur>abal\t%0.<Vwtype>, %1.<Vhalftype>, %2.<Vhalftype>"
+ "<sur>abal\t%0.<Vwtype>, %2.<Vtype>, %3.<Vtype>"
[(set_attr "type" "neon_arith_acc<q>")]
)
@@ -855,7 +855,7 @@
;; operand 3 before copying that into the result operand 0.
;; Perform that with a sequence of:
;; UABDL2 tmp.8h, op1.16b, op2.16b
-;; UABAL tmp.8h, op1.16b, op2.16b
+;; UABAL tmp.8h, op1.8b, op2.8b
;; UADALP op3.4s, tmp.8h
;; MOV op0, op3 // should be eliminated in later passes.
;;
@@ -888,8 +888,10 @@
rtx reduc = gen_reg_rtx (V8HImode);
emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
operands[2]));
- emit_insn (gen_aarch64_<sur>abalv16qi_4 (reduc, operands[1],
- operands[2], reduc));
+ emit_insn (gen_aarch64_<sur>abalv8qi (reduc, reduc,
+ gen_lowpart (V8QImode, operands[1]),
+ gen_lowpart (V8QImode,
+ operands[2])));
emit_insn (gen_aarch64_<sur>adalpv8hi (operands[3], operands[3], reduc));
emit_move_insn (operands[0], operands[3]);
DONE;
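For context, a sketch of the kind of loop the vectorizer lowers through
the adjusted <sur>sadv16qi expander (illustrative, not part of the patch).
With -O3 for aarch64 its vector body is expected to use the sequence from
the comment above, with the uabal now taking .8b inputs:
uabdl2, uabal, uadalp.

int
sum_abs_diff (const unsigned char *a, const unsigned char *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += __builtin_abs (a[i] - b[i]);
  return sum;
}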
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 198a593..8d5e0f4 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6735,72 +6735,42 @@ __extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int16x8_t __result;
- __asm__ ("sabal %0.8h,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabalv8qi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int32x4_t __result;
- __asm__ ("sabal %0.4s,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabalv4hi (__a, __b, __c);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int64x2_t __result;
- __asm__ ("sabal %0.2d,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabalv2si (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("uabal %0.8h,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabalv8qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("uabal %0.4s,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabalv4hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint64x2_t __result;
- __asm__ ("uabal %0.2d,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabalv2si_uuuu (__a, __b, __c);
}
__extension__ extern __inline int8x16_t