author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2023-05-04 15:19:52 +0100
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2023-05-04 15:19:52 +0100
commit     fe3e4557471dfc7e617884186f41e18d6b4023c1
tree       bad71dde7a9dad7cc4f97e1e8b8b9ed656cf6b41
parent     0a26a42b237bada32165e61867a2bf4461c5fab2
[1/2] aarch64: Reimplement (R){ADD,SUB}HN intrinsics with RTL codes
We can implement the halving-narrowing add/sub patterns with standard RTL codes rather than relying on unspecs.
This patch handles the low-part ones; the second patch handles the high-part ones and removes the unspecs themselves.
The operation ADDHN on V4SI, for example, is represented as (truncate:V4HI ((src1:V4SI + src2:V4SI) >> 16))
and RADDHN as (truncate:V4HI ((src1:V4SI + src2:V4SI + (1 << 15)) >> 16)).
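To make the shift amount and rounding bias concrete, here is a minimal scalar sketch (illustrative only, not part of the patch; the function names are made up) of what each lane computes in the V4SI-to-V4HI case described above:

#include <stdint.h>

/* ADDHN: add the wide elements, keep the high half of each sum.  */
static int16_t
addhn_lane (int32_t a, int32_t b)
{
  return (int16_t) (((uint32_t) a + (uint32_t) b) >> 16);
}

/* RADDHN: as above, but add a rounding bias of 1 << 15 before shifting.  */
static int16_t
raddhn_lane (int32_t a, int32_t b)
{
  return (int16_t) (((uint32_t) a + (uint32_t) b + (1u << 15)) >> 16);
}

The unsigned arithmetic mirrors the modulo behaviour of the vector instructions; only the top half of each wide result survives the truncate.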
Taking this opportunity, I've changed the patterns to return the narrow mode directly and annotated them with the
<vczle><vczbe> define_subst rules so that we get the vec_concat-zero meta-patterns too. This also allows us to
simplify the expanders somewhat. Tests are added to check that the combinations work.
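For reference, the vec_concat-with-zero shape that the <vczle><vczbe> meta-patterns catch is the one the new test exercises; written out by hand (function name illustrative), one instantiation looks like:

#include <arm_neon.h>

/* The 64-bit ADDHN result lands in the low half of a 128-bit vector whose
   top half is explicitly zeroed; with the meta-patterns this compiles to a
   single ADDHN with no separate zeroing MOV/FMOV.  */
int16x8_t
addhn_zero_high (int32x4_t a, int32x4_t b)
{
  return vcombine_s16 (vaddhn_s32 (a, b), vcreate_s16 (0));
}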
Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on aarch64_be-none-elf.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>_insn_le):
Delete.
(aarch64_<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>_insn_be): Delete.
(aarch64_r<optab>hn<mode>_insn<vczle><vczbe>): New define_insn.
(aarch64_<sur><addsub>hn<mode>): Delete.
(aarch64_<optab>hn<mode>): New define_expand.
(aarch64_r<optab>hn<mode>): Likewise.
* config/aarch64/predicates.md (aarch64_simd_raddsubhn_imm_vec):
New predicate.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/pr99195_4.c: New test.
 gcc/config/aarch64/aarch64-simd.md                | 82
 gcc/config/aarch64/predicates.md                  |  6
 gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c | 35
 3 files changed, 88 insertions(+), 35 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 705c4b0..421173e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4955,49 +4955,61 @@
 ;; <r><addsub>hn<q>.
 
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
-  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
-	(vec_concat:<VNARROWQ2>
-	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			      (match_operand:VQN 2 "register_operand" "w")]
-			     ADDSUBHN)
-	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_<optab>hn<mode>_insn<vczle><vczbe>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (ashiftrt:VQN
+	    (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+			(match_operand:VQN 2 "register_operand" "w"))
+	    (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<optab>_halve_narrow_q")]
 )
 
-(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
-  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
-	(vec_concat:<VNARROWQ2>
-	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
-	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			      (match_operand:VQN 2 "register_operand" "w")]
-			     ADDSUBHN)))]
-  "TARGET_SIMD && BYTES_BIG_ENDIAN"
-  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+(define_insn "aarch64_r<optab>hn<mode>_insn<vczle><vczbe>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(truncate:<VNARROWQ>
+	  (ashiftrt:VQN
+	    (plus:VQN
+	      (ADDSUB:VQN (match_operand:VQN 1 "register_operand" "w")
+			  (match_operand:VQN 2 "register_operand" "w"))
+	      (match_operand:VQN 3 "aarch64_simd_raddsubhn_imm_vec"))
+	    (match_operand:VQN 4 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "r<optab>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<optab>_halve_narrow_q")]
 )
 
-(define_expand "aarch64_<sur><addsub>hn<mode>"
+(define_expand "aarch64_<optab>hn<mode>"
   [(set (match_operand:<VNARROWQ> 0 "register_operand")
-	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
-			    (match_operand:VQN 2 "register_operand")]
-			   ADDSUBHN))]
+	(ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+		    (match_operand:VQN 2 "register_operand")))]
   "TARGET_SIMD"
   {
-    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
-    if (BYTES_BIG_ENDIAN)
-      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
-				operands[2], CONST0_RTX (<VNARROWQ>mode)));
-    else
-      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
-				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+    rtx shft
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+				GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+    emit_insn (gen_aarch64_<optab>hn<mode>_insn (operands[0], operands[1],
+						 operands[2], shft));
+    DONE;
+  }
+)
 
-    /* The intrinsic expects a narrow result, so emit a subreg that will get
-       optimized away as appropriate.  */
-    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
-						 <VNARROWQ2>mode));
+(define_expand "aarch64_r<optab>hn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+	(ADDSUB:VQN (match_operand:VQN 1 "register_operand")
+		    (match_operand:VQN 2 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    rtx shft
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+				GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2);
+    rtx rnd
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+		HOST_WIDE_INT_1U << (GET_MODE_UNIT_BITSIZE (<MODE>mode) / 2 - 1));
+    emit_insn (gen_aarch64_r<optab>hn<mode>_insn (operands[0], operands[1],
+						  operands[2], rnd, shft));
     DONE;
   }
 )
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 242f10a..73f7ade 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -593,6 +593,12 @@
 		 HOST_WIDE_INT_1U << (GET_MODE_UNIT_BITSIZE (mode) - 1))")))
 
+(define_predicate "aarch64_simd_raddsubhn_imm_vec"
+  (and (match_code "const_vector")
+       (match_test "aarch64_const_vec_all_same_in_range_p (op, 1,
+				HOST_WIDE_INT_1U
+				<< (GET_MODE_UNIT_BITSIZE (mode) / 2 - 1))")))
+
 (define_predicate "aarch64_simd_shift_imm_bitsize_qi"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 8)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
new file mode 100644
index 0000000..b6ef15b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_4.c
@@ -0,0 +1,35 @@
+/* PR target/99195.  */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+   the top half of the vector register and no explicit zeroing instructions
+   are emitted.  */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define MYOP(OT,IT,IMT,OP,IS,OS) \
+OT \
+foo_##OP##_##OS (IT a, IT b) \
+{ \
+  IMT zeros = vcreate_##OS (0); \
+  return vcombine_##OS (v##OP##_##IS (a, b), zeros); \
+}
+
+
+#define FUNC(OT,IT,IMT,IS,OS) \
+MYOP (OT, IT, IMT, addhn, IS, OS) \
+MYOP (OT, IT, IMT, subhn, IS, OS) \
+MYOP (OT, IT, IMT, raddhn, IS, OS) \
+MYOP (OT, IT, IMT, rsubhn, IS, OS)
+
+FUNC (int8x16_t, int16x8_t, int8x8_t, s16, s8)
+FUNC (int16x8_t, int32x4_t, int16x4_t, s32, s16)
+FUNC (int32x4_t, int64x2_t, int32x2_t, s64, s32)
+
+FUNC (uint8x16_t, uint16x8_t, uint8x8_t, u16, u8)
+FUNC (uint16x8_t, uint32x4_t, uint16x4_t, u32, u16)
+FUNC (uint32x4_t, uint64x2_t, uint32x2_t, u64, u32)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */