aarch64: Model zero-high-half semantics of ADDHN/SUBHN instructions

Model the zero-high-half semantics of the narrowing arithmetic Neon
instructions in the aarch64_<sur><addsub>hn<mode> RTL pattern.
Modeling these semantics allows for better RTL combinations while also
removing some register allocation issues as the compiler now knows that
the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of this
change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>):
	Change to an expander that emits the correct instruction
	depending on endianness.
	(aarch64_<sur><addsub>hn<mode>_insn_le): Define.
	(aarch64_<sur><addsub>hn<mode>_insn_be): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
author: Jonathan Wright <jonathan.wright@arm.com> 2021-06-14 16:18:44 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2021-06-16 14:22:42 +0100
commit: dbfc149b639342a9555c60aa9ee787fb3d009316 (patch)
tree: 2fddb90915757489cf18830e987f7a4a71352a8b
parent: d0889b5d37ff40149b44e3c7d82f693d430cd891 (diff)
download: gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.zip
gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.tar.gz
gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.tar.bz2
2 files changed, 83 insertions, 6 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2b75e57..540244c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4661,16 +4661,53 @@
 
 ;; <r><addsub>hn<q>.
 
-(define_insn "aarch64_<sur><addsub>hn<mode>"
-  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
-        (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			    (match_operand:VQN 2 "register_operand" "w")]
-                           ADDSUBHN))]
-  "TARGET_SIMD"
+(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
+			      (match_operand:VQN 2 "register_operand" "w")]
+			     ADDSUBHN)
+	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+)
+
+(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
+			      (match_operand:VQN 2 "register_operand" "w")]
+			     ADDSUBHN)))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
 )
 
+(define_expand "aarch64_<sur><addsub>hn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
+			    (match_operand:VQN 2 "register_operand")]
+			   ADDSUBHN))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
+				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+    else
+      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
+				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+
+    /* The intrinsic expects a narrow result, so emit a subreg that will get
+       optimized away as appropriate.  */
+    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+						 <VNARROWQ2>mode));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_<sur><addsub>hn2<mode>_insn_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
 	(vec_concat:<VNARROWQ2>
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
index aa6c7ef..dd5ddf8 100644
--- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
+++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
@@ -74,6 +74,42 @@ TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
 TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
 TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
 
+#define TEST_ARITH(name, rettype, intype, fs, rs) \
+  rettype test_ ## name ## _ ## fs ## _zero_high \
+		(intype a, intype b) \
+	{ \
+		return vcombine_ ## rs (name ## _ ## fs (a, b), \
+					vdup_n_ ## rs (0)); \
+	}
+
+TEST_ARITH (vaddhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vaddhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vaddhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vaddhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vaddhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vaddhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vraddhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vraddhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vraddhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vraddhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vraddhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vraddhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vsubhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vsubhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vsubhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vsubhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vsubhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vsubhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vrsubhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vrsubhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vrsubhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vrsubhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vrsubhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vrsubhn, uint32x4_t, uint64x2_t, u64, u32)
+
 /* { dg-final { scan-assembler-not "dup\\t" } } */
 
 /* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} }  */
@@ -88,3 +124,7 @@ TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
 /* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} }  */
 /* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} }  */
 /* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} }  */
+/* { dg-final { scan-assembler-times "\\taddhn\\tv" 6} }  */
+/* { dg-final { scan-assembler-times "\\tsubhn\\tv" 6} }  */
+/* { dg-final { scan-assembler-times "\\trsubhn\\tv" 6} }  */
+/* { dg-final { scan-assembler-times "\\traddhn\\tv" 6} }  */
author	Jonathan Wright <jonathan.wright@arm.com>	2021-06-14 16:18:44 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2021-06-16 14:22:42 +0100
commit	dbfc149b639342a9555c60aa9ee787fb3d009316 (patch)
tree	2fddb90915757489cf18830e987f7a4a71352a8b
parent	d0889b5d37ff40149b44e3c7d82f693d430cd891 (diff)
download	gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.zip gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.tar.gz gcc-dbfc149b639342a9555c60aa9ee787fb3d009316.tar.bz2