author	Jonathan Wright <jonathan.wright@arm.com>	2021-06-14 15:09:18 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2021-06-16 14:22:22 +0100
commit	d0889b5d37ff40149b44e3c7d82f693d430cd891 (patch)
tree	a9bc2a939eef3ebf8df796c46860f74485b8d734
parent	c86a3039683a8d2bb1006c1a0277678de3786ceb (diff)
aarch64: Model zero-high-half semantics of [SU]QXTN instructions
Split the aarch64_<su>qmovn<mode> pattern into separate scalar and vector
variants. Further split the vector RTL pattern into big/little-endian
variants that model the zero-high-half semantics of the underlying
instruction. Modeling these semantics allows for better RTL combinations
while also removing some register allocation issues as the compiler now
knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of this
change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Split generator
	for aarch64_<su>qmovn builtins into scalar and vector variants.
	* config/aarch64/aarch64-simd.md (aarch64_<su>qmovn<mode>_insn_le):
	Define.
	(aarch64_<su>qmovn<mode>_insn_be): Define.
	(aarch64_<su>qmovn<mode>): Split into scalar and vector variants.
	Change vector variant to an expander that emits the correct
	instruction depending on endianness.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
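The zero-high-half semantics being modeled come from the fact that SQXTN/UQXTN
writing a D-register destination also clear the upper 64 bits of the
corresponding Q register. In practice this means an intrinsics sequence that
narrows and then combines with a zero vector should now fold to the single
narrowing instruction; a minimal sketch with arm_neon.h intrinsics (function
name is illustrative only, not part of the patch):

#include <arm_neon.h>

/* Narrow 16-bit lanes to 8-bit lanes and zero the high half of the result.
   With the new zero-high-half patterns this should compile to a single
   "sqxtn v0.8b, v0.8h", with no separate "dup"/"mov" to clear the top half.  */
int8x16_t
narrow_and_zero_high (int16x8_t a)
{
  return vcombine_s8 (vqmovn_s16 (a), vdup_n_s8 (0));
}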
-rw-r--r--	gcc/config/aarch64/aarch64-simd-builtins.def	6
-rw-r--r--	gcc/config/aarch64/aarch64-simd.md	48
-rw-r--r--	gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c	9
3 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2adb4b1..ac5d4fc 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -271,8 +271,10 @@
BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
/* Implemented by aarch64_<su>qmovn<mode>. */
- BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE)
- BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, NONE)
+ BUILTIN_VQN (UNOP, sqmovn, 0, NONE)
+ BUILTIN_SD_HSDI (UNOP, sqmovn, 0, NONE)
+ BUILTIN_VQN (UNOP, uqmovn, 0, NONE)
+ BUILTIN_SD_HSDI (UNOP, uqmovn, 0, NONE)
/* Implemented by aarch64_<su>qxtn2<mode>. */
BUILTIN_VQN (BINOP, sqxtn2, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 59779b8..2b75e57 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4875,10 +4875,54 @@
(define_insn "aarch64_<su>qmovn<mode>"
[(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
(SAT_TRUNC:<VNARROWQ>
- (match_operand:VSQN_HSDI 1 "register_operand" "w")))]
+ (match_operand:SD_HSDI 1 "register_operand" "w")))]
"TARGET_SIMD"
"<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
- [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+ [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_<su>qmovn<mode>_insn_le"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+ (vec_concat:<VNARROWQ2>
+ (SAT_TRUNC:<VNARROWQ>
+ (match_operand:VQN 1 "register_operand" "w"))
+ (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+ "<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+ [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_<su>qmovn<mode>_insn_be"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+ (vec_concat:<VNARROWQ2>
+ (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
+ (SAT_TRUNC:<VNARROWQ>
+ (match_operand:VQN 1 "register_operand" "w"))))]
+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
+ "<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+ [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_expand "aarch64_<su>qmovn<mode>"
+ [(set (match_operand:<VNARROWQ> 0 "register_operand")
+ (SAT_TRUNC:<VNARROWQ>
+ (match_operand:VQN 1 "register_operand")))]
+ "TARGET_SIMD"
+ {
+ rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_<su>qmovn<mode>_insn_be (tmp, operands[1],
+ CONST0_RTX (<VNARROWQ>mode)));
+ else
+ emit_insn (gen_aarch64_<su>qmovn<mode>_insn_le (tmp, operands[1],
+ CONST0_RTX (<VNARROWQ>mode)));
+
+ /* The intrinsic expects a narrow result, so emit a subreg that will get
+ optimized away as appropriate. */
+ emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+ <VNARROWQ2>mode));
+ DONE;
+ }
)
(define_insn "aarch64_<su>qxtn2<mode>_le"
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
index 53e03d3..aa6c7ef 100644
--- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
+++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
@@ -67,6 +67,13 @@ TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
+TEST_UNARY (vqmovn, int8x16_t, int16x8_t, s16, s8)
+TEST_UNARY (vqmovn, int16x8_t, int32x4_t, s32, s16)
+TEST_UNARY (vqmovn, int32x4_t, int64x2_t, s64, s32)
+TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
+
/* { dg-final { scan-assembler-not "dup\\t" } } */
/* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} } */
@@ -79,3 +86,5 @@ TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
/* { dg-final { scan-assembler-times "\\tsqrshrun\\tv" 3} } */
/* { dg-final { scan-assembler-times "\\txtn\\tv" 6} } */
/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} } */
+/* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} } */
+/* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} } */
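For context, the TEST_UNARY macro instantiated above is defined earlier in
narrow_zero_high_half.c (outside this hunk). It presumably wraps the narrowing
intrinsic in a vcombine with a zeroed high half, roughly along these lines
(a sketch; the exact definition in the test file may differ):

/* Sketch of the pre-existing TEST_UNARY helper (arm_neon.h is already
   included by the test).  Each instantiation narrows, then combines the
   result with a zero high half; the dg-final checks above expect this to
   fold into a single narrowing instruction and no "dup".  */
#define TEST_UNARY(name, rettype, intype, fs, rs) \
  rettype test_##name##_##fs##_zero_high (intype a) \
  { \
    return vcombine_##rs (name##_##fs (a), vdup_n_##rs (0)); \
  }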