author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-05-25 15:00:16 +0100
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2023-05-25 15:00:16 +0100
commit     560bb845321f5ad039a318a081b0e88d9900f5cb
tree       2b0c1bca337c4357476d593772d1f848918d6e4c
parent     f3dbc4112da318d1685a0833c7b3180589bbba2e
aarch64: PR target/99195 Annotate complex FP patterns for vec-concat-zero
This patch annotates the complex add and mla patterns for vec-concat-zero.
Testing exposed an interesting bug in our MD patterns, which were defined to match:
(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
(unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
(match_operand:VHSDF 3 "register_operand" "w")
(match_operand:SI 4 "const_int_operand" "n")]
FCMLA))
but the canonicalisation rules for PLUS require the more "complex" operand to come
first, so during combine, when the newly substituted patterns were being formed,
combine/recog would try to match:
(plus:V2SF (unspec:V2SF [
(reg:V2SF 100)
(reg:V2SF 101)
(const_int 0 [0])
] UNSPEC_FCMLA270)
(reg:V2SF 99))
instead. This patch fixes the order of the PLUS operands in these patterns.
Similar patterns for the dot-product instructions already used the right order.
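For illustration, here is a minimal example in the style of the new testcase
(a sketch: the function name is made up, the intrinsics are the standard
arm_neon.h ones the test exercises):

#include <arm_neon.h>

float32x4_t
foo_cmla (float32x2_t a, float32x2_t b, float32x2_t c)
{
  /* The 64-bit FCMLA writes only the low 64 bits of the destination and
     implicitly zeroes the upper half, so with the annotated patterns the
     vcombine with zeros should fold away rather than emit an extra move.  */
  float32x2_t zeros = vcreate_f32 (0);
  return vcombine_f32 (vcmla_f32 (a, b, c), zeros);
}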
Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
gcc/ChangeLog:
PR target/99195
* config/aarch64/aarch64-simd.md (aarch64_fcadd<rot><mode>): Rename to...
(aarch64_fcadd<rot><mode><vczle><vczbe>): ... This.
Fix canonicalization of PLUS operands.
(aarch64_fcmla<rot><mode>): Rename to...
(aarch64_fcmla<rot><mode><vczle><vczbe>): ... This.
Fix canonicalization of PLUS operands.
(aarch64_fcmla_lane<rot><mode>): Rename to...
(aarch64_fcmla_lane<rot><mode><vczle><vczbe>): ... This.
Fix canonicalization of PLUS operands.
(aarch64_fcmla_laneq<rot>v4hf): Rename to...
(aarch64_fcmla_laneq<rot>v4hf<vczle><vczbe>): ... This.
Fix canonicalization of PLUS operands.
(aarch64_fcmlaq_lane<rot><mode>): Fix canonicalization of PLUS operands.
gcc/testsuite/ChangeLog:
PR target/99195
* gcc.target/aarch64/simd/pr99195_9.c: New test.
 gcc/config/aarch64/aarch64-simd.md                | 32
 gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c | 64
 2 files changed, 80 insertions(+), 16 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0df9731..da9c59e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -554,7 +554,7 @@
 ;; to describe the permute that is also required, but even if that is done
 ;; the permute would have been created as a LOAD_LANES which means the values
 ;; in the registers are in the wrong order.
-(define_insn "aarch64_fcadd<rot><mode>"
+(define_insn "aarch64_fcadd<rot><mode><vczle><vczbe>"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
		       (match_operand:VHSDF 2 "register_operand" "w")]
@@ -572,25 +572,25 @@
   "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
 )

-(define_insn "aarch64_fcmla<rot><mode>"
+(define_insn "aarch64_fcmla<rot><mode><vczle><vczbe>"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
-	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
-		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+	(plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
				   (match_operand:VHSDF 3 "register_operand" "w")]
-				   FCMLA)))]
+				   FCMLA)
+		    (match_operand:VHSDF 1 "register_operand" "0")))]
   "TARGET_COMPLEX"
   "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>"
   [(set_attr "type" "neon_fcmla")]
 )

-(define_insn "aarch64_fcmla_lane<rot><mode>"
+(define_insn "aarch64_fcmla_lane<rot><mode><vczle><vczbe>"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
-	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
-		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+	(plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
				   (match_operand:VHSDF 3 "register_operand" "w")
				   (match_operand:SI 4 "const_int_operand" "n")]
-				   FCMLA)))]
+				   FCMLA)
+		    (match_operand:VHSDF 1 "register_operand" "0")))]
   "TARGET_COMPLEX"
 {
   operands[4] = aarch64_endian_lane_rtx (<VHALF>mode, INTVAL (operands[4]));
@@ -599,13 +599,13 @@
   [(set_attr "type" "neon_fcmla")]
 )

-(define_insn "aarch64_fcmla_laneq<rot>v4hf"
+(define_insn "aarch64_fcmla_laneq<rot>v4hf<vczle><vczbe>"
   [(set (match_operand:V4HF 0 "register_operand" "=w")
-	(plus:V4HF (match_operand:V4HF 1 "register_operand" "0")
-		   (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w")
+	(plus:V4HF (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w")
				 (match_operand:V8HF 3 "register_operand" "w")
				 (match_operand:SI 4 "const_int_operand" "n")]
-				 FCMLA)))]
+				 FCMLA)
+		   (match_operand:V4HF 1 "register_operand" "0")))]
   "TARGET_COMPLEX"
 {
   operands[4] = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
@@ -616,11 +616,11 @@

 (define_insn "aarch64_fcmlaq_lane<rot><mode>"
   [(set (match_operand:VQ_HSF 0 "register_operand" "=w")
-	(plus:VQ_HSF (match_operand:VQ_HSF 1 "register_operand" "0")
-		     (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w")
+	(plus:VQ_HSF (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w")
				     (match_operand:<VHALF> 3 "register_operand" "w")
				     (match_operand:SI 4 "const_int_operand" "n")]
-				     FCMLA)))]
+				     FCMLA)
+		     (match_operand:VQ_HSF 1 "register_operand" "0")))]
   "TARGET_COMPLEX"
 {
   int nunits = GET_MODE_NUNITS (<VHALF>mode).to_constant ();
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c
new file mode 100644
index 0000000..bb86735
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c
@@ -0,0 +1,64 @@
+/* PR target/99195.  */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+   the top half of the vector register and no explicit zeroing instructions
+   are emitted.  */
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv8.3-a+fp16" } */
+
+#include <arm_neon.h>
+
+#define BINARY(OT,IT,OP,S)				\
+OT							\
+foo_##OP##_##S (IT a, IT b, IT c)			\
+{							\
+  IT zeros = vcreate_##S (0);				\
+  return vcombine_##S (v##OP##_##S (a, b), zeros);	\
+}
+
+#define FUNC(T,IS,OS,OP,S) BINARY (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+#define OPTWO(T,IS,OS,S,OP1,OP2)	\
+FUNC (T, IS, OS, OP1, S)		\
+FUNC (T, IS, OS, OP2, S)
+
+#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3)	\
+FUNC (T, IS, OS, OP1, S)			\
+OPTWO (T, IS, OS, S, OP2, OP3)
+
+#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4)	\
+FUNC (T, IS, OS, OP1, S)			\
+OPTHREE (T, IS, OS, S, OP2, OP3, OP4)
+
+OPTWO (float16, 4, 8, f16, cadd_rot90, cadd_rot270)
+OPTWO (float32, 2, 4, f32, cadd_rot90, cadd_rot270)
+
+#define TERNARY(OT,IT,OP,S)				\
+OT							\
+foo_##OP##_##S (IT a, IT b, IT c)			\
+{							\
+  IT zeros = vcreate_##S (0);				\
+  return vcombine_##S (v##OP##_##S (a, b, c), zeros);	\
+}
+
+#undef FUNC
+#define FUNC(T,IS,OS,OP,S) TERNARY (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+OPFOUR (float16, 4, 8, f16, cmla, cmla_rot90, cmla_rot180, cmla_rot270)
+OPFOUR (float32, 2, 4, f32, cmla, cmla_rot90, cmla_rot180, cmla_rot270)
+
+#define TERNARY_IDX(OT,IT,OP,S)					\
+OT								\
+foo_##OP##_##S (IT a, IT b, IT c)				\
+{								\
+  IT zeros = vcreate_##S (0);					\
+  return vcombine_##S (v##OP##_##S (a, b, c, 0), zeros);	\
+}
+
+#undef FUNC
+#define FUNC(T,IS,OS,OP,S) TERNARY_IDX (T##x##OS##_t, T##x##IS##_t, OP, S)
+OPFOUR (float16, 4, 8, f16, cmla_lane, cmla_rot90_lane, cmla_rot180_lane, cmla_rot270_lane)
+OPFOUR (float32, 2, 4, f32, cmla_lane, cmla_rot90_lane, cmla_rot180_lane, cmla_rot270_lane)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */
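For reference, with the annotated patterns each function in the test is
expected to compile down to just the complex arithmetic instruction and a
return, e.g. for foo_cmla_f32 (illustrative output derived from the insn
template above, not part of the commit):

foo_cmla_f32:
	fcmla	v0.2s, v1.2s, v2.2s, #0
	ret

with no trailing fmov/mov zeroing the upper half of the destination register,
which is exactly what the two scan-assembler-not directives check.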