[AArch64] Emit TARGET_DOTPROD-specific sequence for <us>sadv16qi

Wilco pointed out that when the Dot Product instructions are available we can use them to generate an even more efficient expansion for the [us]sadv16qi optab.

Instead of the current:
	uabdl2	v0.8h, v1.16b, v2.16b
	uabal	v0.8h, v1.8b, v2.8b
	uadalp	v3.4s, v0.8h

we can generate:
	(1) mov		v4.16b, 1
	(2) uabd	v0.16b, v1.16b, v2.16b
	(3) udot	v3.4s, v0.16b, v4.16b

Instruction (1) can be CSEd across multiple such expansions and even hoisted outside of loops, so when this sequence appears frequently back-to-back (like in x264_r) we essentially only have 2 instructions per sum. Also, the UDOT instruction does the byte-to-word accumulation in one step, which allows us to use the much simpler UABD instruction before it. This makes it a shorter and lower-latency sequence overall for targets that support it.

	* config/aarch64/iterators.md (MAX_OPP): New code attr.
	* config/aarch64/aarch64-simd.md (*aarch64_<su>abd<mode>_3): Rename to...
	(aarch64_<su>abd<mode>_3): ... This.
	(<sur>sadv16qi): Add TARGET_DOTPROD expansion.

	* gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
	* gcc.target/aarch64/usadv16qi.c: Likewise.
	* gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
	* gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.

From-SVN: r271863
author: Kyrylo Tkachov <kyrylo.tkachov@arm.com> 2019-06-03 11:20:58 +0000
committer: Kyrylo Tkachov <ktkachov@gcc.gnu.org> 2019-06-03 11:20:58 +0000
commit: 72215009a9f9827397a4eb74e9341b2b7dc658df (patch)
tree: 85c9597bd0985e8be2de5f8dfbbcce8493abad31 /gcc
parent: c89503d957f13f7f0a5eeeab1326048c455d9533 (diff)
download: gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.zip
gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.tar.gz
gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.tar.bz2
7 files changed, 97 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 372e880..06184ed 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2019-06-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* config/aarch64/iterators.md (MAX_OPP): New code attr.
+	* config/aarch64/aarch64-simd.md (*aarch64_<su>abd<mode>_3): Rename to...
+	(aarch64_<su>abd<mode>_3): ... This.
+	(<sur>sadv16qi): Add TARGET_DOTPROD expansion.
+
 2019-06-03  Richard Biener  <rguenther@suse.de>
 
 	* tree-ssa-sccvn.c (ao_ref_init_from_vn_reference): Get original
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d4c48d2..b648e9e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -710,7 +710,7 @@
 ;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
 ;; Whereas SABD would return 192 (-64 signed) on the above example.
 ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
-(define_insn "*aarch64_<su>abd<mode>_3"
+(define_insn "aarch64_<su>abd<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
 	(minus:VDQ_BHSI
 	  (USMAX:VDQ_BHSI
@@ -764,7 +764,16 @@
 ;; UABAL	tmp.8h, op1.16b, op2.16b
 ;; UADALP	op3.4s, tmp.8h
 ;; MOV		op0, op3 // should be eliminated in later passes.
-;; The signed version just uses the signed variants of the above instructions.
+;;
+;; For TARGET_DOTPROD we do:
+;; MOV	tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
+;; UABD	tmp2.16b, op1.16b, op2.16b
+;; UDOT	op3.4s, tmp2.16b, tmp1.16b
+;; MOV	op0, op3 // RA will tie the operands of UDOT appropriately.
+;;
+;; The signed version uses the signed variants of the above instructions,
+;; except that for TARGET_DOTPROD it still emits UDOT, because the absolute
+;; difference it accumulates is an unsigned quantity.
 
 (define_expand "<sur>sadv16qi"
   [(use (match_operand:V4SI 0 "register_operand"))
@@ -773,6 +782,15 @@
    (use (match_operand:V4SI 3 "register_operand"))]
   "TARGET_SIMD"
   {
+    if (TARGET_DOTPROD)
+      {
+	rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
+	rtx abd = gen_reg_rtx (V16QImode);
+	emit_insn (gen_aarch64_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
+	emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
+					  abd, ones));
+	DONE;
+      }
     rtx reduc = gen_reg_rtx (V8HImode);
     emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
 					       operands[2]));
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 999bdc2..112cf11 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2019-06-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
+	* gcc.target/aarch64/usadv16qi.c: Likewise.
+	* gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
+	* gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.
+
 2019-06-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
 
 	* lib/target-supports.exp (add_options_for_aarch64_sve): New procedure.
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
new file mode 100644
index 0000000..08b6831
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tsshll\t} } } */
+/* { dg-final { scan-assembler-not {\tsshll2\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tsabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
index 40b2884..85a867a 100644
--- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3" } */
 
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
new file mode 100644
index 0000000..ea8de4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tushll\t} } } */
+/* { dg-final { scan-assembler-not {\tushll2\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tuabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
index 69ceaf4..a66e120 100644
--- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3" } */
 
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
 
 #define N 1024
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>	2019-06-03 11:20:58 +0000
committer	Kyrylo Tkachov <ktkachov@gcc.gnu.org>	2019-06-03 11:20:58 +0000
commit	72215009a9f9827397a4eb74e9341b2b7dc658df (patch)
tree	85c9597bd0985e8be2de5f8dfbbcce8493abad31 /gcc
parent	c89503d957f13f7f0a5eeeab1326048c455d9533 (diff)
download	gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.zip gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.tar.gz gcc-72215009a9f9827397a4eb74e9341b2b7dc658df.tar.bz2