author     James Greenhalgh <james.greenhalgh@arm.com>   2017-12-21 16:39:43 +0000
committer  James Greenhalgh <jgreenhalgh@gcc.gnu.org>    2017-12-21 16:39:43 +0000
commit     6b6d8f38f7b3bd8a2f4e4dbeb3014ba1339afa89 (patch)
tree       a26bfcca0fadf5099804a4c2887409b687957d8e /gcc
parent     c587c0a9c8cbe80a64461fe9cab0a23d3ff35211 (diff)
[patch AArch64] Do not perform a vector splat for vector initialisation if it is not useful
Our current vector initialisation code will first duplicate the first
element to both lanes, then overwrite the top lane with a new value.
This duplication can be clunky and wasteful.

Better would be to use the fact that we will always be overwriting the
remaining bits, and simply move the first element to the correct place
(implicitly zeroing all other bits).
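
As a concrete illustration (our own example, not part of the commit; the
instruction sequences in the comment restate the before/after behaviour
described above, they are not verified compiler output), consider
initialising a two-lane vector from two scalar arguments:

#include <arm_neon.h>

/* Before this change: splat 'a' to both lanes (dup v0.2d, v0.d[0]),
   then overwrite the top lane with 'b' (ins v0.d[1], v1.d[0]); the
   dup does no useful work.  After: 'a' is moved straight into lane 0,
   implicitly zeroing the other bits, and only the ins remains.  */
float64x2_t
make_pair (double a, double b)
{
  return (float64x2_t) { a, b };
}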
We also need a new pattern in simplify-rtx.c:simplify_ternary_operation,
to ensure we can still simplify:
(vec_merge:OUTER
(vec_duplicate:OUTER x:INNER)
(subreg:OUTER y:INNER 0)
(const_int N))
To:
(vec_concat:OUTER x:INNER y:INNER) or (vec_concat y x)
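
For instance, with OUTER = V2DI and INNER = DI (a hypothetical worked
instance of the rule above, in the same notation), N == 2 takes the high
lane from the duplicate and the low lane from the subreg:

(vec_merge:V2DI
    (vec_duplicate:V2DI x:DI)
    (subreg:V2DI y:DI 0)
    (const_int 2))

simplifies to:

(vec_concat:V2DI y:DI x:DI)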
---
gcc/
* config/aarch64/aarch64.c (aarch64_expand_vector_init): Modify code
generation for cases where splatting a value is not useful.
* simplify-rtx.c (simplify_ternary_operation): Simplify vec_merge
across a vec_duplicate and a paradoxical subreg forming a vector
mode to a vec_concat.
gcc/testsuite/
* gcc.target/aarch64/vect-slp-dup.c: New.
From-SVN: r255946
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/ChangeLog                                    |  8
-rw-r--r--  gcc/config/aarch64/aarch64.c                     | 48
-rw-r--r--  gcc/simplify-rtx.c                               | 51
-rw-r--r--  gcc/testsuite/ChangeLog                          |  4
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c  | 20

5 files changed, 128 insertions, 3 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c6dc455..d3c4063 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,13 @@
 2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
 
+	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Modify code
+	generation for cases where splatting a value is not useful.
+	* simplify-rtx.c (simplify_ternary_operation): Simplify vec_merge
+	across a vec_duplicate and a paradoxical subreg forming a vector
+	mode to a vec_concat.
+
+2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
+
 	* combine.c (simplify_set): Do not transform subregs to zero_extends
 	if the destination is not a scalar int mode.
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fc27b40..1da313f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12107,9 +12107,51 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	  maxv = matches[i][1];
 	}
 
-  /* Create a duplicate of the most common element.  */
-  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
-  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+  /* Create a duplicate of the most common element, unless all elements
+     are equally useless to us, in which case just immediately set the
+     vector register using the first element.  */
+
+  if (maxv == 1)
+    {
+      /* For vectors of two 64-bit elements, we can do even better.  */
+      if (n_elts == 2
+	  && (inner_mode == E_DImode
+	      || inner_mode == E_DFmode))
+
+	{
+	  rtx x0 = XVECEXP (vals, 0, 0);
+	  rtx x1 = XVECEXP (vals, 0, 1);
+	  /* Combine can pick up this case, but handling it directly
+	     here leaves clearer RTL.
+
+	     This is load_pair_lanes<mode>, and also gives us a clean-up
+	     for store_pair_lanes<mode>.  */
+	  if (memory_operand (x0, inner_mode)
+	      && memory_operand (x1, inner_mode)
+	      && !STRICT_ALIGNMENT
+	      && rtx_equal_p (XEXP (x1, 0),
+			      plus_constant (Pmode,
+					     XEXP (x0, 0),
+					     GET_MODE_SIZE (inner_mode))))
+	    {
+	      rtx t;
+	      if (inner_mode == DFmode)
+		t = gen_load_pair_lanesdf (target, x0, x1);
+	      else
+		t = gen_load_pair_lanesdi (target, x0, x1);
+	      emit_insn (t);
+	      return;
+	    }
+	}
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
+      maxelement = 0;
+    }
+  else
+    {
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+      aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+    }
 
   /* Insert the rest.  */
   for (int i = 0; i < n_elts; i++)
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 4f9796c..6b163f9 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -5860,6 +5860,57 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
 	  return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
 	}
 
+      /* Replace:
+
+	  (vec_merge:outer (vec_duplicate:outer x:inner)
+			   (subreg:outer y:inner 0)
+			   (const_int N))
+
+	 with (vec_concat:outer x:inner y:inner) if N == 1,
+	 or (vec_concat:outer y:inner x:inner) if N == 2.
+	 We assume that degenerate cases (N == 0 or N == 3), which
+	 represent taking all elements from either input, are handled
+	 elsewhere.
+
+	 Implicitly, this means we have a paradoxical subreg, but such
+	 a check is cheap, so make it anyway.
+
+	 Only applies for vectors of two elements.  */
+
+      if ((GET_CODE (op0) == VEC_DUPLICATE
+	   || GET_CODE (op1) == VEC_DUPLICATE)
+	  && GET_MODE (op0) == GET_MODE (op1)
+	  && GET_MODE_NUNITS (GET_MODE (op0)) == 2
+	  && GET_MODE_NUNITS (GET_MODE (op1)) == 2
+	  && IN_RANGE (sel, 1, 2))
+	{
+	  rtx newop0 = op0, newop1 = op1;
+
+	  /* Canonicalize locally such that the VEC_DUPLICATE is always
+	     the first operand.  */
+	  if (GET_CODE (newop1) == VEC_DUPLICATE)
+	    {
+	      std::swap (newop0, newop1);
+	      /* If we swap the operand order, we also need to swap
+		 the selector mask.  */
+	      sel = sel == 1 ? 2 : 1;
+	    }
+
+	  if (GET_CODE (newop1) == SUBREG
+	      && paradoxical_subreg_p (newop1)
+	      && subreg_lowpart_p (newop1)
+	      && GET_MODE (SUBREG_REG (newop1))
+		 == GET_MODE (XEXP (newop0, 0)))
+	    {
+	      newop0 = XEXP (newop0, 0);
+	      newop1 = SUBREG_REG (newop1);
+	      if (sel == 2)
+		std::swap (newop0, newop1);
+	      return simplify_gen_binary (VEC_CONCAT, mode,
+					  newop0, newop1);
+	    }
+	}
+
       /* Replace (vec_merge (vec_duplicate x) (vec_duplicate y)
 				 (const_int n))
 	 with (vec_concat x y) or (vec_concat y x) depending on value
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index a607911..7171973 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* gcc.target/aarch64/vect-slp-dup.c: New.
+
 2017-12-21  Eric Botcazou  <ebotcazou@adacore.com>
 
 	* c-c++-common/pr82872.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c
new file mode 100644
index 0000000..0541e48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+void bar (double);
+
+void
+foo (double *restrict in, double *restrict in2,
+     double *restrict out1, double *restrict out2)
+{
+  for (int i = 0; i < 1024; i++)
+    {
+      out1[i] = in[i] + 2.0 * in[i+128];
+      out1[i+1] = in[i+1] + 2.0 * in2[i];
+      bar (in[i]);
+    }
+}
+
+/* { dg-final { scan-assembler-not "dup\tv\[0-9\]+.2d, v\[0-9\]+" } } */
+
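
As a final sketch (again our own hypothetical example, not part of the
patch), the load_pair_lanes<mode> case in aarch64_expand_vector_init
above fires when the two elements are adjacent in memory, which is what
the rtx_equal_p/plus_constant check tests for:

#include <arm_neon.h>

/* Illustrative only: p[0] and p[1] are adjacent DFmode memory
   operands, so (assuming !STRICT_ALIGNMENT) the expansion can use the
   load_pair_lanesdf pattern to build the whole vector in one
   instruction, instead of a dup followed by a lane insert.  */
float64x2_t
load_adjacent_pair (const double *p)
{
  return (float64x2_t) { p[0], p[1] };
}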