[Patch AArch64] GCC 6 regression in vector performance. - Fix vector initialization to happen with lane load instructions.

gcc/
	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
	always use lane loads to construct non-constant vectors.

gcc/testsuite/
	* gcc.target/aarch64/vector_initialization_nostack.c: New.

Co-Authored-By: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
From-SVN: r233461
author: James Greenhalgh <james.greenhalgh@arm.com> 2016-02-16 16:02:09 +0000
committer: James Greenhalgh <jgreenhalgh@gcc.gnu.org> 2016-02-16 16:02:09 +0000
commit: 35a093b60bcc40cf31e47a2336b50311578fc973 (patch)
tree: b2c3f6801774f27a05ac2e7e0e6d644d1ad4f9c8
parent: 51b3f0773f84ef1e3aac56e687f67027c3fb070c (diff)
download: gcc-35a093b60bcc40cf31e47a2336b50311578fc973.zip
gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.gz
gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.bz2
4 files changed, 92 insertions, 28 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 90d91a7..08b3f0d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,4 +1,10 @@
 2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+
+	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
+	always use lane loads to construct non-constant vectors.
+
+2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
 
 	* config/aarch64/aarch64.md
 	(arch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>): Fix register
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8326a4c..974a789 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11053,28 +11053,37 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialised to contain VALS.  */
+
 void
 aarch64_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
+  /* The number of vector elements.  */
   int n_elts = GET_MODE_NUNITS (mode);
+  /* The number of vector elements which are not constant.  */
   int n_var = 0;
   rtx any_const = NULL_RTX;
+  /* The first element of vals.  */
+  rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
       rtx x = XVECEXP (vals, 0, i);
-      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
+      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
 	++n_var;
       else
 	any_const = x;
 
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-	all_same = false;
+      all_same &= rtx_equal_p (x, v0);
     }
 
+  /* No variable elements, hand off to aarch64_simd_make_constant which knows
+     how best to handle this.  */
   if (n_var == 0)
     {
       rtx constant = aarch64_simd_make_constant (vals);
@@ -11088,14 +11097,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, v0);
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* Half the fields (or less) are non-constant.  Load constant then overwrite
-     varying fields.  Hope that this is more efficient than using the stack.  */
-  if (n_var <= n_elts/2)
+  /* Initialise a vector which is part-variable.  We want to first try
+     to build those lanes which are constant in the most efficient way we
+     can.  */
+  if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
 
@@ -11122,31 +11132,21 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	  XVECEXP (copy, 0, i) = subst;
 	}
       aarch64_expand_vector_init (target, copy);
+    }
 
-      /* Insert variables.  */
-      enum insn_code icode = optab_handler (vec_set_optab, mode);
-      gcc_assert (icode != CODE_FOR_nothing);
+  /* Insert the variable lanes directly.  */
 
-      for (int i = 0; i < n_elts; i++)
-	{
-	  rtx x = XVECEXP (vals, 0, i);
-	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-	    continue;
-	  x = copy_to_mode_reg (inner_mode, x);
-	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
-	}
-      return;
-    }
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
 
-  /* Construct the vector in memory one field at a time
-     and load the whole vector.  */
-  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   for (int i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner_mode,
-				    i * GET_MODE_SIZE (inner_mode)),
-		    XVECEXP (vals, 0, i));
-  emit_move_insn (target, mem);
-
+    {
+      rtx x = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	continue;
+      x = copy_to_mode_reg (inner_mode, x);
+      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+    }
 }
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6dd7c2d..cf1da01 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+
+	* gcc.target/aarch64/vector_initialization_nostack.c: New.
+
 2016-02-16  Jakub Jelinek  <jakub@redhat.com>
 
 	PR middle-end/69801
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
new file mode 100644
index 0000000..bbad04d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+float arr_f[100][100];
+float
+f9 (void)
+{
+
+  int i;
+  float sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_f[i][0] * arr_f[0][i];
+  return sum;
+
+}
+
+
+int arr[100][100];
+int
+f10 (void)
+{
+
+  int i;
+  int sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr[i][0] * arr[0][i];
+  return sum;
+
+}
+
+double arr_d[100][100];
+double
+f11 (void)
+{
+  int i;
+  double sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_d[i][0] * arr_d[0][i];
+  return sum;
+}
+
+char arr_c[100][100];
+char
+f12 (void)
+{
+  int i;
+  char sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_c[i][0] * arr_c[0][i];
+  return sum;
+}
+
+
+/* { dg-final { scan-assembler-not "sp" } } */
author	James Greenhalgh <james.greenhalgh@arm.com>	2016-02-16 16:02:09 +0000
committer	James Greenhalgh <jgreenhalgh@gcc.gnu.org>	2016-02-16 16:02:09 +0000
commit	35a093b60bcc40cf31e47a2336b50311578fc973 (patch)
tree	b2c3f6801774f27a05ac2e7e0e6d644d1ad4f9c8
parent	51b3f0773f84ef1e3aac56e687f67027c3fb070c (diff)
download	gcc-35a093b60bcc40cf31e47a2336b50311578fc973.zip gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.gz gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.bz2