Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog                                          |  17
-rw-r--r-- | gcc/testsuite/ChangeLog                                |  11
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c  |   9
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c  |  33
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c  |  34
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c  |   7
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c  |  34
-rw-r--r-- | gcc/tree-vect-data-refs.c                              |   2
-rw-r--r-- | gcc/tree-vect-stmts.c                                  | 127
-rw-r--r-- | gcc/tree-vectorizer.h                                  |   2
10 files changed, 263 insertions, 13 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index fb7a205..c660ff1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -2,6 +2,23 @@
         Alan Hayward  <alan.hayward@arm.com>
         David Sherwood  <david.sherwood@arm.com>
 
+        * tree-vectorizer.h (vect_gather_scatter_fn_p): Declare.
+        * tree-vect-data-refs.c (vect_gather_scatter_fn_p): Make public.
+        * tree-vect-stmts.c (vect_truncate_gather_scatter_offset): New
+        function.
+        (vect_use_strided_gather_scatters_p): Take a masked_p argument.
+        Use vect_truncate_gather_scatter_offset if we can't treat the
+        operation as a normal gather load or scatter store.
+        (get_group_load_store_type): Take the gather_scatter_info
+        as argument.  Try using a gather load or scatter store for
+        single-element groups.
+        (get_load_store_type): Update calls to get_group_load_store_type
+        and vect_use_strided_gather_scatters_p.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+            Alan Hayward  <alan.hayward@arm.com>
+            David Sherwood  <david.sherwood@arm.com>
+
         * tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
         optional tree argument.
         * tree-vect-data-refs.c (vect_check_gather_scatter): Check for
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 30bcb7a..20d84c2 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -2,6 +2,17 @@
         Alan Hayward  <alan.hayward@arm.com>
         David Sherwood  <david.sherwood@arm.com>
 
+        * gcc.target/aarch64/sve/reduc_strict_3.c: Expect FADDA to be used
+        for double_reduc1.
+        * gcc.target/aarch64/sve/strided_load_4.c: New test.
+        * gcc.target/aarch64/sve/strided_load_5.c: Likewise.
+        * gcc.target/aarch64/sve/strided_load_6.c: Likewise.
+        * gcc.target/aarch64/sve/strided_load_7.c: Likewise.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+            Alan Hayward  <alan.hayward@arm.com>
+            David Sherwood  <david.sherwood@arm.com>
+
         * gcc.target/aarch64/sve/strided_load_1.c: New test.
         * gcc.target/aarch64/sve/strided_load_2.c: Likewise.
         * gcc.target/aarch64/sve/strided_load_3.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
index a28145f..a718e9d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
@@ -118,14 +118,11 @@ double_reduc3 (float *restrict i, float *restrict j)
   return l * k;
 }
 
-/* We can't yet handle double_reduc1.  */
-/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 4 } } */
 /* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
 /* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
    is reported three times, once for SVE, once for 128-bit AdvSIMD and once
    for 64-bit AdvSIMD.  */
 /* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
-/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
-   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
-   before failing.  */
-/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */
+/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.  */
+/* { dg-final { scan-tree-dump-times "Detected reduction" 10 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c
new file mode 100644
index 0000000..0eff384
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c
@@ -0,0 +1,33 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                \
+  void __attribute__ ((noinline, noclone))               \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,      \
+                          DATA_TYPE *restrict src, int n) \
+  {                                                      \
+    for (int i = 0; i < n; ++i)                          \
+      dest[i] += src[i * SCALE];                         \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c
new file mode 100644
index 0000000..415b466
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                 \
+  void __attribute__ ((noinline, noclone))                \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,       \
+                          DATA_TYPE *restrict src, long n) \
+  {                                                       \
+    for (long i = 0; i < n; ++i)                          \
+      dest[i] += src[i * SCALE];                          \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c
new file mode 100644
index 0000000..9e00015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c
@@ -0,0 +1,7 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable --save-temps" } */
+
+#include "strided_load_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c
new file mode 100644
index 0000000..3a36367
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)           \
+  void __attribute__ ((noinline, noclone))          \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+                          DATA_TYPE *restrict src)  \
+  {                                                 \
+    for (long i = 0; i < 1000; ++i)                 \
+      dest[i] += src[i * SCALE];                    \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 69721a9..daa8b0c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3312,7 +3312,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    Return true if the function is supported, storing the function id
    in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
 
-static bool
+bool
 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
                           tree memory_type, unsigned int offset_bits,
                           signop offset_sign, int scale,
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 079cbdd..df58834 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1849,18 +1849,117 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   return and_res;
 }
 
+/* Determine whether we can use a gather load or scatter store to vectorize
+   strided load or store STMT by truncating the current offset to a smaller
+   width.  We need to be able to construct an offset vector:
+
+     { 0, X, X*2, X*3, ... }
+
+   without loss of precision, where X is STMT's DR_STEP.
+
+   Return true if this is possible, describing the gather load or scatter
+   store in GS_INFO.  MASKED_P is true if the load or store is conditional.  */
+
+static bool
+vect_truncate_gather_scatter_offset (gimple *stmt, loop_vec_info loop_vinfo,
+                                     bool masked_p,
+                                     gather_scatter_info *gs_info)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  tree step = DR_STEP (dr);
+  if (TREE_CODE (step) != INTEGER_CST)
+    {
+      /* ??? Perhaps we could use range information here?  */
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "cannot truncate variable step.\n");
+      return false;
+    }
+
+  /* Get the number of bits in an element.  */
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+
+  /* Set COUNT to the upper limit on the number of elements - 1.
+     Start with the maximum vectorization factor.  */
+  unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
+
+  /* Try lowering COUNT to the number of scalar latch iterations.  */
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  widest_int max_iters;
+  if (max_loop_iterations (loop, &max_iters)
+      && max_iters < count)
+    count = max_iters.to_shwi ();
+
+  /* Try scales of 1 and the element size.  */
+  int scales[] = { 1, vect_get_scalar_dr_size (dr) };
+  bool overflow_p = false;
+  for (int i = 0; i < 2; ++i)
+    {
+      int scale = scales[i];
+      widest_int factor;
+      if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
+        continue;
+
+      /* See whether we can calculate (COUNT - 1) * STEP / SCALE
+         in OFFSET_BITS bits.  */
+      widest_int range = wi::mul (count, factor, SIGNED, &overflow_p);
+      if (overflow_p)
+        continue;
+      signop sign = range >= 0 ? UNSIGNED : SIGNED;
+      if (wi::min_precision (range, sign) > element_bits)
+        {
+          overflow_p = true;
+          continue;
+        }
+
+      /* See whether the target supports the operation.  */
+      tree memory_type = TREE_TYPE (DR_REF (dr));
+      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+                                     memory_type, element_bits, sign, scale,
+                                     &gs_info->ifn, &gs_info->element_type))
+        continue;
+
+      tree offset_type = build_nonstandard_integer_type (element_bits,
+                                                         sign == UNSIGNED);
+
+      gs_info->decl = NULL_TREE;
+      /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
+         but we don't need to store that here.  */
+      gs_info->base = NULL_TREE;
+      gs_info->offset = fold_convert (offset_type, step);
+      gs_info->offset_dt = vect_unknown_def_type;
+      gs_info->offset_vectype = NULL_TREE;
+      gs_info->scale = scale;
+      gs_info->memory_type = memory_type;
+      return true;
+    }
+
+  if (overflow_p && dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "truncating gather/scatter offset to %d bits"
+                     " might change its value.\n", element_bits);
+
+  return false;
+}
+
 /* Return true if we can use gather/scatter internal functions to
    vectorize STMT, which is a grouped or strided load or store.
-   When returning true, fill in GS_INFO with the information required
-   to perform the operation.  */
+   MASKED_P is true if load or store is conditional.  When returning
+   true, fill in GS_INFO with the information required to perform the
+   operation.  */
 
 static bool
 vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+                                    bool masked_p,
                                     gather_scatter_info *gs_info)
 {
   if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)
       || gs_info->decl)
-    return false;
+    return vect_truncate_gather_scatter_offset (stmt, loop_vinfo,
+                                                masked_p, gs_info);
 
   scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
   unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
@@ -1951,7 +2050,8 @@ vect_get_store_rhs (gimple *stmt)
 static bool
 get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
                            bool masked_p, vec_load_store_type vls_type,
-                           vect_memory_access_type *memory_access_type)
+                           vect_memory_access_type *memory_access_type,
+                           gather_scatter_info *gs_info)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   vec_info *vinfo = stmt_info->vinfo;
@@ -2073,6 +2173,20 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
           overrun_p = would_overrun_p;
         }
     }
+
+  /* As a last resort, trying using a gather load or scatter store.
+
+     ??? Although the code can handle all group sizes correctly,
+     it probably isn't a win to use separate strided accesses based
+     on nearby locations.  Or, even if it's a win over scalar code,
+     it might not be a win over vectorizing at a lower VF, if that
+     allows us to use contiguous accesses.  */
+  if (*memory_access_type == VMAT_ELEMENTWISE
+      && single_element_p
+      && loop_vinfo
+      && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                             masked_p, gs_info))
+    *memory_access_type = VMAT_GATHER_SCATTER;
     }
 
   if (vls_type != VLS_LOAD && first_stmt == stmt)
@@ -2200,14 +2314,15 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
   else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type,
-                                      memory_access_type))
+                                      memory_access_type, gs_info))
         return false;
     }
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
      gcc_assert (!slp);
      if (loop_vinfo
-          && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+          && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                                 masked_p, gs_info))
        *memory_access_type = VMAT_GATHER_SCATTER;
      else
        *memory_access_type = VMAT_ELEMENTWISE;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c661578..903e56e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1455,6 +1455,8 @@ extern bool vect_verify_datarefs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_and_verify_instance_alignment (slp_instance);
 extern bool vect_analyze_data_ref_accesses (vec_info *);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern bool vect_gather_scatter_fn_p (bool, bool, tree, tree, unsigned int,
+                                      signop, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (gimple *, loop_vec_info,
                                        gather_scatter_info *);
 extern bool vect_analyze_data_refs (vec_info *, poly_uint64 *);
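For reference, the loop shape that the new VMAT_GATHER_SCATTER path targets is the one the strided_load_* tests above instantiate through their TEST_LOOP macro: a single-element access with a constant stride.  A minimal standalone sketch follows; the function name and the flags in the comment are illustrative and not part of the patch.

/* Illustrative example, not part of the patch.  With SVE enabled
   (for instance -O2 -ftree-vectorize -march=armv8-a+sve), the strided
   access src[i * 5] can now be vectorized as a gather load whose offset
   vector { 0, 5, 10, ... } is held in 32-bit lanes, provided the check
   added in vect_truncate_gather_scatter_offset can show that the scaled
   offsets fit in the element width.  */
#include <stdint.h>

void
strided_add (int32_t *restrict dest, int32_t *restrict src, int n)
{
  for (int i = 0; i < n; ++i)
    dest[i] += src[i * 5];
}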