diff options
Diffstat (limited to 'gcc')
34 files changed, 1720 insertions, 245 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2c6f9eb..37656fa 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -2,6 +2,64 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * tree-vectorizer.h (vec_lower_bound): New structure. + (_loop_vec_info): Add check_nonzero and lower_bounds. + (LOOP_VINFO_CHECK_NONZERO): New macro. + (LOOP_VINFO_LOWER_BOUNDS): Likewise. + (LOOP_REQUIRES_VERSIONING_FOR_ALIAS): Check lower_bounds too. + * tree-data-ref.h (dr_with_seg_len): Add access_size and align + fields. Make seg_len the distance travelled, not including the + access size. + (dr_direction_indicator): Declare. + (dr_zero_step_indicator): Likewise. + (dr_known_forward_stride_p): Likewise. + * tree-data-ref.c: Include stringpool.h, tree-vrp.h and + tree-ssanames.h. + (runtime_alias_check_p): Allow runtime alias checks with + variable strides. + (operator ==): Compare access_size and align. + (prune_runtime_alias_test_list): Rework for new distinction between + the access_size and seg_len. + (create_intersect_range_checks_index): Likewise. Cope with polynomial + segment lengths. + (get_segment_min_max): New function. + (create_intersect_range_checks): Use it. + (dr_step_indicator): New function. + (dr_direction_indicator): Likewise. + (dr_zero_step_indicator): Likewise. + (dr_known_forward_stride_p): Likewise. + * tree-loop-distribution.c (data_ref_segment_size): Return + DR_STEP * (niters - 1). + (compute_alias_check_pairs): Update call to the dr_with_seg_len + constructor. + * tree-vect-data-refs.c (vect_check_nonzero_value): New function. + (vect_preserves_scalar_order_p): New function, split out from... + (vect_analyze_data_ref_dependence): ...here. Check for zero steps. + (vect_vfa_segment_size): Return DR_STEP * (length_factor - 1). + (vect_vfa_access_size): New function. + (vect_vfa_align): Likewise. + (vect_compile_time_alias): Take access_size_a and access_b arguments. + (dump_lower_bound): New function. 
+ (vect_check_lower_bound): Likewise. + (vect_small_gap_p): Likewise. + (vectorizable_with_step_bound_p): Likewise. + (vect_prune_runtime_alias_test_list): Ignore cross-iteration + dependencies if the vectorization factor is 1. Convert the checks + for nonzero steps into checks on the bounds of DR_STEP. Try using + a bounds check for variable steps if the minimum required step is + relatively small. Update calls to the dr_with_seg_len + constructor and to vect_compile_time_alias. + * tree-vect-loop-manip.c (vect_create_cond_for_lower_bounds): New + function. + (vect_loop_versioning): Call it. + * tree-vect-loop.c (vect_analyze_loop_2): Clear LOOP_VINFO_LOWER_BOUNDS + when retrying. + (vect_estimate_min_profitable_iters): Account for any bounds checks. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * doc/sourcebuild.texi (vect_scatter_store): Document. * optabs.def (scatter_store_optab, mask_scatter_store_optab): New optabs. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9955037..612489f 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -2,6 +2,37 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * gcc.dg/vect/bb-slp-cond-1.c: Expect loop vectorization rather + than SLP vectorization. + * gcc.dg/vect/vect-alias-check-10.c: New test. + * gcc.dg/vect/vect-alias-check-11.c: Likewise. + * gcc.dg/vect/vect-alias-check-12.c: Likewise. + * gcc.dg/vect/vect-alias-check-8.c: Likewise. + * gcc.dg/vect/vect-alias-check-9.c: Likewise. + * gcc.target/aarch64/sve/strided_load_8.c: Likewise. + * gcc.target/aarch64/sve/var_stride_1.c: Likewise. + * gcc.target/aarch64/sve/var_stride_1.h: Likewise. + * gcc.target/aarch64/sve/var_stride_1_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_2.c: Likewise. + * gcc.target/aarch64/sve/var_stride_2_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_3.c: Likewise. 
+ * gcc.target/aarch64/sve/var_stride_3_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_4.c: Likewise. + * gcc.target/aarch64/sve/var_stride_4_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_5.c: Likewise. + * gcc.target/aarch64/sve/var_stride_5_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_6.c: Likewise. + * gcc.target/aarch64/sve/var_stride_6_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_7.c: Likewise. + * gcc.target/aarch64/sve/var_stride_7_run.c: Likewise. + * gcc.target/aarch64/sve/var_stride_8.c: Likewise. + * gcc.target/aarch64/sve/var_stride_8_run.c: Likewise. + * gfortran.dg/vect/vect-alias-check-1.F90: Likewise. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * lib/target-supports.exp (check_effective_target_vect_scatter_store): New proc. * gcc.dg/vect/pr25413a.c: Expect both loops to be optimized on diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c index 2c4c36f..4bd286b 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c @@ -45,10 +45,6 @@ int main () return 0; } -/* Basic blocks of if-converted loops are vectorized from within the loop - vectorizer pass. In this case it is really a deficiency in loop - vectorization data dependence analysis that causes us to require - basic block vectorization in the first place. 
*/ - -/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "vect" { target vect_element_align } } } */ +/* { dg-final { scan-tree-dump {(no need for alias check [^\n]* when VF is 1|no alias between [^\n]* when [^\n]* is outside \(-16, 16\))} "vect" { target vect_element_align } } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized" 1 "vect" { target vect_element_align } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c new file mode 100644 index 0000000..d4eea87 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c @@ -0,0 +1,69 @@ +/* { dg-do run } */ + +#define N 87 +#define M 6 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 5 / 2) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *a, int step) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + a[i * step + 0] = a[i * step + 0] + 1; \ + a[i * step + 1] = a[i * step + 1] + 2; \ + a[i * step + 2] = a[i * step + 2] + 4; \ + a[i * step + 3] = a[i * step + 3] + 8; \ + } \ + } \ + void __attribute__((noinline, noclone)) \ + ref_##TYPE (TYPE *a, int step) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + a[i * step + 0] = a[i * step + 0] + 1; \ + a[i * step + 1] = a[i * step + 1] + 2; \ + a[i * step + 2] = a[i * step + 2] + 4; \ + a[i * step + 3] = a[i * step + 3] + 8; \ + asm volatile (""); \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int j = -M; j <= M; ++j) \ + { \ + TYPE a[N * M], b[N * M]; \ + for (int i = 0; i < N * M; ++i) \ + a[i] = b[i] = TEST_VALUE (i); \ + int offset = (j < 0 ? 
N * M - 4 : 0); \ + test_##TYPE (a + offset, j); \ + ref_##TYPE (b + offset, j); \ + if (__builtin_memcmp (a, b, sizeof (a)) != 0) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c new file mode 100644 index 0000000..601e17f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c @@ -0,0 +1,99 @@ +/* { dg-do run } */ + +#define N 87 +#define M 6 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE1(I) ((I) * 5 / 2) +#define TEST_VALUE2(I) ((I) * 11 / 5) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *restrict a, TYPE *restrict b, \ + int step) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE r1 = a[i * step + 0] += 1; \ + a[i * step + 1] += 2; \ + a[i * step + 2] += 4; \ + a[i * step + 3] += 8; \ + b[i] += r1; \ + } \ + } \ + \ + void __attribute__((noinline, noclone)) \ + ref_##TYPE (TYPE *restrict a, TYPE *restrict b, \ + int step) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE r1 = a[i * step + 0] += 1; \ + a[i * step + 1] += 2; \ + a[i * step + 2] += 4; \ + a[i * step + 3] += 8; \ + b[i] += r1; \ + asm volatile (""); \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int j = -M; j <= M; ++j) \ + { \ + TYPE a1[N * M], a2[N * M], b1[N], b2[N]; \ + for (int i = 0; i < N * M; ++i) \ + a1[i] = a2[i] = TEST_VALUE1 (i); \ + for (int i = 0; i < N; ++i) \ + b1[i] = b2[i] = TEST_VALUE2 (i); \ + int offset = (j < 0 ? 
N * M - 4 : 0); \ + test_##TYPE (a1 + offset, b1, j); \ + ref_##TYPE (a2 + offset, b2, j); \ + if (__builtin_memcmp (a1, a2, sizeof (a1)) != 0) \ + __builtin_abort (); \ + if (__builtin_memcmp (b1, b2, sizeof (b1)) != 0) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* is outside \(-2, 2\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* is outside \(-3, 3\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* is outside \(-4, 4\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]*\) >= 4} "vect" { target vect_int } } } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 2[)]* is outside \(-4, 4\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 2[)]* is outside \(-6, 6\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 2[)]* is outside \(-8, 8\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]* \* 2[)]* >= 8} "vect" { target vect_int } } } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 4[)]* is outside \(-8, 8\)} "vect" { target { vect_int || vect_float } } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 4[)]* is outside \(-12, 12\)} "vect" { target { vect_int || vect_float } } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 4[)]* is outside \(-16, 16\)} "vect" { target { vect_int || vect_float } } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]* \* 4[)]* >= 16} "vect" { target 
{ vect_int || vect_float } } } } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-16, 16\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-24, 24\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-32, 32\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c new file mode 100644 index 0000000..a44c9bb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c @@ -0,0 +1,99 @@ +/* { dg-do run } */ + +#define N 87 +#define M 7 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE1(I) ((I) * 5 / 2) +#define TEST_VALUE2(I) ((I) * 11 / 5) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *restrict a, TYPE *restrict b, \ + int step) \ + { \ + step = step & M; \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE r1 = a[i * step + 0] += 1; \ + a[i * step + 1] += 2; \ + a[i * step + 2] += 4; \ + a[i * step + 3] += 8; \ + b[i] += r1; \ + } \ + } \ + \ + void __attribute__((noinline, noclone)) \ + ref_##TYPE (TYPE *restrict a, TYPE *restrict b, \ + int step) \ + { \ + for (unsigned short i = 0; i < N; ++i) \ + { \ + TYPE r1 = a[i * step + 0] += 1; \ + a[i * step + 1] += 2; \ + a[i * step + 2] += 4; \ + a[i * step + 3] += 8; \ + b[i] += r1; \ + asm volatile (""); \ + } \ + } + 
+#define DO_TEST(TYPE) \ + for (int j = 0; j <= M; ++j) \ + { \ + TYPE a1[N * M], a2[N * M], b1[N], b2[N]; \ + for (int i = 0; i < N * M; ++i) \ + a1[i] = a2[i] = TEST_VALUE1 (i); \ + for (int i = 0; i < N; ++i) \ + b1[i] = b2[i] = TEST_VALUE2 (i); \ + test_##TYPE (a1, b1, j); \ + ref_##TYPE (a2, b2, j); \ + if (__builtin_memcmp (a1, a2, sizeof (a1)) != 0) \ + __builtin_abort (); \ + if (__builtin_memcmp (b1, b2, sizeof (b1)) != 0) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* is outside \[0, 2\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* is outside \[0, 3\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* is outside \[0, 4\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]*\) >= 4} "vect" { target vect_int } } } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 2[)]* is outside \[0, 4\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 2[)]* is outside \[0, 6\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 2[)]* is outside \[0, 8\)} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]* \* 2[)]* >= 8} "vect" { target vect_int } } } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 4[)]* is outside \[0, 8\)} "vect" { target { vect_int || vect_float } }} } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 4[)]* is outside \[0, 12\)} "vect" { target { vect_int || vect_float } }} } */ +/* { dg-final { scan-tree-dump 
{no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 4[)]* is outside \[0, 16\)} "vect" { target { vect_int || vect_float } }} } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]* \* 4[)]* >= 16} "vect" { target { vect_int || vect_float } }} } */ + +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 16\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 24\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 32\)} "vect" { target vect_double } } } */ +/* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c new file mode 100644 index 0000000..5aeaf21 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c @@ -0,0 +1,62 @@ +/* { dg-do run } */ + +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 5 / 2) + +#define ADD_TEST(TYPE) \ + TYPE a_##TYPE[N * 2]; \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (int x, int y) \ + { \ + for (int i = 0; i < N; ++i) \ + a_##TYPE[i + x] += a_##TYPE[i + y]; \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + for (int j = 0; j < N + DIST * 2; ++j) \ + a_##TYPE[j] = TEST_VALUE (j); \ + test_##TYPE (i, DIST); \ + for (int j = 0; j < N + DIST * 2; ++j) \ + { \ + TYPE expected; \ + if (j < i || j >= i + N) 
\ + expected = TEST_VALUE (j); \ + else if (i <= DIST) \ + expected = ((TYPE) TEST_VALUE (j) \ + + (TYPE) TEST_VALUE (j - i + DIST)); \ + else \ + expected = ((TYPE) TEST_VALUE (j) \ + + a_##TYPE[j - i + DIST]); \ + if (expected != a_##TYPE[j]) \ + __builtin_abort (); \ + } \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c new file mode 100644 index 0000000..9bc38af --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c @@ -0,0 +1,55 @@ +/* { dg-do run } */ + +#define N 200 +#define M 4 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 5 / 2) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *a, TYPE *b) \ + { \ + for (int i = 0; i < N; i += 2) \ + { \ + a[i + 0] = b[i + 0] + 2; \ + a[i + 1] = b[i + 1] + 3; \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int j = 1; j < M; ++j) \ + { \ + TYPE a[N + M]; \ + for (int i = 0; i < N + M; ++i) \ + a[i] = TEST_VALUE (i); \ + test_##TYPE (a + j, a); \ + for (int i = 0; i < N; i += 2) \ + if (a[i + j] != (TYPE) (a[i] + 2) \ + || a[i + j + 1] != (TYPE) (a[i + 1] + 3)) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_8.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_8.c new file mode 100644 index 0000000..cd61801 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_8.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 
-ftree-vectorize" } */ + +void +foo (double *x, int m) +{ + for (int i = 0; i < 256; ++i) + x[i * m] += x[i * m]; +} + +/* { dg-final { scan-assembler-times {\tcbz\tw1,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, } 1 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, } 1 } } */ +/* { dg-final { scan-assembler-times {\tldr\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tstr\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c new file mode 100644 index 0000000..68baba9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE int +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * n]; +} + +/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ +/* { dg-final { scan-assembler-not {, 1024} } } */ +/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ +/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ +/* Two range checks and a check for n being zero. 
*/ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.h b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.h new file mode 100644 index 0000000..43b098f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.h @@ -0,0 +1,61 @@ +extern void abort (void) __attribute__ ((noreturn)); + +#define MARGIN 6 + +void __attribute__ ((weak, optimize ("no-tree-vectorize"))) +test (int n, int m, int offset) +{ + int abs_n = (n < 0 ? -n : n); + int abs_m = (m < 0 ? -m : m); + int max_i = (abs_n > abs_m ? abs_n : abs_m); + int abs_offset = (offset < 0 ? -offset : offset); + int size = MARGIN * 2 + max_i * SIZE + abs_offset; + TYPE *array = (TYPE *) __builtin_alloca (size * sizeof (TYPE)); + for (int i = 0; i < size; ++i) + array[i] = i; + int base_x = offset < 0 ? MARGIN - offset : MARGIN; + int base_y = offset < 0 ? MARGIN : MARGIN + offset; + int start_x = n < 0 ? base_x - n * (SIZE - 1) : base_x; + int start_y = m < 0 ? base_y - m * (SIZE - 1) : base_y; + f (&array[start_x], &array[start_y], n, m); + int j = 0; + int start = (n < 0 ? size - 1 : 0); + int end = (n < 0 ? -1 : size); + int inc = (n < 0 ? -1 : 1); + for (int i = start; i != end; i += inc) + { + if (j == SIZE || i != start_x + j * n) + { + if (array[i] != i) + abort (); + } + else if (n == 0) + { + TYPE sum = i; + for (; j < SIZE; j++) + { + int next_y = start_y + j * m; + if (n >= 0 ? next_y < i : next_y > i) + sum += array[next_y]; + else if (next_y == i) + sum += sum; + else + sum += next_y; + } + if (array[i] != sum) + abort (); + } + else + { + int next_y = start_y + j * m; + TYPE base = i; + if (n >= 0 ? 
next_y < i : next_y > i) + base += array[next_y]; + else + base += next_y; + if (array[i] != base) + abort (); + j += 1; + } + } +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1_run.c new file mode 100644 index 0000000..65bb9fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1_run.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_1.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = 0; n < 10; ++n) + for (int offset = -33; offset <= 33; ++offset) + test (n, n, offset); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c new file mode 100644 index 0000000..112e84f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE int +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * m]; +} + +/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +/* Should multiply by (257-1)*4 rather than (VF-1)*4. */ +/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\n} 2 } } */ +/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ +/* Two range checks and a check for n being zero. (m being zero is OK.) 
*/ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2_run.c new file mode 100644 index 0000000..313136a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2_run.c @@ -0,0 +1,18 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_2.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = 0; n < 10; ++n) + for (int m = 0; m < 10; ++m) + for (int offset = -17; offset <= 17; ++offset) + { + test (n, m, offset); + test (n, m, offset + n * (SIZE - 1)); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c new file mode 100644 index 0000000..70792ff --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE int +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, int n, long m __attribute__((unused))) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * n]; +} + +/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ +/* { dg-final { scan-assembler-not {, 1024} } } */ +/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ +/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +/* Two range checks and a check for n being zero. 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3_run.c new file mode 100644 index 0000000..e10e70d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3_run.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_3.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int offset = -33; offset <= 33; ++offset) + test (n, n, offset); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c new file mode 100644 index 0000000..4bcdb5d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE int +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, int n, int m) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * m]; +} + +/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +/* Should multiply by (257-1)*4 rather than (VF-1)*4. */ +/* { dg-final { scan-assembler-times {\tlsl\tx[0-9]+, x[0-9]+, 10\n} 2 } } */ +/* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ +/* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */ +/* Two range checks and a check for n being zero. (m being zero is OK.) 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4_run.c new file mode 100644 index 0000000..7b8645e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4_run.c @@ -0,0 +1,18 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_4.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int m = -10; m < 10; ++m) + for (int offset = -17; offset <= 17; ++offset) + { + test (n, m, offset); + test (n, m, offset + n * (SIZE - 1)); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c new file mode 100644 index 0000000..688f3be6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE double +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, long n, long m __attribute__((unused))) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * n]; +} + +/* { dg-final { scan-assembler {\tld1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\td[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\td[0-9]+} } } */ +/* Should multiply by (VF-1)*8 rather than (257-1)*8. */ +/* { dg-final { scan-assembler-not {, 2048} } } */ +/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ +/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]11} } } */ +/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +/* Two range checks and a check for n being zero. 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5_run.c new file mode 100644 index 0000000..3360d0e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5_run.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_5.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int offset = -33; offset <= 33; ++offset) + test (n, n, offset); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6.c new file mode 100644 index 0000000..37e5d0f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE long +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, long n, long m) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * m]; +} + +/* { dg-final { scan-assembler {\tld1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tx[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tx[0-9]+} } } */ +/* Should multiply by (257-1)*8 rather than (VF-1)*8. */ +/* { dg-final { scan-assembler-times {lsl\tx[0-9]+, x[0-9]+, 11} 2 } } */ +/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */ +/* Two range checks and a check for n being zero. (m being zero is OK.) 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6_run.c new file mode 100644 index 0000000..8efaf4b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_6_run.c @@ -0,0 +1,18 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_6.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int m = -10; m < 10; ++m) + for (int offset = -17; offset <= 17; ++offset) + { + test (n, m, offset); + test (n, m, offset + n * (SIZE - 1)); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7.c new file mode 100644 index 0000000..65e2035 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE double +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, long n, long m __attribute__((unused))) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i]; +} + +/* { dg-final { scan-assembler {\tld1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\td[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\td[0-9]+} } } */ +/* Should multiply by (257-1)*8 rather than (VF-1)*8. */ +/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x1, 2048} 1 } } */ +/* { dg-final { scan-assembler-times {lsl\tx[0-9]+, x[0-9]+, 11} 1 } } */ +/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +/* Two range checks and a check for n being zero. 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7_run.c new file mode 100644 index 0000000..5795145 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_7_run.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_7.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int offset = -33; offset <= 33; ++offset) + test (n, 1, offset); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8.c new file mode 100644 index 0000000..5224a5f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TYPE long +#define SIZE 257 + +void +f (TYPE *x, TYPE *y, long n __attribute__((unused)), long m) +{ + for (int i = 0; i < SIZE; ++i) + x[i] += y[i * m]; +} + +/* { dg-final { scan-assembler {\tld1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tx[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tx[0-9]+} } } */ +/* Should multiply by (257-1)*8 rather than (VF-1)*8. */ +/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x0, 2048} 1 } } */ +/* { dg-final { scan-assembler-times {lsl\tx[0-9]+, x[0-9]+, 11} 1 } } */ +/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +/* Two range checks only; doesn't matter whether n is zero. 
*/ +/* { dg-final { scan-assembler {\tcmp\t} } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8_run.c new file mode 100644 index 0000000..6d4b2a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_8_run.c @@ -0,0 +1,14 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "var_stride_8.c" +#include "var_stride_1.h" + +int +main (void) +{ + for (int n = -10; n < 10; ++n) + for (int offset = -33; offset <= 33; ++offset) + test (1, n, offset); + return 0; +} diff --git a/gcc/testsuite/gfortran.dg/vect/vect-alias-check-1.F90 b/gcc/testsuite/gfortran.dg/vect/vect-alias-check-1.F90 new file mode 100644 index 0000000..ea9ba85 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/vect/vect-alias-check-1.F90 @@ -0,0 +1,102 @@ +! { dg-do run } +! { dg-additional-options "-fno-inline" } + +#define N 200 + +#define TEST_VALUE(I) ((I) * 5 / 2) + +subroutine setup(a) + real :: a(N) + do i = 1, N + a(i) = TEST_VALUE(i) + end do +end subroutine + +subroutine check(a, x, gap) + real :: a(N), temp, x + integer :: gap + do i = 1, N - gap + temp = a(i + gap) + x + if (a(i) /= temp) call abort + end do + do i = N - gap + 1, N + temp = TEST_VALUE(i) + if (a(i) /= temp) call abort + end do +end subroutine + +subroutine testa(a, x, base, n) + real :: a(n), x + integer :: base, n + do i = n, 2, -1 + a(base + i - 1) = a(base + i) + x + end do +end subroutine testa + +subroutine testb(a, x, base, n) + real :: a(n), x + integer :: base + do i = n, 4, -1 + a(base + i - 3) = a(base + i) + x + end do +end subroutine testb + +subroutine testc(a, x, base, n) + real :: a(n), x + integer :: base + do i = n, 8, -1 + a(base + i - 7) = a(base + i) + x + end do +end subroutine testc + +subroutine testd(a, x, base, n) + real :: a(n), x + integer :: base + do i = n, 16, -1 + a(base + i - 15) = a(base + i) + 
x + end do +end subroutine testd + +subroutine teste(a, x, base, n) + real :: a(n), x + integer :: base + do i = n, 32, -1 + a(base + i - 31) = a(base + i) + x + end do +end subroutine teste + +subroutine testf(a, x, base, n) + real :: a(n), x + integer :: base + do i = n, 64, -1 + a(base + i - 63) = a(base + i) + x + end do +end subroutine testf + +program main + real :: a(N) + + call setup(a) + call testa(a, 91.0, 0, N) + call check(a, 91.0, 1) + + call setup(a) + call testb(a, 55.0, 0, N) + call check(a, 55.0, 3) + + call setup(a) + call testc(a, 72.0, 0, N) + call check(a, 72.0, 7) + + call setup(a) + call testd(a, 69.0, 0, N) + call check(a, 69.0, 15) + + call setup(a) + call teste(a, 44.0, 0, N) + call check(a, 44.0, 31) + + call setup(a) + call testf(a, 39.0, 0, N) + call check(a, 39.0, 63) +end program diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index 3b15b5d..e39067d 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -95,6 +95,9 @@ along with GCC; see the file COPYING3. If not see #include "tree-affine.h" #include "params.h" #include "builtins.h" +#include "stringpool.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" static struct datadep_stats { @@ -1305,18 +1308,6 @@ runtime_alias_check_p (ddr_p ddr, struct loop *loop, bool speed_p) return false; } - /* FORNOW: We don't support creating runtime alias tests for non-constant - step. 
*/ - if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST - || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST) - { - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "runtime alias check not supported for non-constant " - "step\n"); - return false; - } - return true; } @@ -1331,11 +1322,13 @@ static bool operator == (const dr_with_seg_len& d1, const dr_with_seg_len& d2) { - return operand_equal_p (DR_BASE_ADDRESS (d1.dr), - DR_BASE_ADDRESS (d2.dr), 0) - && data_ref_compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0 - && data_ref_compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0 - && data_ref_compare_tree (d1.seg_len, d2.seg_len) == 0; + return (operand_equal_p (DR_BASE_ADDRESS (d1.dr), + DR_BASE_ADDRESS (d2.dr), 0) + && data_ref_compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0 + && data_ref_compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0 + && data_ref_compare_tree (d1.seg_len, d2.seg_len) == 0 + && known_eq (d1.access_size, d2.access_size) + && d1.align == d2.align); } /* Comparison function for sorting objects of dr_with_seg_len_pair_t @@ -1415,7 +1408,7 @@ comp_dr_with_seg_len_pair (const void *pa_, const void *pb_) void prune_runtime_alias_test_list (vec<dr_with_seg_len_pair_t> *alias_pairs, - poly_uint64 factor) + poly_uint64) { /* Sort the collected data ref pairs so that we can scan them once to combine all possible aliasing checks. */ @@ -1461,6 +1454,8 @@ prune_runtime_alias_test_list (vec<dr_with_seg_len_pair_t> *alias_pairs, } poly_int64 init_a1, init_a2; + /* Only consider cases in which the distance between the initial + DR_A1 and the initial DR_A2 is known at compile time. */ if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), DR_BASE_ADDRESS (dr_a2->dr), 0) || !operand_equal_p (DR_OFFSET (dr_a1->dr), @@ -1480,141 +1475,79 @@ prune_runtime_alias_test_list (vec<dr_with_seg_len_pair_t> *alias_pairs, std::swap (init_a1, init_a2); } - /* Only merge const step data references. 
*/ - poly_int64 step_a1, step_a2; - if (!poly_int_tree_p (DR_STEP (dr_a1->dr), &step_a1) - || !poly_int_tree_p (DR_STEP (dr_a2->dr), &step_a2)) - continue; + /* Work out what the segment length would be if we did combine + DR_A1 and DR_A2: - bool neg_step = maybe_lt (step_a1, 0) || maybe_lt (step_a2, 0); + - If DR_A1 and DR_A2 have equal lengths, that length is + also the combined length. - /* DR_A1 and DR_A2 must go in the same direction. */ - if (neg_step && (maybe_gt (step_a1, 0) || maybe_gt (step_a2, 0))) - continue; + - If DR_A1 and DR_A2 both have negative "lengths", the combined + length is the lower bound on those lengths. - poly_uint64 seg_len_a1 = 0, seg_len_a2 = 0; - bool const_seg_len_a1 = poly_int_tree_p (dr_a1->seg_len, - &seg_len_a1); - bool const_seg_len_a2 = poly_int_tree_p (dr_a2->seg_len, - &seg_len_a2); - - /* We need to compute merged segment length at compilation time for - dr_a1 and dr_a2, which is impossible if either one has non-const - segment length. */ - if ((!const_seg_len_a1 || !const_seg_len_a2) - && maybe_ne (step_a1, step_a2)) - continue; + - If DR_A1 and DR_A2 both have positive lengths, the combined + length is the upper bound on those lengths. - bool do_remove = false; - poly_uint64 diff = init_a2 - init_a1; - poly_uint64 min_seg_len_b; - tree new_seg_len; + Other cases are unlikely to give a useful combination. - if (!poly_int_tree_p (dr_b1->seg_len, &min_seg_len_b)) + The lengths both have sizetype, so the sign is taken from + the step instead. */ + if (!operand_equal_p (dr_a1->seg_len, dr_a2->seg_len, 0)) { - tree step_b = DR_STEP (dr_b1->dr); - if (!tree_fits_shwi_p (step_b)) + poly_uint64 seg_len_a1, seg_len_a2; + if (!poly_int_tree_p (dr_a1->seg_len, &seg_len_a1) + || !poly_int_tree_p (dr_a2->seg_len, &seg_len_a2)) continue; - min_seg_len_b = factor * abs_hwi (tree_to_shwi (step_b)); - } - - /* Now we try to merge alias check dr_a1 & dr_b and dr_a2 & dr_b. 
- - Case A: - check if the following condition is satisfied: - - DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B - where DIFF = DR_A2_INIT - DR_A1_INIT. However, - SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we - have to make a best estimation. We can get the minimum value - of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, - then either of the following two conditions can guarantee the - one above: + tree indicator_a = dr_direction_indicator (dr_a1->dr); + if (TREE_CODE (indicator_a) != INTEGER_CST) + continue; - 1: DIFF <= MIN_SEG_LEN_B - 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B - Because DIFF - SEGMENT_LENGTH_A is done in sizetype, we need - to take care of wrapping behavior in it. + tree indicator_b = dr_direction_indicator (dr_a2->dr); + if (TREE_CODE (indicator_b) != INTEGER_CST) + continue; - Case B: - If the left segment does not extend beyond the start of the - right segment the new segment length is that of the right - plus the segment distance. The condition is like: + int sign_a = tree_int_cst_sgn (indicator_a); + int sign_b = tree_int_cst_sgn (indicator_b); - DIFF >= SEGMENT_LENGTH_A ;SEGMENT_LENGTH_A is a constant. + poly_uint64 new_seg_len; + if (sign_a <= 0 && sign_b <= 0) + new_seg_len = lower_bound (seg_len_a1, seg_len_a2); + else if (sign_a >= 0 && sign_b >= 0) + new_seg_len = upper_bound (seg_len_a1, seg_len_a2); + else + continue; - Note 1: Case A.2 and B combined together effectively merges every - dr_a1 & dr_b and dr_a2 & dr_b when SEGMENT_LENGTH_A is const. + dr_a1->seg_len = build_int_cst (TREE_TYPE (dr_a1->seg_len), + new_seg_len); + dr_a1->align = MIN (dr_a1->align, known_alignment (new_seg_len)); + } - Note 2: Above description is based on positive DR_STEP, we need to - take care of negative DR_STEP for wrapping behavior. See PR80815 - for more information. */ - if (neg_step) - { - /* Adjust diff according to access size of both references. 
*/ - diff += tree_to_poly_uint64 - (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a2->dr)))); - diff -= tree_to_poly_uint64 - (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a1->dr)))); - /* Case A.1. */ - if (known_le (diff, min_seg_len_b) - /* Case A.2 and B combined. */ - || const_seg_len_a2) - { - if (const_seg_len_a1 || const_seg_len_a2) - new_seg_len - = build_int_cstu (sizetype, - lower_bound (seg_len_a1 - diff, - seg_len_a2)); - else - new_seg_len - = size_binop (MINUS_EXPR, dr_a2->seg_len, - build_int_cstu (sizetype, diff)); + /* This is always positive due to the swap above. */ + poly_uint64 diff = init_a2 - init_a1; - dr_a2->seg_len = new_seg_len; - do_remove = true; - } - } - else + /* The new check will start at DR_A1. Make sure that its access + size encompasses the initial DR_A2. */ + if (maybe_lt (dr_a1->access_size, diff + dr_a2->access_size)) { - /* Case A.1. */ - if (known_le (diff, min_seg_len_b) - /* Case A.2 and B combined. */ - || const_seg_len_a1) - { - if (const_seg_len_a1 && const_seg_len_a2) - new_seg_len - = build_int_cstu (sizetype, - upper_bound (seg_len_a2 + diff, - seg_len_a1)); - else - new_seg_len - = size_binop (PLUS_EXPR, dr_a2->seg_len, - build_int_cstu (sizetype, diff)); - - dr_a1->seg_len = new_seg_len; - do_remove = true; - } + dr_a1->access_size = upper_bound (dr_a1->access_size, + diff + dr_a2->access_size); + unsigned int new_align = known_alignment (dr_a1->access_size); + dr_a1->align = MIN (dr_a1->align, new_align); } - - if (do_remove) + if (dump_enabled_p ()) { - if (dump_enabled_p ()) - { - dump_printf (MSG_NOTE, "merging ranges for "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); - dump_printf (MSG_NOTE, "\n"); - } - 
alias_pairs->ordered_remove (neg_step ? i - 1 : i); - i--; + dump_printf (MSG_NOTE, "merging ranges for "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); + dump_printf (MSG_NOTE, ", "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); + dump_printf (MSG_NOTE, " and "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); + dump_printf (MSG_NOTE, ", "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); + dump_printf (MSG_NOTE, "\n"); } + alias_pairs->ordered_remove (i); + i--; } } } @@ -1654,7 +1587,9 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, || DR_NUM_DIMENSIONS (dr_a.dr) != DR_NUM_DIMENSIONS (dr_b.dr)) return false; - if (!tree_fits_uhwi_p (dr_a.seg_len) || !tree_fits_uhwi_p (dr_b.seg_len)) + poly_uint64 seg_len1, seg_len2; + if (!poly_int_tree_p (dr_a.seg_len, &seg_len1) + || !poly_int_tree_p (dr_b.seg_len, &seg_len2)) return false; if (!tree_fits_shwi_p (DR_STEP (dr_a.dr))) @@ -1669,19 +1604,42 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, gcc_assert (TREE_CODE (DR_STEP (dr_a.dr)) == INTEGER_CST); bool neg_step = tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0; - unsigned HOST_WIDE_INT abs_step - = absu_hwi (tree_to_shwi (DR_STEP (dr_a.dr))); + unsigned HOST_WIDE_INT abs_step = tree_to_shwi (DR_STEP (dr_a.dr)); + if (neg_step) + { + abs_step = -abs_step; + seg_len1 = -seg_len1; + seg_len2 = -seg_len2; + } + else + { + /* Include the access size in the length, so that we only have one + tree addition below. */ + seg_len1 += dr_a.access_size; + seg_len2 += dr_b.access_size; + } - unsigned HOST_WIDE_INT seg_len1 = tree_to_uhwi (dr_a.seg_len); - unsigned HOST_WIDE_INT seg_len2 = tree_to_uhwi (dr_b.seg_len); /* Infer the number of iterations with which the memory segment is accessed by DR. 
In other words, alias is checked if memory segment accessed by DR_A in some iterations intersects with memory segment accessed by DR_B in the same number of iterations. Note segment length is a linear function of number of iterations with DR_STEP as the coefficient. */ - unsigned HOST_WIDE_INT niter_len1 = (seg_len1 + abs_step - 1) / abs_step; - unsigned HOST_WIDE_INT niter_len2 = (seg_len2 + abs_step - 1) / abs_step; + poly_uint64 niter_len1, niter_len2; + if (!can_div_trunc_p (seg_len1 + abs_step - 1, abs_step, &niter_len1) + || !can_div_trunc_p (seg_len2 + abs_step - 1, abs_step, &niter_len2)) + return false; + + poly_uint64 niter_access1 = 0, niter_access2 = 0; + if (neg_step) + { + /* Divide each access size by the byte step, rounding up. */ + if (!can_div_trunc_p (dr_a.access_size + abs_step - 1, + abs_step, &niter_access1) + || !can_div_trunc_p (dr_b.access_size + abs_step - 1, + abs_step, &niter_access2)) + return false; + } unsigned int i; for (i = 0; i < DR_NUM_DIMENSIONS (dr_a.dr); i++) @@ -1732,12 +1690,22 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, /* Adjust ranges for negative step. */ if (neg_step) { - min1 = fold_build2 (MINUS_EXPR, TREE_TYPE (min1), max1, idx_step); - max1 = fold_build2 (MINUS_EXPR, TREE_TYPE (min1), - CHREC_LEFT (access1), idx_step); - min2 = fold_build2 (MINUS_EXPR, TREE_TYPE (min2), max2, idx_step); - max2 = fold_build2 (MINUS_EXPR, TREE_TYPE (min2), - CHREC_LEFT (access2), idx_step); + /* IDX_LEN1 and IDX_LEN2 are negative in this case. */ + std::swap (min1, max1); + std::swap (min2, max2); + + /* As with the lengths just calculated, we've measured the access + sizes in iterations, so multiply them by the index step. 
*/ + tree idx_access1 + = fold_build2 (MULT_EXPR, TREE_TYPE (min1), idx_step, + build_int_cst (TREE_TYPE (min1), niter_access1)); + tree idx_access2 + = fold_build2 (MULT_EXPR, TREE_TYPE (min2), idx_step, + build_int_cst (TREE_TYPE (min2), niter_access2)); + + /* MINUS_EXPR because the above values are negative. */ + max1 = fold_build2 (MINUS_EXPR, TREE_TYPE (max1), max1, idx_access1); + max2 = fold_build2 (MINUS_EXPR, TREE_TYPE (max2), max2, idx_access2); } tree part_cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, @@ -1752,6 +1720,89 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, return true; } +/* If ALIGN is nonzero, set up *SEG_MIN_OUT and *SEG_MAX_OUT so that for + every address ADDR accessed by D: + + *SEG_MIN_OUT <= ADDR (== ADDR & -ALIGN) <= *SEG_MAX_OUT + + In this case, every element accessed by D is aligned to at least + ALIGN bytes. + + If ALIGN is zero then instead set *SEG_MAX_OUT so that: + + *SEG_MIN_OUT <= ADDR < *SEG_MAX_OUT. */ + +static void +get_segment_min_max (const dr_with_seg_len &d, tree *seg_min_out, + tree *seg_max_out, HOST_WIDE_INT align) +{ + /* Each access has the following pattern: + + <- |seg_len| -> + <--- A: -ve step ---> + +-----+-------+-----+-------+-----+ + | n-1 | ,.... | 0 | ..... | n-1 | + +-----+-------+-----+-------+-----+ + <--- B: +ve step ---> + <- |seg_len| -> + | + base address + + where "n" is the number of scalar iterations covered by the segment. + (This should be VF for a particular pair if we know that both steps + are the same, otherwise it will be the full number of scalar loop + iterations.) + + A is the range of bytes accessed when the step is negative, + B is the range when the step is positive. + + If the access size is "access_size" bytes, the lowest addressed byte is: + + base + (step < 0 ? seg_len : 0) [LB] + + and the highest addressed byte is always below: + + base + (step < 0 ? 
0 : seg_len) + access_size [UB] + + Thus: + + LB <= ADDR < UB + + If ALIGN is nonzero, all three values are aligned to at least ALIGN + bytes, so: + + LB <= ADDR <= UB - ALIGN + + where "- ALIGN" folds naturally with the "+ access_size" and often + cancels it out. + + We don't try to simplify LB and UB beyond this (e.g. by using + MIN and MAX based on whether seg_len rather than the stride is + negative) because it is possible for the absolute size of the + segment to overflow the range of a ssize_t. + + Keeping the pointer_plus outside of the cond_expr should allow + the cond_exprs to be shared with other alias checks. */ + tree indicator = dr_direction_indicator (d.dr); + tree neg_step = fold_build2 (LT_EXPR, boolean_type_node, + fold_convert (ssizetype, indicator), + ssize_int (0)); + tree addr_base = fold_build_pointer_plus (DR_BASE_ADDRESS (d.dr), + DR_OFFSET (d.dr)); + addr_base = fold_build_pointer_plus (addr_base, DR_INIT (d.dr)); + tree seg_len = fold_convert (sizetype, d.seg_len); + + tree min_reach = fold_build3 (COND_EXPR, sizetype, neg_step, + seg_len, size_zero_node); + tree max_reach = fold_build3 (COND_EXPR, sizetype, neg_step, + size_zero_node, seg_len); + max_reach = fold_build2 (PLUS_EXPR, sizetype, max_reach, + size_int (d.access_size - align)); + + *seg_min_out = fold_build_pointer_plus (addr_base, min_reach); + *seg_max_out = fold_build_pointer_plus (addr_base, max_reach); +} + /* Given two data references and segment lengths described by DR_A and DR_B, create expression checking if the two addresses ranges intersect with each other: @@ -1768,43 +1819,48 @@ create_intersect_range_checks (struct loop *loop, tree *cond_expr, if (create_intersect_range_checks_index (loop, cond_expr, dr_a, dr_b)) return; - tree segment_length_a = dr_a.seg_len; - tree segment_length_b = dr_b.seg_len; - tree addr_base_a = DR_BASE_ADDRESS (dr_a.dr); - tree addr_base_b = DR_BASE_ADDRESS (dr_b.dr); - tree offset_a = DR_OFFSET (dr_a.dr), offset_b = DR_OFFSET (dr_b.dr); 
- - offset_a = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_a), - offset_a, DR_INIT (dr_a.dr)); - offset_b = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_b), - offset_b, DR_INIT (dr_b.dr)); - addr_base_a = fold_build_pointer_plus (addr_base_a, offset_a); - addr_base_b = fold_build_pointer_plus (addr_base_b, offset_b); - - tree seg_a_min = addr_base_a; - tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a); - /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT - bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of - [a, a+12) */ - if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0) + unsigned HOST_WIDE_INT min_align; + tree_code cmp_code; + if (TREE_CODE (DR_STEP (dr_a.dr)) == INTEGER_CST + && TREE_CODE (DR_STEP (dr_b.dr)) == INTEGER_CST) { - tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))); - seg_a_min = fold_build_pointer_plus (seg_a_max, unit_size); - seg_a_max = fold_build_pointer_plus (addr_base_a, unit_size); + /* In this case adding access_size to seg_len is likely to give + a simple X * step, where X is either the number of scalar + iterations or the vectorization factor. We're better off + keeping that, rather than subtracting an alignment from it. + + In this case the maximum values are exclusive and so there is + no alias if the maximum of one segment equals the minimum + of another. */ + min_align = 0; + cmp_code = LE_EXPR; } - - tree seg_b_min = addr_base_b; - tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b); - if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0) + else { - tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b.dr))); - seg_b_min = fold_build_pointer_plus (seg_b_max, unit_size); - seg_b_max = fold_build_pointer_plus (addr_base_b, unit_size); + /* Calculate the minimum alignment shared by all four pointers, + then arrange for this alignment to be subtracted from the + exclusive maximum values to get inclusive maximum values. 
+ This "- min_align" is cumulative with a "+ access_size" + in the calculation of the maximum values. In the best + (and common) case, the two cancel each other out, leaving + us with an inclusive bound based only on seg_len. In the + worst case we're simply adding a smaller number than before. + + Because the maximum values are inclusive, there is an alias + if the maximum value of one segment is equal to the minimum + value of the other. */ + min_align = MIN (dr_a.align, dr_b.align); + cmp_code = LT_EXPR; } + + tree seg_a_min, seg_a_max, seg_b_min, seg_b_max; + get_segment_min_max (dr_a, &seg_a_min, &seg_a_max, min_align); + get_segment_min_max (dr_b, &seg_b_min, &seg_b_max, min_align); + *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, - fold_build2 (LE_EXPR, boolean_type_node, seg_a_max, seg_b_min), - fold_build2 (LE_EXPR, boolean_type_node, seg_b_max, seg_a_min)); + fold_build2 (cmp_code, boolean_type_node, seg_a_max, seg_b_min), + fold_build2 (cmp_code, boolean_type_node, seg_b_max, seg_a_min)); } /* Create a conditional expression that represents the run-time checks for @@ -5271,3 +5327,90 @@ free_data_refs (vec<data_reference_p> datarefs) free_data_ref (dr); datarefs.release (); } + +/* Common routine implementing both dr_direction_indicator and + dr_zero_step_indicator. Return USEFUL_MIN if the indicator is known + to be >= USEFUL_MIN and -1 if the indicator is known to be negative. + Return the step as the indicator otherwise. */ + +static tree +dr_step_indicator (struct data_reference *dr, int useful_min) +{ + tree step = DR_STEP (dr); + STRIP_NOPS (step); + /* Look for cases where the step is scaled by a positive constant + integer, which will often be the access size. If the multiplication + doesn't change the sign (due to overflow effects) then we can + test the unscaled value instead. 
*/ + if (TREE_CODE (step) == MULT_EXPR + && TREE_CODE (TREE_OPERAND (step, 1)) == INTEGER_CST + && tree_int_cst_sgn (TREE_OPERAND (step, 1)) > 0) + { + tree factor = TREE_OPERAND (step, 1); + step = TREE_OPERAND (step, 0); + + /* Strip widening and truncating conversions as well as nops. */ + if (CONVERT_EXPR_P (step) + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (step, 0)))) + step = TREE_OPERAND (step, 0); + tree type = TREE_TYPE (step); + + /* Get the range of step values that would not cause overflow. */ + widest_int minv = (wi::to_widest (TYPE_MIN_VALUE (ssizetype)) + / wi::to_widest (factor)); + widest_int maxv = (wi::to_widest (TYPE_MAX_VALUE (ssizetype)) + / wi::to_widest (factor)); + + /* Get the range of values that the unconverted step actually has. */ + wide_int step_min, step_max; + if (TREE_CODE (step) != SSA_NAME + || get_range_info (step, &step_min, &step_max) != VR_RANGE) + { + step_min = wi::to_wide (TYPE_MIN_VALUE (type)); + step_max = wi::to_wide (TYPE_MAX_VALUE (type)); + } + + /* Check whether the unconverted step has an acceptable range. */ + signop sgn = TYPE_SIGN (type); + if (wi::les_p (minv, widest_int::from (step_min, sgn)) + && wi::ges_p (maxv, widest_int::from (step_max, sgn))) + { + if (wi::ge_p (step_min, useful_min, sgn)) + return ssize_int (useful_min); + else if (wi::lt_p (step_max, 0, sgn)) + return ssize_int (-1); + else + return fold_convert (ssizetype, step); + } + } + return DR_STEP (dr); +} + +/* Return a value that is negative iff DR has a negative step. */ + +tree +dr_direction_indicator (struct data_reference *dr) +{ + return dr_step_indicator (dr, 0); +} + +/* Return a value that is zero iff DR has a zero step. */ + +tree +dr_zero_step_indicator (struct data_reference *dr) +{ + return dr_step_indicator (dr, 1); +} + +/* Return true if DR is known to have a nonnegative (but possibly zero) + step. 
*/ + +bool +dr_known_forward_stride_p (struct data_reference *dr) +{ + tree indicator = dr_direction_indicator (dr); + tree neg_step_val = fold_binary (LT_EXPR, boolean_type_node, + fold_convert (ssizetype, indicator), + ssize_int (0)); + return neg_step_val && integer_zerop (neg_step_val); +} diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 6df38a5..63094a7 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -203,11 +203,20 @@ typedef struct data_reference *data_reference_p; struct dr_with_seg_len { - dr_with_seg_len (data_reference_p d, tree len) - : dr (d), seg_len (len) {} + dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size, + unsigned int a) + : dr (d), seg_len (len), access_size (size), align (a) {} data_reference_p dr; + /* The offset of the last access that needs to be checked minus + the offset of the first. */ tree seg_len; + /* A value that, when added to abs (SEG_LEN), gives the total number of + bytes in the segment. */ + poly_uint64 access_size; + /* The minimum common alignment of DR's start address, SEG_LEN and + ACCESS_SIZE. */ + unsigned int align; }; /* This struct contains two dr_with_seg_len objects with aliasing data @@ -475,6 +484,10 @@ extern void prune_runtime_alias_test_list (vec<dr_with_seg_len_pair_t> *, poly_uint64); extern void create_runtime_alias_checks (struct loop *, vec<dr_with_seg_len_pair_t> *, tree*); +extern tree dr_direction_indicator (struct data_reference *); +extern tree dr_zero_step_indicator (struct data_reference *); +extern bool dr_known_forward_stride_p (struct data_reference *); + /* Return true when the base objects of data references A and B are the same memory object. 
*/ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index eca8dd2..a3d76e4 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -2330,16 +2330,12 @@ break_alias_scc_partitions (struct graph *rdg, static tree data_ref_segment_size (struct data_reference *dr, tree niters) { - tree segment_length; - - if (integer_zerop (DR_STEP (dr))) - segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); - else - segment_length = size_binop (MULT_EXPR, - fold_convert (sizetype, DR_STEP (dr)), - fold_convert (sizetype, niters)); - - return segment_length; + niters = size_binop (MINUS_EXPR, + fold_convert (sizetype, niters), + size_one_node); + return size_binop (MULT_EXPR, + fold_convert (sizetype, DR_STEP (dr)), + fold_convert (sizetype, niters)); } /* Return true if LOOP's latch is dominated by statement for data reference @@ -2394,9 +2390,16 @@ compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs, else seg_length_b = data_ref_segment_size (dr_b, niters); + unsigned HOST_WIDE_INT access_size_a + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); + unsigned HOST_WIDE_INT access_size_b + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); + unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); + unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); + dr_with_seg_len_pair_t dr_with_seg_len_pair - (dr_with_seg_len (dr_a, seg_length_a), - dr_with_seg_len (dr_b, seg_length_b)); + (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), + dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); /* Canonicalize pairs by sorting the two DR members. 
*/ if (comp_res > 0) diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index c6bfe45..684b7c5 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -169,6 +169,50 @@ vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) return true; } +/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ + +static void +vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) +{ + vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); + for (unsigned int i = 0; i < checks.length(); ++i) + if (checks[i] == value) + return; + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, value); + dump_printf (MSG_NOTE, " is nonzero\n"); + } + LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); +} + +/* Return true if we know that the order of vectorized STMT_A and + vectorized STMT_B will be the same as the order of STMT_A and STMT_B. + At least one of the statements is a write. */ + +static bool +vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b) +{ + stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a); + stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b); + + /* Single statements are always kept in their original order. */ + if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) + && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) + return true; + + /* STMT_A and STMT_B belong to overlapping groups. All loads in a + group are emitted at the position of the first scalar load and all + stores in a group are emitted at the position of the last scalar store. + Thus writes will happen no earlier than their current position + (but could happen later) while reads will happen no later than their + current position (but could happen earlier). Reordering is therefore + only possible if the first access is a write. 
*/ + gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b); + return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))); +} /* A subroutine of vect_analyze_data_ref_dependence. Handle DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence @@ -414,22 +458,27 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, ... = a[i]; a[i+1] = ...; where loads from the group interleave with the store. */ - if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) - || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) + if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "READ_WRITE dependence in interleaving.\n"); + return true; + } + + if (!loop->force_vectorize) { - gimple *earlier_stmt; - earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); - if (DR_IS_WRITE - (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)))) + tree indicator = dr_zero_step_indicator (dra); + if (TREE_CODE (indicator) != INTEGER_CST) + vect_check_nonzero_value (loop_vinfo, indicator); + else if (integer_zerop (indicator)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "READ_WRITE dependence in interleaving." - "\n"); + "access also has a zero step\n"); return true; } } - continue; } @@ -3030,38 +3079,57 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) /* Function vect_vfa_segment_size. - Create an expression that computes the size of segment - that will be accessed for a data reference. The functions takes into - account that realignment loads may access one more vector. - Input: DR: The data reference. LENGTH_FACTOR: segment length to consider. - Return an expression whose value is the size of segment which will be - accessed by DR. */ + Return a value suitable for the dr_with_seg_len::seg_len field. + This is the "distance travelled" by the pointer from the first + iteration in the segment to the last. 
Note that it does not include + the size of the access; in effect it only describes the first byte. */ static tree vect_vfa_segment_size (struct data_reference *dr, tree length_factor) { - tree segment_length; + length_factor = size_binop (MINUS_EXPR, + fold_convert (sizetype, length_factor), + size_one_node); + return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)), + length_factor); +} - if (integer_zerop (DR_STEP (dr))) - segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); - else - segment_length = size_binop (MULT_EXPR, - fold_convert (sizetype, DR_STEP (dr)), - fold_convert (sizetype, length_factor)); +/* Return a value that, when added to abs (vect_vfa_segment_size (dr)), + gives the worst-case number of bytes covered by the segment. */ - if (vect_supportable_dr_alignment (dr, false) - == dr_explicit_realign_optimized) +static unsigned HOST_WIDE_INT +vect_vfa_access_size (data_reference *dr) +{ + stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr)); + tree ref_type = TREE_TYPE (DR_REF (dr)); + unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); + unsigned HOST_WIDE_INT access_size = ref_size; + if (GROUP_FIRST_ELEMENT (stmt_vinfo)) { - tree vector_size = TYPE_SIZE_UNIT - (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); - - segment_length = size_binop (PLUS_EXPR, segment_length, vector_size); + gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr)); + access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo); + } + if (STMT_VINFO_VEC_STMT (stmt_vinfo) + && (vect_supportable_dr_alignment (dr, false) + == dr_explicit_realign_optimized)) + { + /* We might access a full vector's worth. */ + tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); + access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; } - return segment_length; + return access_size; +} + +/* Get the minimum alignment for all the scalar accesses that DR describes. 
*/ + +static unsigned int +vect_vfa_align (const data_reference *dr) +{ + return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr))); } /* Function vect_no_alias_p. @@ -3069,13 +3137,15 @@ vect_vfa_segment_size (struct data_reference *dr, tree length_factor) Given data references A and B with equal base and offset, see whether the alias relation can be decided at compilation time. Return 1 if it can and the references alias, 0 if it can and the references do - not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A - and SEGMENT_LENGTH_B are the memory lengths accessed by A and B - respectively. */ + not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, + SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent + of dr_with_seg_len::{seg_len,access_size} for A and B. */ static int vect_compile_time_alias (struct data_reference *a, struct data_reference *b, - tree segment_length_a, tree segment_length_b) + tree segment_length_a, tree segment_length_b, + unsigned HOST_WIDE_INT access_size_a, + unsigned HOST_WIDE_INT access_size_b) { poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a)); poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b)); @@ -3088,18 +3158,21 @@ vect_compile_time_alias (struct data_reference *a, struct data_reference *b, if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) { const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); - offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a; + offset_a = (offset_a + access_size_a) - const_length_a; } else const_length_a = tree_to_poly_uint64 (segment_length_a); if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) { const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); - offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b; + offset_b = (offset_b + access_size_b) - const_length_b; } else const_length_b = tree_to_poly_uint64 (segment_length_b); + const_length_a += access_size_a; + 
const_length_b += access_size_b; + if (ranges_known_overlap_p (offset_a, const_length_a, offset_b, const_length_b)) return 1; @@ -3149,6 +3222,108 @@ dependence_distance_ge_vf (data_dependence_relation *ddr, return true; } +/* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ + +static void +dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound) +{ + dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs"); + dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr); + dump_printf (dump_kind, ") >= "); + dump_dec (dump_kind, lower_bound.min_value); +} + +/* Record that the vectorized loop requires the vec_lower_bound described + by EXPR, UNSIGNED_P and MIN_VALUE. */ + +static void +vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, + poly_uint64 min_value) +{ + vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); + for (unsigned int i = 0; i < lower_bounds.length (); ++i) + if (operand_equal_p (lower_bounds[i].expr, expr, 0)) + { + unsigned_p &= lower_bounds[i].unsigned_p; + min_value = upper_bound (lower_bounds[i].min_value, min_value); + if (lower_bounds[i].unsigned_p != unsigned_p + || maybe_lt (lower_bounds[i].min_value, min_value)) + { + lower_bounds[i].unsigned_p = unsigned_p; + lower_bounds[i].min_value = min_value; + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "updating run-time check to "); + dump_lower_bound (MSG_NOTE, lower_bounds[i]); + dump_printf (MSG_NOTE, "\n"); + } + } + return; + } + + vec_lower_bound lower_bound (expr, unsigned_p, min_value); + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); + dump_lower_bound (MSG_NOTE, lower_bound); + dump_printf (MSG_NOTE, "\n"); + } + LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); +} + +/* Return true if it's unlikely that the step of the vectorized form of DR + will span fewer than GAP bytes. 
*/ + +static bool +vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); + HOST_WIDE_INT count + = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + if (GROUP_FIRST_ELEMENT (stmt_info)) + count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); + return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr); +} + +/* Return true if we know that there is no alias between DR_A and DR_B + when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set + *LOWER_BOUND_OUT to this N. */ + +static bool +vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b, + poly_uint64 *lower_bound_out) +{ + /* Check that there is a constant gap of known sign between DR_A + and DR_B. */ + poly_int64 init_a, init_b; + if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) + || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) + || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) + || !poly_int_tree_p (DR_INIT (dr_a), &init_a) + || !poly_int_tree_p (DR_INIT (dr_b), &init_b) + || !ordered_p (init_a, init_b)) + return false; + + /* Sort DR_A and DR_B by the address they access. */ + if (maybe_lt (init_b, init_a)) + { + std::swap (init_a, init_b); + std::swap (dr_a, dr_b); + } + + /* If the two accesses could be dependent within a scalar iteration, + make sure that we'd retain their order. */ + if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b) + && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b))) + return false; + + /* There is no alias if abs (DR_STEP) is greater than or equal to + the bytes spanned by the combination of the two accesses. */ + *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a; + return true; +} + /* Function vect_prune_runtime_alias_test_list. Prune a list of ddrs to be tested at run-time by versioning for alias. 
@@ -3178,6 +3353,19 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dump_printf_loc (MSG_NOTE, vect_location, "=== vect_prune_runtime_alias_test_list ===\n"); + /* Step values are irrelevant for aliasing if the number of vector + iterations is equal to the number of scalar iterations (which can + happen for fully-SLP loops). */ + bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); + + if (!ignore_step_p) + { + /* Convert the checks for nonzero steps into bound tests. */ + tree value; + FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) + vect_check_lower_bound (loop_vinfo, value, true, 1); + } + if (may_alias_ddrs.is_empty ()) return true; @@ -3191,9 +3379,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) { int comp_res; + poly_uint64 lower_bound; struct data_reference *dr_a, *dr_b; gimple *dr_group_first_a, *dr_group_first_b; tree segment_length_a, segment_length_b; + unsigned HOST_WIDE_INT access_size_a, access_size_b; + unsigned int align_a, align_b; gimple *stmt_a, *stmt_b; /* Ignore the alias if the VF we chose ended up being no greater @@ -3221,6 +3412,64 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dr_a = DDR_A (ddr); stmt_a = DR_STMT (DDR_A (ddr)); + + dr_b = DDR_B (ddr); + stmt_b = DR_STMT (DDR_B (ddr)); + + /* Skip the pair if inter-iteration dependencies are irrelevant + and intra-iteration dependencies are guaranteed to be honored. 
*/ + if (ignore_step_p + && (vect_preserves_scalar_order_p (stmt_a, stmt_b) + || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound))) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "no need for alias check between "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); + dump_printf (MSG_NOTE, " and "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); + dump_printf (MSG_NOTE, " when VF is 1\n"); + } + continue; + } + + /* See whether we can handle the alias using a bounds check on + the step, and whether that's likely to be the best approach. + (It might not be, for example, if the minimum step is much larger + than the number of bytes handled by one vector iteration.) */ + if (!ignore_step_p + && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST + && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound) + && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound) + || vect_small_gap_p (loop_vinfo, dr_b, lower_bound))) + { + bool unsigned_p = dr_known_forward_stride_p (dr_a); + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "no alias between "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); + dump_printf (MSG_NOTE, " and "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); + dump_printf (MSG_NOTE, " when the step "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a)); + dump_printf (MSG_NOTE, " is outside "); + if (unsigned_p) + dump_printf (MSG_NOTE, "[0"); + else + { + dump_printf (MSG_NOTE, "("); + dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); + } + dump_printf (MSG_NOTE, ", "); + dump_dec (MSG_NOTE, lower_bound); + dump_printf (MSG_NOTE, ")\n"); + } + vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p, + lower_bound); + continue; + } + dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); if (dr_group_first_a) { @@ -3228,8 +3477,6 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt 
(stmt_a)); } - dr_b = DDR_B (ddr); - stmt_b = DR_STMT (DDR_B (ddr)); dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); if (dr_group_first_b) { @@ -3237,12 +3484,24 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); } - if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) - length_factor = scalar_loop_iters; + if (ignore_step_p) + { + segment_length_a = size_zero_node; + segment_length_b = size_zero_node; + } else - length_factor = size_int (vect_factor); - segment_length_a = vect_vfa_segment_size (dr_a, length_factor); - segment_length_b = vect_vfa_segment_size (dr_b, length_factor); + { + if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) + length_factor = scalar_loop_iters; + else + length_factor = size_int (vect_factor); + segment_length_a = vect_vfa_segment_size (dr_a, length_factor); + segment_length_b = vect_vfa_segment_size (dr_b, length_factor); + } + access_size_a = vect_vfa_access_size (dr_a); + access_size_b = vect_vfa_access_size (dr_b); + align_a = vect_vfa_align (dr_a); + align_b = vect_vfa_align (dr_b); comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)); @@ -3259,7 +3518,22 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) { int res = vect_compile_time_alias (dr_a, dr_b, segment_length_a, - segment_length_b); + segment_length_b, + access_size_a, + access_size_b); + if (res >= 0 && dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "can tell at compile time that "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); + dump_printf (MSG_NOTE, " and "); + dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); + if (res == 0) + dump_printf (MSG_NOTE, " do not alias\n"); + else + dump_printf (MSG_NOTE, " alias\n"); + } + if (res == 0) continue; @@ -3273,8 +3547,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) } dr_with_seg_len_pair_t dr_with_seg_len_pair - (dr_with_seg_len 
(dr_a, segment_length_a), - dr_with_seg_len (dr_b, segment_length_b)); + (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a), + dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b)); /* Canonicalize pairs by sorting the two DR members. */ if (comp_res > 0) @@ -3287,6 +3561,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) unsigned int count = (comp_alias_ddrs.length () + check_unequal_addrs.length ()); + dump_printf_loc (MSG_NOTE, vect_location, "improved number of alias checks from %d to %d\n", may_alias_ddrs.length (), count); diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index a2b4989..53684e5 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -2875,6 +2875,31 @@ vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr) } } +/* Create an expression that is true when all lower-bound conditions for + the vectorized loop are met. Chain this condition with *COND_EXPR. */ + +static void +vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr) +{ + vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); + for (unsigned int i = 0; i < lower_bounds.length (); ++i) + { + tree expr = lower_bounds[i].expr; + tree type = unsigned_type_for (TREE_TYPE (expr)); + expr = fold_convert (type, expr); + poly_uint64 bound = lower_bounds[i].min_value; + if (!lower_bounds[i].unsigned_p) + { + expr = fold_build2 (PLUS_EXPR, type, expr, + build_int_cstu (type, bound - 1)); + bound += bound - 1; + } + tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr, + build_int_cstu (type, bound)); + chain_cond_expr (cond_expr, part_cond_expr); + } +} + /* Function vect_create_cond_for_alias_checks. 
Create a conditional expression that represents the run-time checks for @@ -2986,6 +3011,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo, if (version_alias) { vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); + vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); } diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 7fc2215..64b9ce3 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2475,6 +2475,7 @@ again: } } /* Free optimized alias test DDRS. */ + LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); /* Reset target cost data. */ @@ -3673,6 +3674,18 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, /* Count LEN - 1 ANDs and LEN comparisons. */ (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, NULL, 0, vect_prologue); + len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); + if (len) + { + /* Count LEN - 1 ANDs and LEN comparisons. */ + unsigned int nstmts = len * 2 - 1; + /* +1 for each bias that needs adding. */ + for (unsigned int i = 0; i < len; ++i) + if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) + nstmts += 1; + (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, + NULL, 0, vect_prologue); + } dump_printf (MSG_NOTE, "cost model: Adding cost of checks for loop " "versioning aliasing.\n"); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index d3dda52..56e875f 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -174,6 +174,18 @@ typedef struct _slp_instance { loop to be valid. */ typedef std::pair<tree, tree> vec_object_pair; +/* Records that vectorization is only possible if abs (EXPR) >= MIN_VALUE. + UNSIGNED_P is true if we can assume that abs (EXPR) == EXPR. 
*/ +struct vec_lower_bound { + vec_lower_bound () {} + vec_lower_bound (tree e, bool u, poly_uint64 m) + : expr (e), unsigned_p (u), min_value (m) {} + + tree expr; + bool unsigned_p; + poly_uint64 min_value; +}; + /* Vectorizer state common between loop and basic-block vectorization. */ struct vec_info { enum vec_kind { bb, loop }; @@ -406,6 +418,14 @@ typedef struct _loop_vec_info : public vec_info { /* Check that the addresses of each pair of objects is unequal. */ auto_vec<vec_object_pair> check_unequal_addrs; + /* List of values that are required to be nonzero. This is used to check + whether things like "x[i * n] += 1;" are safe and eventually gets added + to the checks for lower bounds below. */ + auto_vec<tree> check_nonzero; + + /* List of values that need to be checked for a minimum value. */ + auto_vec<vec_lower_bound> lower_bounds; + /* Statements in the loop that have data references that are candidates for a runtime (loop versioning) misalignment check. */ auto_vec<gimple *> may_misalign_stmts; @@ -514,6 +534,8 @@ typedef struct _loop_vec_info : public vec_info { #define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs #define LOOP_VINFO_COMP_ALIAS_DDRS(L) (L)->comp_alias_ddrs #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L) (L)->check_unequal_addrs +#define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero +#define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor @@ -534,7 +556,8 @@ typedef struct _loop_vec_info : public vec_info { ((L)->may_misalign_stmts.length () > 0) #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L) \ ((L)->comp_alias_ddrs.length () > 0 \ - || (L)->check_unequal_addrs.length () > 0) + || (L)->check_unequal_addrs.length () > 0 \ + || (L)->lower_bounds.length () > 0) #define LOOP_REQUIRES_VERSIONING_FOR_NITERS(L) \ (LOOP_VINFO_NITERS_ASSUMPTIONS (L)) #define 
LOOP_REQUIRES_VERSIONING(L) \ |