Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c     |  20
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c |  27
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c     |  21
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c |  29
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c     |  24
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c |  15
-rw-r--r--  gcc/tree-vect-data-refs.cc                             |  61
-rw-r--r--  gcc/tree-vect-loop-manip.cc                            |  71
-rw-r--r--  gcc/tree-vect-loop.cc                                  |   8
-rw-r--r--  gcc/tree-vect-stmts.cc                                 | 103
-rw-r--r--  gcc/tree-vectorizer.h                                  |   9
11 files changed, 291 insertions(+), 97 deletions(-)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
new file mode 100644
index 0000000..feb7ee7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
@@ -0,0 +1,20 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 3
+#define END 510
+
+int __attribute__((noipa))
+foo (int *a) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != 0)
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
new file mode 100644
index 0000000..b4c267f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
@@ -0,0 +1,27 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_11.c"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  for (int k = 5; k < 30; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one non-zero element for the test.  */
+    for (int i = 5; i < 30; i++)
+      a[i] = (i == k ? 1 : 0);
+
+    int res = foo (a);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
new file mode 100644
index 0000000..260482a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
@@ -0,0 +1,21 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 5
+#define END 509
+
+int __attribute__((noipa))
+foo (int *restrict a, int * restrict b) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != b[i])
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Both peeling and versioning will be applied" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
+/* { dg-final { scan-assembler {\teor\t.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
new file mode 100644
index 0000000..ba978fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
@@ -0,0 +1,29 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_12.c"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void) {
+  for (int k = 5; k < 50; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+    int *b = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one position where the values differ for the test.  */
+    for (int i = 5; i < 50; i++) {
+      a[i] = (i == k ? 1 : 0);
+      b[i] = 0;
+    }
+
+    int res = foo (a, b);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
new file mode 100644
index 0000000..730e33e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
@@ -0,0 +1,24 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 5
+#define END 509
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (signed int i = START; i < END; ++i)
+    {
+      if (x[i] == 0)
+        return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c
new file mode 100644
index 0000000..83352a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c
@@ -0,0 +1,15 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_13.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index a9d4aae..a3d3b3e 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1448,17 +1448,20 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   if (loop_vinfo
       && dr_safe_speculative_read_required (stmt_info))
     {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      auto vectype_size
+      /* The required target alignment must be a power-of-2 value and is
+         computed as the product of vector element size, VF and group size.
+         We compute the constant part first as VF may be variable.  For
+         variable VF, the power-of-2 check of VF is deferred to runtime.  */
+      auto align_factor_c
         = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-      poly_uint64 new_alignment = vf * vectype_size;
-      /* If we have a grouped access we require that the alignment be N * elem.  */
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-        new_alignment *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+        align_factor_c *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      poly_uint64 new_alignment = vf * align_factor_c;
 
-      unsigned HOST_WIDE_INT target_alignment;
-      if (new_alignment.is_constant (&target_alignment)
-          && pow2p_hwi (target_alignment))
+      if ((vf.is_constant () && pow2p_hwi (new_alignment.to_constant ()))
+          || (!vf.is_constant () && pow2p_hwi (align_factor_c)))
         {
@@ -1467,7 +1470,7 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
               dump_dec (MSG_NOTE, new_alignment);
               dump_printf (MSG_NOTE, " bytes.\n");
             }
-          vector_alignment = target_alignment;
+          vector_alignment = new_alignment;
         }
     }
 
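The hunk above splits the required alignment into a compile-time constant factor (element size times group size) and the VF, so that with a variable VF only the constant factor is checked for being a power of two at compile time. A minimal standalone sketch of that decision, in plain C++ with illustrative names (not GCC code):

#include <cstdint>

/* True iff x is a non-zero power of two.  */
static bool pow2p (uint64_t x) { return x != 0 && (x & (x - 1)) == 0; }

/* With a constant VF the whole product elt_size * group_size * vf must be
   a power of two; with a variable VF only the constant factor is checked
   here, and the power-of-2-ness of the full product is left to a runtime
   versioning check.  */
static bool
alignment_expressible_p (uint64_t elt_size, uint64_t group_size,
                         bool vf_is_constant, uint64_t vf)
{
  uint64_t factor = elt_size * group_size;  /* constant part */
  return vf_is_constant ? pow2p (factor * vf) : pow2p (factor);
}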
@@ -2438,6 +2441,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      - The cost of peeling (the extra runtime checks, the increase in code
        size).  */
 
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
@@ -2446,9 +2450,18 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       stmt_vec_info stmt_info = dr_info->stmt;
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-      do_peeling
-        = vector_alignment_reachable_p (dr_info,
-                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+
+      /* With variable VF, unsafe speculative reads can be avoided for known
+         inbounds DRs as long as partial vectors are used.  */
+      if (!vf.is_constant ()
+          && dr_safe_speculative_read_required (stmt_info)
+          && DR_SCALAR_KNOWN_BOUNDS (dr_info))
+        {
+          dr_set_safe_speculative_read_required (stmt_info, false);
+          LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+        }
+
+      do_peeling = vector_alignment_reachable_p (dr_info, vf);
       if (do_peeling)
         {
           if (known_alignment_for_access_p (dr_info, vectype))
@@ -2488,7 +2501,6 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
               poly_uint64 nscalars = npeel_tmp;
               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                 {
-                  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
                   unsigned group_size = 1;
                   if (STMT_SLP_TYPE (stmt_info)
                       && STMT_VINFO_GROUPED_ACCESS (stmt_info))
@@ -2911,14 +2923,12 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      2) there is at least one unsupported misaligned data ref with an
         unknown misalignment, and
      3) all misaligned data refs with a known misalignment are supported, and
-     4) the number of runtime alignment checks is within reason.
-     5) the vectorization factor is a constant.  */
+     4) the number of runtime alignment checks is within reason.  */
 
   do_versioning
     = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
-       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP)
-      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ();
+       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
 
   if (do_versioning)
     {
@@ -2965,25 +2975,22 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
             ?? We could actually unroll the loop to achieve the required
             overall step alignment, and forcing the alignment could be
             done by doing some iterations of the non-vectorized loop.  */
-          if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-                           * DR_STEP_ALIGNMENT (dr),
+          if (!multiple_p (vf * DR_STEP_ALIGNMENT (dr),
                            DR_TARGET_ALIGNMENT (dr_info)))
            {
              do_versioning = false;
              break;
            }
 
-          /* The rightmost bits of an aligned address must be zeros.
-             Construct the mask needed for this test.  For example,
-             GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
-             mask must be 15 = 0xf.  */
-          gcc_assert (DR_TARGET_ALIGNMENT (dr_info).is_constant ());
-          int mask = DR_TARGET_ALIGNMENT (dr_info).to_constant () - 1;
+          /* Use "mask = DR_TARGET_ALIGNMENT - 1" to test the rightmost
+             address bits for the runtime alignment check.  For example, for
+             a 16-byte target alignment the mask is 15 = 0xf.  */
+          poly_uint64 mask = DR_TARGET_ALIGNMENT (dr_info) - 1;
 
          /* FORNOW: use the same mask to test all potentially unaligned
             references in the loop.  */
-          if (LOOP_VINFO_PTR_MASK (loop_vinfo)
-              && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
+          if (maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), 0U)
+              && maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), mask))
            {
              do_versioning = false;
              break;
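With DR_TARGET_ALIGNMENT now allowed to be non-constant, the versioning mask becomes poly_uint64, but the address test it feeds is unchanged in spirit: the low bits selected by "alignment - 1" must be zero. A sketch of that check with illustrative names (not GCC code), assuming the alignment value gets materialized at runtime in the VLA case:

#include <cstdint>

/* An address is aligned to target_alignment (a power of two) iff its low
   bits, selected by mask = target_alignment - 1, are all zero.  For a
   16-byte alignment the mask is 0xf; for a VL-scaled alignment the mask
   value is only known once the vector length is known.  */
static bool
aligned_p (const void *addr, uintptr_t target_alignment)
{
  uintptr_t mask = target_alignment - 1;
  return (reinterpret_cast<uintptr_t> (addr) & mask) == 0;
}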
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 6c1b26a..566308f 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2454,10 +2454,7 @@ get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
   else
     {
       tree vla = build_int_cst (type, target_align);
-      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
-                                    fold_build2 (MINUS_EXPR, type,
-                                                 build_int_cst (type, 0), vla));
-      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
+      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
                                           build_int_cst (type, 1));
     }
 
@@ -3840,7 +3837,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   const vec<stmt_vec_info> &may_misalign_stmts
     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
   stmt_vec_info stmt_info;
-  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
+  poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
   tree mask_cst;
   unsigned int i;
   tree int_ptrsize_type;
@@ -3852,9 +3849,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   tree ptrsize_zero;
   tree part_cond_expr;
 
-  /* Check that mask is one less than a power of 2, i.e., mask is
-     all zeros followed by all ones.  */
-  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
+  gcc_assert (known_ne (mask, 0U));
 
   int_ptrsize_type = signed_type_for (ptr_type_node);
@@ -3962,6 +3957,62 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
       chain_cond_expr (cond_expr, part_cond_expr);
     }
 }
 
+/* Function vect_create_cond_for_vla_spec_read.
+
+   Create a conditional expression that represents the run-time checks on
+   the maximum speculative read amount in VLA modes.  We check two things:
+     1) that the max speculative read amount does not exceed the min page size
+     2) that the VF is a power of 2 - done by checking the max read amount instead
+
+   Input:
+   COND_EXPR  - input conditional expression.  New conditions will be chained
+                with logical AND operation.
+   LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
+                possible speculative read amount in VLA modes.
+
+   Output:
+   COND_EXPR - conditional expression.
+
+   The returned COND_EXPR is the conditional expression to be used in the
+   if statement that controls which version of the loop gets executed at
+   runtime.  */
+
+static void
+vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+  poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
+  tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
+
+  /* Both the read amount and the VF must be variable, and the read amount
+     must be a constant power-of-2 multiple of the VF.  */
+  unsigned HOST_WIDE_INT multiple;
+  gcc_assert (!read_amount_poly.is_constant ()
+              && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+              && constant_multiple_p (read_amount_poly,
+                                      LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                      &multiple)
+              && pow2p_hwi (multiple));
+
+  tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
+  tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
+  tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
+                                         (unsigned long) param_min_pagesize);
+
+  /* Create an expression of "amount & (amount - 1) == 0".  */
+  tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
+                                amount, cst_ul_one);
+  tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
+                                      amount, amount_m1);
+  tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+                                       amount_and_expr, cst_ul_zero);
+  chain_cond_expr (cond_expr, powof2_cond_expr);
+
+  /* Create an expression of "amount <= cst_ul_pagesize".  */
+  tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
+                                         amount, cst_ul_pagesize);
+  chain_cond_expr (cond_expr, pagesize_cond_expr);
+}
+
 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
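At runtime the condition built by vect_create_cond_for_vla_spec_read is equivalent to the scalar test below, evaluated once when choosing the loop version; here "amount" stands in for LOOP_VINFO_MAX_SPEC_READ_AMOUNT and "min_pagesize" for param_min_pagesize (a sketch, not GCC code):

#include <cstdint>

/* Take the vector loop only if the maximum speculative read amount is a
   power of two (which, given the assertion above, also proves the VF is)
   and a whole vector iteration cannot read past one minimum-size page.  */
static bool
use_vector_loop_p (uint64_t amount, uint64_t min_pagesize)
{
  bool pow2 = (amount & (amount - 1)) == 0;  /* amount & (amount - 1) == 0 */
  return pow2 && amount <= min_pagesize;     /* amount <= cst_ul_pagesize */
}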
@@ -4087,6 +4138,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   gimple_seq gimplify_stmt_list = NULL;
   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
+  bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
   poly_uint64 versioning_threshold
@@ -4145,6 +4197,9 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                        &cond_expr_stmt_list);
 
+  if (version_spec_read)
+    vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
+
   if (version_alias)
     {
       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 85f3e90..55a8495 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1009,6 +1009,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
+    max_spec_read_amount (0),
     nonlinear_iv (false),
     ivexpr_map (NULL),
     scan_map (NULL),
@@ -10141,7 +10142,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       if (peel_mul)
        {
          if (!step_mul)
-           step_mul = peel_mul;
+           {
+             gcc_assert (!nunits.is_constant ());
+             step_mul = gimple_build (&init_stmts,
+                                      MINUS_EXPR, step_vectype,
+                                      build_zero_cst (step_vectype), peel_mul);
+           }
          else
            step_mul = gimple_build (&init_stmts,
                                     MINUS_EXPR, step_vectype,
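One plausible reading of the vectorizable_induction change: when only a peeling adjustment exists, the multiplier must enter negated, exactly as in the else branch, so each lane of the first vector IV starts at base + (lane - peel) * step with the first "peel" lanes masked off. A hypothetical worked example (not GCC code; the lane layout is an assumption for illustration):

#include <cstdio>

int main ()
{
  /* Loop starts at i = 3 with step 1; 3 iterations are "peeled" via
     masking, so the vector body begins at the aligned position i = 0
     and the leading lanes are inactive.  */
  const long base = 3, step = 1, peel = 3, lanes = 8;
  for (long i = 0; i < lanes; ++i)
    printf ("lane %ld: iv = %ld%s\n", i, base + (i - peel) * step,
            i < peel ? "  (masked off)" : "");
  return 0;
}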
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a6f4db4..dbeb8bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2400,70 +2400,26 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       /* We can only peel for loops, of course.  */
       gcc_checking_assert (loop_vinfo);
 
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      poly_uint64 read_amount
+        = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+        read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
       auto target_alignment
        = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
-      unsigned HOST_WIDE_INT target_align;
-
-      bool group_aligned = false;
-      if (target_alignment.is_constant (&target_align)
-          && nunits.is_constant ())
-        {
-          poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-          auto vectype_size
-            = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-          poly_uint64 required_alignment = vf * vectype_size;
-          /* If we have a grouped access we require that the alignment be N * elem.  */
-          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-            required_alignment *=
-              DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
-          if (!multiple_p (target_alignment, required_alignment))
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "desired alignment %wu not met. Instead got %wu "
-                                 "for DR alignment at %G",
-                                 required_alignment.to_constant (),
-                                 target_align, STMT_VINFO_STMT (stmt_info));
-              return false;
-            }
-
-          if (!pow2p_hwi (target_align))
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "non-power-of-two vector alignment %wd "
-                                 "for DR alignment at %G",
-                                 target_align, STMT_VINFO_STMT (stmt_info));
-              return false;
-            }
-
-          /* For VLA we have to insert a runtime check that the vector loads
-             per iterations don't exceed a page size.  For now we can use
-             POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
-          if (known_gt (required_alignment, (unsigned)param_min_pagesize))
+      if (!multiple_p (target_alignment, read_amount))
+        {
+          if (dump_enabled_p ())
            {
-              if (dump_enabled_p ())
-                {
-                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                   "alignment required for correctness (");
-                  dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
-                  dump_printf (MSG_NOTE, ") may exceed page size\n");
-                }
-              return false;
+              dump_printf_loc (MSG_NOTE, vect_location,
+                               "desired alignment not met, target was ");
+              dump_dec (MSG_NOTE, target_alignment);
+              dump_printf (MSG_NOTE, " previously, but read amount is ");
+              dump_dec (MSG_NOTE, read_amount);
+              dump_printf (MSG_NOTE, " at %G.\n", STMT_VINFO_STMT (stmt_info));
            }
-
-          group_aligned = true;
-        }
-
-      /* There are multiple loads that have a misalignment that we couldn't
-         align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
-         vectorize. */
-      if (!group_aligned)
-        {
-          if (inbounds)
-            LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
-          else
-            return false;
+          return false;
        }
 
       /* When using a group access the first element may be aligned but the
@@ -2485,6 +2441,33 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                               STMT_VINFO_STMT (stmt_info));
              return false;
            }
+
+          /* Reject vectorization if we know the read amount per vector
+             iteration exceeds the min page size.  */
+          if (known_gt (read_amount, (unsigned) param_min_pagesize))
+            {
+              if (dump_enabled_p ())
+                {
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "alignment required for correctness (");
+                  dump_dec (MSG_MISSED_OPTIMIZATION, read_amount);
+                  dump_printf (MSG_NOTE, ") may exceed page size.\n");
+                }
+              return false;
+            }
+
+          if (!vf.is_constant ())
+            {
+              /* For VLA modes, we need a runtime check to ensure any speculative
+                 read amount does not exceed the page size.  Here we record the
+                 max possible read amount for the check.  */
+              if (maybe_gt (read_amount,
+                            LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo)))
+                LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo) = read_amount;
+
+              /* For VLA modes, we must use partial vectors.  */
+              LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+            }
        }
 
   if (*alignment_support_scheme == dr_unaligned_unsupported)
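Condensed, the reworked get_load_store_type gate for speculative loads reduces to the rules below, where read_amount = VF * element size * group size. A sketch with illustrative names (not GCC code), treating read_amount as its eventual runtime value:

#include <cstdint>

struct decision { bool vectorize; bool needs_runtime_check; };

/* - the target alignment must be a multiple of the bytes read per vector
     iteration ("desired alignment not met" otherwise);
   - a read amount known to exceed the minimum page size is rejected;
   - with a variable VF, a runtime versioning check is recorded and the
     loop is forced to use partial vectors.  */
static decision
classify (uint64_t read_amount, uint64_t target_alignment,
          bool vf_is_constant, uint64_t min_pagesize)
{
  if (target_alignment % read_amount != 0)
    return {false, false};
  if (read_amount > min_pagesize)
    return {false, false};
  return {true, !vf_is_constant};
}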
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9653496..041cff8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -919,7 +919,10 @@ public:
   int peeling_for_alignment;
 
   /* The mask used to check the alignment of pointers or arrays.  */
-  int ptr_mask;
+  poly_uint64 ptr_mask;
+
+  /* The maximum speculative read amount in VLA modes for the runtime check.  */
+  poly_uint64 max_spec_read_amount;
 
   /* Indicates whether the loop has any non-linear IV.  */
   bool nonlinear_iv;
@@ -1155,6 +1158,7 @@ public:
 #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
 #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
+#define LOOP_VINFO_MAX_SPEC_READ_AMOUNT(L) (L)->max_spec_read_amount
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
 #define LOOP_VINFO_DDRS(L)                 (L)->shared->ddrs
@@ -1209,6 +1213,8 @@ public:
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)	\
   ((L)->may_misalign_stmts.length () > 0)
+#define LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ(L)	\
+  (maybe_gt ((L)->max_spec_read_amount, 0U))
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L)		\
   ((L)->comp_alias_ddrs.length () > 0			\
    || (L)->check_unequal_addrs.length () > 0		\
@@ -1219,6 +1225,7 @@ public:
   (LOOP_VINFO_SIMD_IF_COND (L))
 #define LOOP_REQUIRES_VERSIONING(L)			\
   (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (L)		\
+   || LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (L)	\
    || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (L)		\
    || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)		\
    || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L))