diff options
author | Richard Sandiford <richard.sandiford@linaro.org> | 2018-01-13 18:00:41 +0000 |
---|---|---|
committer | Richard Sandiford <rsandifo@gcc.gnu.org> | 2018-01-13 18:00:41 +0000 |
commit | d1d20a49a788bdb82f09ada6377d932ceac07934 (patch) | |
tree | da9369ee0298c56a7f3c618a641a99026cd3d33c /gcc/tree-vect-loop-manip.c | |
parent | 4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f (diff) | |
download | gcc-d1d20a49a788bdb82f09ada6377d932ceac07934.zip gcc-d1d20a49a788bdb82f09ada6377d932ceac07934.tar.gz gcc-d1d20a49a788bdb82f09ada6377d932ceac07934.tar.bz2 |
Use single-iteration epilogues when peeling for gaps
This patch adds support for fully-masking loops that require peeling
for gaps. It peels exactly one scalar iteration and uses the masked
loop to handle the rest. Previously we would fall back on using a
standard unmasked loop instead.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
gcc/
* tree-vect-loop-manip.c (vect_gen_scalar_loop_niters): Replace
vfm1 with a bound_epilog parameter.
(vect_do_peeling): Update calls accordingly, and move the prologue
call earlier in the function. Treat the base bound_epilog as 0 for
fully-masked loops and retain vf - 1 for other loops. Add 1 to
this base when peeling for gaps.
* tree-vect-loop.c (vect_analyze_loop_2): Allow peeling for gaps
with fully-masked loops.
(vect_estimate_min_profitable_iters): Handle the single peeled
iteration in that case.
gcc/testsuite/
* gcc.target/aarch64/sve/struct_vect_18.c: Check the number
of branches.
* gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_20.c: New test.
* gcc.target/aarch64/sve/struct_vect_20_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_21.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_21_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_22.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_22_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_23.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_23_run.c: Likewise.
Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256635
Diffstat (limited to 'gcc/tree-vect-loop-manip.c')
-rw-r--r-- | gcc/tree-vect-loop-manip.c | 92 |
1 file changed, 52 insertions, 40 deletions
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index b9bb047..a2b4989 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -618,8 +618,9 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls. LOOP_VINFO describes the vectorization of LOOP. NITERS is the - number of iterations of the original scalar loop. NITERS_MAYBE_ZERO - and FINAL_IV are as for vect_set_loop_condition. + number of iterations of the original scalar loop that should be + handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are + as for vect_set_loop_condition. Insert the branch-back condition before LOOP_COND_GSI and return the final gcond. */ @@ -1836,23 +1837,24 @@ vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p) /* Calculate the number of iterations above which vectorized loop will be preferred than scalar loop. NITERS_PROLOG is the number of iterations of prolog loop. If it's integer const, the integer number is also passed - in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (included) of - number of iterations of prolog loop. VFM1 is vector factor minus one. - If CHECK_PROFITABILITY is true, TH is the threshold below which scalar - (rather than vectorized) loop will be executed. This function stores - upper bound (included) of the result in BOUND_SCALAR. */ + in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the + number of iterations of the prolog loop. BOUND_EPILOG is the corresponding + value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the + threshold below which the scalar (rather than vectorized) loop will be + executed. This function stores the upper bound (inclusive) of the result + in BOUND_SCALAR. 
*/ static tree vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog, - int bound_prolog, poly_int64 vfm1, int th, + int bound_prolog, poly_int64 bound_epilog, int th, poly_uint64 *bound_scalar, bool check_profitability) { tree type = TREE_TYPE (niters_prolog); tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog, - build_int_cst (type, vfm1)); + build_int_cst (type, bound_epilog)); - *bound_scalar = vfm1 + bound_prolog; + *bound_scalar = bound_prolog + bound_epilog; if (check_profitability) { /* TH indicates the minimum niters of vectorized loop, while we @@ -1861,18 +1863,18 @@ vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog, /* Peeling for constant times. */ if (int_niters_prolog >= 0) { - *bound_scalar = upper_bound (int_niters_prolog + vfm1, th); + *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th); return build_int_cst (type, *bound_scalar); } - /* Peeling for unknown times. Note BOUND_PROLOG is the upper - bound (inlcuded) of niters of prolog loop. */ - if (known_ge (th, vfm1 + bound_prolog)) + /* Peeling an unknown number of times. Note that both BOUND_PROLOG + and BOUND_EPILOG are inclusive upper bounds. */ + if (known_ge (th, bound_prolog + bound_epilog)) { *bound_scalar = th; return build_int_cst (type, th); } /* Need to do runtime comparison. 
*/ - else if (maybe_gt (th, vfm1)) + else if (maybe_gt (th, bound_epilog)) { *bound_scalar = upper_bound (*bound_scalar, th); return fold_build2 (MAX_EXPR, type, @@ -2405,14 +2407,20 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, tree type = TREE_TYPE (niters), guard_cond; basic_block guard_bb, guard_to; profile_probability prob_prolog, prob_vector, prob_epilog; - int bound_prolog = 0; - poly_uint64 bound_scalar = 0; int estimated_vf; int prolog_peeling = 0; if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); - bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) - || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + poly_uint64 bound_epilog = 0; + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) + bound_epilog += vf - 1; + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + bound_epilog += 1; + bool epilog_peeling = maybe_ne (bound_epilog, 0U); + poly_uint64 bound_scalar = bound_epilog; if (!prolog_peeling && !epilog_peeling) return NULL; @@ -2423,7 +2431,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, estimated_vf = 3; prob_prolog = prob_epilog = profile_probability::guessed_always () .apply_scale (estimated_vf - 1, estimated_vf); - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo); struct loop *first_loop = loop; @@ -2438,14 +2445,29 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } initialize_original_copy_tables (); + /* Record the anchor bb at which the guard should be placed if the scalar + loop might be preferred. */ + basic_block anchor = loop_preheader_edge (loop)->src; + + /* Generate the number of iterations for the prolog loop. We do this here + so that we can also get the upper bound on the number of iterations. 
*/ + tree niters_prolog; + int bound_prolog = 0; + if (prolog_peeling) + niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, + &bound_prolog); + else + niters_prolog = build_int_cst (type, 0); + /* Prolog loop may be skipped. */ bool skip_prolog = (prolog_peeling != 0); /* Skip to epilog if scalar loop may be preferred. It's only needed when we peel for epilog loop and when it hasn't been checked with loop versioning. */ - bool skip_vector = ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && !LOOP_REQUIRES_VERSIONING (loop_vinfo)) - || !vf.is_constant ()); + bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), + bound_prolog + bound_epilog) + : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); /* Epilog loop must be executed if the number of iterations for epilog loop is known at compile time, otherwise we need to add a check at the end of vector loop and skip to the end of epilog loop. */ @@ -2456,9 +2478,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) skip_epilog = false; - /* Record the anchor bb at which guard should be placed if scalar loop - may be preferred. */ - basic_block anchor = loop_preheader_edge (loop)->src; if (skip_vector) { split_edge (loop_preheader_edge (loop)); @@ -2476,7 +2495,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } } - tree niters_prolog = build_int_cst (type, 0); source_location loop_loc = find_loop_location (loop); struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); if (prolog_peeling) @@ -2500,9 +2518,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, first_loop = prolog; reset_original_copy_tables (); - /* Generate and update the number of iterations for prolog loop. */ - niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, - &bound_prolog); + /* Update the number of iterations for prolog loop. 
*/ tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog)); vect_set_loop_condition (prolog, NULL, niters_prolog, step_prolog, NULL_TREE, false); @@ -2577,10 +2593,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, if (skip_vector) { /* Additional epilogue iteration is peeled if gap exists. */ - bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling, - bound_prolog, - peel_for_gaps ? vf : vf - 1, + bound_prolog, bound_epilog, th, &bound_scalar, check_profitability); /* Build guard against NITERSM1 since NITERS may overflow. */ @@ -2664,14 +2678,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, else slpeel_update_phi_nodes_for_lcssa (epilog); - unsigned HOST_WIDE_INT bound1, bound2; - if (vf.is_constant (&bound1) && bound_scalar.is_constant (&bound2)) + unsigned HOST_WIDE_INT bound; + if (bound_scalar.is_constant (&bound)) { - bound1 -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 2; - if (bound2) - /* We share epilog loop with scalar version loop. */ - bound1 = MAX (bound1, bound2 - 1); - record_niter_bound (epilog, bound1, false, true); + gcc_assert (bound != 0); + /* -1 to convert loop iterations to latch iterations. */ + record_niter_bound (epilog, bound - 1, false, true); } delete_update_ssa (); |