author     Kugan Vivekanandarajah <kuganv@linaro.org>   2019-06-13 03:34:28 +0000
committer  Kugan Vivekanandarajah <kugan@gcc.gnu.org>   2019-06-13 03:34:28 +0000
commit     9b884225bfc609606f9b169b021c4da93feba48e (patch)
tree       28c9e494f42ea0d6f529cb8033d031f8f2b46f04 /gcc/tree-vect-loop.c
parent     fa9863e7d34ecd011ae75083be2ae124e5831b64 (diff)
re PR target/88838 ([SVE] Use 32-bit WHILELO in LP64 mode)
gcc/ChangeLog:
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
compare_type does not have Pmode size, create an IV with Pmode size
and truncate its uses (i.e. convert them to the required type); see
the illustrative loop after the ChangeLog.
* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
(vect_iv_limit_for_full_masking): New. Factored out of
vect_set_loop_condition_masked.
* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
(vect_iv_limit_for_full_masking): Declare.
gcc/testsuite/ChangeLog:
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* gcc.target/aarch64/pr88838.c: New test.
* gcc.target/aarch64/sve/while_1.c: Adjust.
From-SVN: r272233
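For context, PR target/88838 concerns loops of roughly the following shape
(an illustrative sketch only, not necessarily the exact new test in
gcc.target/aarch64/pr88838.c). Before this change, the fully-masked SVE loop
compared the counter in a Pmode-sized (64-bit) type, producing a 64-bit
WHILELO; with it, the comparison can stay 32-bit while the IV itself is
created with Pmode size and truncated at its uses:

/* Illustrative loop: with a 32-bit element count in LP64 mode, the
   loop guard can now be a 32-bit WHILELO (W registers) rather than
   a 64-bit one (X registers).  */
void
f (int *restrict x, int n)
{
  for (int i = 0; i < n; ++i)
    x[i] = i;
}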
Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c | 85
1 file changed, 80 insertions(+), 5 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4942c69..671ef2f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+                                      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
           && can_produce_all_loop_masks_p (loop_vinfo, this_type))
         {
           /* Although we could stop as soon as we find a valid mode,
-             it's often better to continue until we hit Pmode, since the
-             operands to the WHILE are more likely to be reusable in
-             address calculations.  */
-          cmp_type = this_type;
+             there are at least two reasons why that's not always the
+             best choice:
+
+             - An IV that's Pmode or wider is more likely to be reusable
+               in address calculations than an IV that's narrower than
+               Pmode.
+
+             - Doing the comparison in IV_PRECISION or wider allows
+               a natural 0-based IV, whereas using a narrower comparison
+               type requires mitigations against wrap-around.
+
+             Conversely, if the IV limit is variable, doing the comparison
+             in a wider type than the original type can introduce
+             unnecessary extensions, so picking the widest valid mode
+             is not always a good choice either.
+
+             Here we prefer the first IV type that's Pmode or wider,
+             and the first comparison type that's IV_PRECISION or wider.
+             (The comparison type must be no wider than the IV type,
+             to avoid extensions in the vector loop.)
+
+             ??? We might want to try continuing beyond Pmode for ILP32
+             targets if CMP_BITS < IV_PRECISION.  */
+          iv_type = this_type;
+          if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+            cmp_type = this_type;
           if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
             break;
         }
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+        {
+          /* Add the maximum number of skipped iterations to the
+             maximum iteration count.  */
+          if (TREE_CODE (niters_skip) == INTEGER_CST)
+            iv_limit += wi::to_widest (niters_skip);
+          else
+            iv_limit += max_vf - 1;
+        }
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+        /* Make a conservatively-correct assumption.  */
+        iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+         the maximum in-range IV value.  Round this value down to the previous
+         vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
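The two pieces of new logic in this diff can be modelled with a standalone
sketch (plain C with hypothetical fixed values; GCC's widest_int, poly_uint64
and mode iteration are replaced by ordinary integers, so this is only a model
of the selection and rounding rules, not the patch itself):

#include <stdio.h>

/* Model of the comparison-type/IV-type selection in
   vect_verify_full_masking: walk the scalar integer widths in
   increasing order, keep replacing the comparison width until it is
   at least IV_PRECISION, and stop at the first width that is at
   least Pmode (assumed to be 64 bits, i.e. LP64).  Every width is
   assumed to support WHILE_ULT and all required loop masks.  */
static void
pick_types (int iv_precision, int pmode_bits,
            int *cmp_bits_out, int *iv_bits_out)
{
  static const int widths[] = { 8, 16, 32, 64 };
  int cmp_bits = 0;

  for (unsigned int i = 0; i < sizeof widths / sizeof widths[0]; i++)
    {
      int this_bits = widths[i];
      *iv_bits_out = this_bits;
      if (cmp_bits == 0 || iv_precision > cmp_bits)
        cmp_bits = this_bits;
      if (this_bits >= pmode_bits)
        break;
    }
  *cmp_bits_out = cmp_bits;
}

/* Model of the final step of vect_iv_limit_for_full_masking: round the
   maximum latch-iteration count down to the previous vector alignment
   boundary, then add one full vector iteration.  */
static long
round_iv_limit (long iv_limit, int vf_align, int max_vf)
{
  return (iv_limit & -(long) vf_align) + max_vf;
}

int
main (void)
{
  int cmp_bits, iv_bits;

  /* A 32-bit iteration count in LP64: expect a 32-bit comparison type
     (hence a 32-bit WHILELO) and a 64-bit (Pmode) IV.  */
  pick_types (32, 64, &cmp_bits, &iv_bits);
  printf ("cmp_bits = %d, iv_bits = %d\n", cmp_bits, iv_bits);

  /* E.g. 1003 latch iterations with a vector alignment of 4 and
     max_vf = 4: (1003 & -4) + 4 = 1000 + 4 = 1004.  */
  printf ("iv_limit = %ld\n", round_iv_limit (1003, 4, 4));
  return 0;
}

Under these assumptions the sketch prints cmp_bits = 32, iv_bits = 64 and
iv_limit = 1004, matching the behaviour the PR title asks for: a 32-bit
WHILELO in LP64 mode.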