Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 325
1 file changed, 305 insertions, 20 deletions
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index d679115..5deb780 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1121,12 +1121,15 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     versioning_threshold (0),
     vectorization_factor (0),
     max_vectorization_factor (0),
+    mask_compare_type (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
     vectorizable (false),
+    can_fully_mask_p (true),
+    fully_masked_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     operands_swapped (false),
@@ -1168,6 +1171,17 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
   gcc_assert (nbbs == loop->num_nodes);
 }
 
+/* Free all levels of MASKS.  */
+
+void
+release_vec_loop_masks (vec_loop_masks *masks)
+{
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (*masks, i, rgm)
+    rgm->masks.release ();
+  masks->release ();
+}
 
 /* Free all memory used by the _loop_vec_info, as well as all the
    stmt_vec_info structs of all the stmts in the loop.  */
@@ -1233,9 +1247,98 @@ _loop_vec_info::~_loop_vec_info ()
   free (bbs);
 
+  release_vec_loop_masks (&masks);
+
   loop->aux = NULL;
 }
 
+/* Return true if we can use CMP_TYPE as the comparison type to produce
+   all masks required to mask LOOP_VINFO.  */
+
+static bool
+can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
+{
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+    if (rgm->mask_type != NULL_TREE
+        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
+                                            cmp_type, rgm->mask_type,
+                                            OPTIMIZE_FOR_SPEED))
+      return false;
+  return true;
+}
+
+/* Calculate the maximum number of scalars per iteration for every
+   rgroup in LOOP_VINFO.  */
+
+static unsigned int
+vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
+{
+  unsigned int res = 1;
+  unsigned int i;
+  rgroup_masks *rgm;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+    res = MAX (res, rgm->max_nscalars_per_iter);
+  return res;
+}
+
+/* Each statement in LOOP_VINFO can be masked where necessary.  Check
+   whether we can actually generate the masks required.  Return true if so,
+   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
+
+static bool
+vect_verify_full_masking (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned int min_ni_width;
+
+  /* Get the maximum number of iterations that is representable
+     in the counter type.  */
+  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+  /* Get a more refined estimate for the number of iterations.  */
+  widest_int max_back_edges;
+  if (max_loop_iterations (loop, &max_back_edges))
+    max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+  /* Account for rgroup masks, in which each bit is replicated N times.  */
+  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+
+  /* Work out how many bits we need to represent the limit.  */
+  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+
+  /* Find a scalar mode for which WHILE_ULT is supported.  */
+  opt_scalar_int_mode cmp_mode_iter;
+  tree cmp_type = NULL_TREE;
+  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
+    {
+      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+      if (cmp_bits >= min_ni_width
+          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
+        {
+          tree this_type = build_nonstandard_integer_type (cmp_bits, true);
+          if (this_type
+              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
+            {
+              /* Although we could stop as soon as we find a valid mode,
+                 it's often better to continue until we hit Pmode, since the
+                 operands to the WHILE are more likely to be reusable in
+                 address calculations.  */
+              cmp_type = this_type;
+              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
+                break;
+            }
+        }
+    }
+
+  if (!cmp_type)
+    return false;
+
+  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  return true;
+}
 
 /* Calculate the cost of one scalar iteration of the loop.  */
 
 static void
@@ -1980,6 +2083,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
       vect_update_vf_for_slp (loop_vinfo);
     }
 
+  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+
+  /* We don't expect to have to roll back to anything other than an empty
+     set of rgroups.  */
+  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+
   /* This is the point where we can re-start analysis with SLP forced off.  */
 start_over:
@@ -2068,11 +2177,47 @@ start_over:
       return false;
     }
 
+  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "can't use a fully-masked loop because peeling for"
+                         " gaps is required.\n");
+    }
+
+  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "can't use a fully-masked loop because peeling for"
+                         " alignment is required.\n");
+    }
+
+  /* Decide whether to use a fully-masked loop for this vectorization
+     factor.  */
+  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+       && vect_verify_full_masking (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "using a fully-masked loop.\n");
+      else
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "not using a fully-masked loop.\n");
+    }
+
   /* If epilog loop is required because of data accesses with gaps,
      one additional iteration needs to be peeled.  Check if there is
      enough iterations for vectorization.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
@@ -2153,8 +2298,11 @@ start_over:
   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
   unsigned HOST_WIDE_INT const_vf;
-  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    /* The main loop handles all iterations.  */
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
     {
       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
@@ -2212,7 +2360,8 @@ start_over:
         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 
       /* Niters for at least one iteration of vectorized loop.  */
-      niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
         niters_th += 1;
@@ -2315,11 +2464,14 @@ again:
   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
+  /* Reset accumulated rgroup information.  */
+  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
+  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
 
   goto start_over;
 }
@@ -3523,7 +3675,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
 
   /* Add additional cost for the peeled instructions in prologue and epilogue
-     loop.
+     loop.  (For fully-masked loops there will be no peeling.)
 
      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
      at compile-time - we assume it's vf/2 (the worst would be vf-1).
@@ -3531,7 +3683,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (npeel < 0)
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      peel_iters_prologue = 0;
+      peel_iters_epilogue = 0;
+    }
+  else if (npeel < 0)
     {
       peel_iters_prologue = assumed_vf / 2;
       dump_printf (MSG_NOTE, "cost model: "
@@ -3762,8 +3919,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                  "  Calculated minimum iters for profitability: %d\n",
                  min_profitable_iters);
 
-  /* We want the vectorized loop to execute at least once.  */
-  if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
+    /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
 
   if (dump_enabled_p ())
@@ -6737,6 +6895,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use a fully-masked loop due to "
+                             "reduction operation.\n");
+          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+        }
+
       if (first_p)
         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
@@ -7557,8 +7724,19 @@ vectorizable_live_operation (gimple *stmt,
     }
 
   if (!vec_stmt)
-    /* No transformation required.  */
-    return true;
+    {
+      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use a fully-masked loop because "
+                             "a value is live outside the loop.\n");
+          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+        }
+
+      /* No transformation required.  */
+      return true;
+    }
 
   /* If stmt has a related stmt, then use that for getting the lhs.  */
   if (is_pattern_stmt_p (stmt_info))
@@ -7573,6 +7751,8 @@ vectorizable_live_operation (gimple *stmt,
              : TYPE_SIZE (TREE_TYPE (vectype)));
   vec_bitsize = TYPE_SIZE (vectype);
 
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
   if (slp_node)
@@ -7706,6 +7886,97 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo)
   return false;
 }
 
+/* Return a mask type with half the number of elements as TYPE.  */
+
+tree
+vect_halve_mask_nunits (tree type)
+{
+  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
+  return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Return a mask type with twice as many elements as TYPE.  */
+
+tree
+vect_double_mask_nunits (tree type)
+{
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
+  return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
+   contain a sequence of NVECTORS masks that each control a vector of type
+   VECTYPE.  */
+
+void
+vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
+                       unsigned int nvectors, tree vectype)
+{
+  gcc_assert (nvectors != 0);
+  if (masks->length () < nvectors)
+    masks->safe_grow_cleared (nvectors);
+  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+  /* The number of scalars per iteration and the number of vectors are
+     both compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      rgm->max_nscalars_per_iter = nscalars_per_iter;
+      rgm->mask_type = build_same_sized_truth_vector_type (vectype);
+    }
+}
+
+/* Given a complete set of masks MASKS, extract mask number INDEX
+   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
+   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
+
+   See the comment above vec_loop_masks for more details about the mask
+   arrangement.  */
+
+tree
+vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
+                    unsigned int nvectors, tree vectype, unsigned int index)
+{
+  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+  tree mask_type = rgm->mask_type;
+
+  /* Populate the rgroup's mask array, if this is the first time we've
+     used it.  */
+  if (rgm->masks.is_empty ())
+    {
+      rgm->masks.safe_grow_cleared (nvectors);
+      for (unsigned int i = 0; i < nvectors; ++i)
+        {
+          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
+          /* Provide a dummy definition until the real one is available.  */
+          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+          rgm->masks[i] = mask;
+        }
+    }
+
+  tree mask = rgm->masks[index];
+  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
+                TYPE_VECTOR_SUBPARTS (vectype)))
+    {
+      /* A loop mask for data type X can be reused for data type Y
+         if X has N times more elements than Y and if Y's elements
+         are N times bigger than X's.  In this case each sequence
+         of N elements in the loop mask will be all-zero or all-one.
+         We can then view-convert the mask so that each sequence of
+         N elements is replaced by a single element.  */
+      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
+                              TYPE_VECTOR_SUBPARTS (vectype)));
+      gimple_seq seq = NULL;
+      mask_type = build_same_sized_truth_vector_type (vectype);
+      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
+      if (seq)
+        gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+    }
+  return mask;
+}
+
 /* Scale profiling counters by estimation for LOOP
    which is vectorized by factor VF.  */
 
@@ -7840,9 +8111,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                               &step_vector, &niters_vector_mult_vf, th,
                               check_profitability, niters_no_overflow);
+
   if (niters_vector == NULL_TREE)
     {
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
+      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+          && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+          && known_eq (lowest_vf, vf))
         {
           niters_vector
             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
@@ -8124,13 +8398,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
   if (integer_onep (step_vector))
     niters_no_overflow = true;
-  slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
-                                   niters_vector_mult_vf,
-                                   !niters_no_overflow);
+  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
+                           niters_vector_mult_vf, !niters_no_overflow);
 
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
   scale_profile_for_vect_loop (loop, assumed_vf);
 
+  /* True if the final iteration might not handle a full vector's
+     worth of scalar iterations.  */
+  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
      is 1 when peeling for gaps because we always need a final scalar
      iteration.  */
@@ -8143,16 +8419,25 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      back to latch counts.  */
   if (loop->any_upper_bound)
     loop->nb_iterations_upper_bound
-      = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
-                        lowest_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+                          lowest_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+                           lowest_vf) - 1);
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
-      = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
-                        lowest_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
+                          lowest_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
+                           lowest_vf) - 1);
   if (loop->any_estimate)
     loop->nb_iterations_estimate
-      = wi::udiv_floor (loop->nb_iterations_estimate + bias,
-                        assumed_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+                          assumed_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+                           assumed_vf) - 1);
 
   if (dump_enabled_p ())
     {
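
As a rough illustration of the loop shape this patch targets (not part of the diff above), the C sketch below models a fully-masked loop in scalar code: the loop always steps by a whole vector, and an IFN_WHILE_ULT-style mask switches off the lanes that would run past the trip count, so no prologue or epilogue peeling is needed. The VF constant and the while_ult helper are hypothetical stand-ins for the target's vectorization factor and WHILE_ULT operation.

#include <stdbool.h>
#include <stdio.h>

#define VF 4  /* assumed vectorization factor (illustrative only) */

/* Scalar model of WHILE_ULT: lane L of MASK is set iff BASE + L < LIMIT.  */
static void
while_ult (unsigned long base, unsigned long limit, bool mask[VF])
{
  for (unsigned int lane = 0; lane < VF; ++lane)
    mask[lane] = base + lane < limit;
}

int
main (void)
{
  double a[10] = { 0 }, b[10];
  unsigned long n = 10;

  for (unsigned long i = 0; i < n; ++i)
    b[i] = (double) i;

  /* The fully-masked loop steps by a whole vector each time; in the final
     iteration the mask simply disables the lanes beyond N, so no scalar
     epilogue is required even when N is not a multiple of VF.  */
  for (unsigned long i = 0; i < n; i += VF)
    {
      bool mask[VF];
      while_ult (i, n, mask);
      for (unsigned int lane = 0; lane < VF; ++lane)
        if (mask[lane])
          a[i + lane] = 2.0 * b[i + lane];
    }

  for (unsigned long i = 0; i < n; ++i)
    printf ("%g\n", a[i]);
  return 0;
}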