author     Giuliano Belinassi <giuliano.belinassi@usp.br>  2020-08-22 17:43:43 -0300
committer  Giuliano Belinassi <giuliano.belinassi@usp.br>  2020-08-22 17:43:43 -0300
commit     a926878ddbd5a98b272c22171ce58663fc04c3e0 (patch)
tree       86af256e5d9a9c06263c00adc90e5fe348008c43 /gcc/tree-vect-loop.c
parent     542730f087133690b47e036dfd43eb0db8a650ce (diff)
parent     07cbaed8ba7d1b6e4ab3a9f44175502a4e1ecdb1 (diff)
Merge branch 'autopar_rebase2' into autopar_devel  (devel/autopar_devel)
Quickly commit changes in the rebase branch.
Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c | 1306
1 file changed, 773 insertions(+), 533 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c4c3cc9..dba230f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -342,6 +342,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
opt_result res
= vect_determine_vf_for_stmt (loop_vinfo,
@@ -800,7 +802,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorization_factor (0),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
- mask_compare_type (NULL_TREE),
+ rgroup_compare_type (NULL_TREE),
simd_if_cond (NULL_TREE),
unaligned_dr (NULL),
peeling_for_alignment (0),
@@ -812,8 +814,9 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vec_outside_cost (0),
vec_inside_cost (0),
vectorizable (false),
- can_fully_mask_p (true),
- fully_masked_p (false),
+ can_use_partial_vectors_p (true),
+ using_partial_vectors_p (false),
+ epil_using_partial_vectors_p (false),
peeling_for_gaps (false),
peeling_for_niter (false),
no_data_dependencies (false),
@@ -847,6 +850,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
{
gimple *stmt = gsi_stmt (si);
gimple_set_uid (stmt, 0);
+ if (is_gimple_debug (stmt))
+ continue;
add_stmt (stmt);
/* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
third argument is the #pragma omp simd if (x) condition, when 0,
@@ -874,16 +879,16 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
epilogue_vinfos.create (6);
}
-/* Free all levels of MASKS. */
+/* Free all levels of rgroup CONTROLS. */
void
-release_vec_loop_masks (vec_loop_masks *masks)
+release_vec_loop_controls (vec<rgroup_controls> *controls)
{
- rgroup_masks *rgm;
+ rgroup_controls *rgc;
unsigned int i;
- FOR_EACH_VEC_ELT (*masks, i, rgm)
- rgm->masks.release ();
- masks->release ();
+ FOR_EACH_VEC_ELT (*controls, i, rgc)
+ rgc->controls.release ();
+ controls->release ();
}
/* Free all memory used by the _loop_vec_info, as well as all the
@@ -893,7 +898,8 @@ _loop_vec_info::~_loop_vec_info ()
{
free (bbs);
- release_vec_loop_masks (&masks);
+ release_vec_loop_controls (&masks);
+ release_vec_loop_controls (&lens);
delete ivexpr_map;
delete scan_map;
epilogue_vinfos.release ();
@@ -934,12 +940,12 @@ cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
unsigned int i;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
- if (rgm->mask_type != NULL_TREE
+ if (rgm->type != NULL_TREE
&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
- cmp_type, rgm->mask_type,
+ cmp_type, rgm->type,
OPTIMIZE_FOR_SPEED))
return false;
return true;
@@ -953,20 +959,45 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
unsigned int res = 1;
unsigned int i;
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
res = MAX (res, rgm->max_nscalars_per_iter);
return res;
}
+/* Calculate the minimum precision necessary to represent:
+
+ MAX_NITERS * FACTOR
+
+ as an unsigned integer, where MAX_NITERS is the maximum number of
+ loop header iterations for the original scalar form of LOOP_VINFO. */
+
+static unsigned
+vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
+{
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+ /* Get the maximum number of iterations that is representable
+ in the counter type. */
+ tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+ widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+ /* Get a more refined estimate for the number of iterations. */
+ widest_int max_back_edges;
+ if (max_loop_iterations (loop, &max_back_edges))
+ max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+ /* Work out how many bits we need to represent the limit. */
+ return wi::min_precision (max_ni * factor, UNSIGNED);
+}
+
/* Each statement in LOOP_VINFO can be masked where necessary. Check
whether we can actually generate the masks required. Return true if so,
- storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
+ storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
- class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int min_ni_width;
unsigned int max_nscalars_per_iter
= vect_get_max_nscalars_per_iter (loop_vinfo);
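A worked example of vect_min_prec_for_max_niters above, with assumed numbers: if the niters counter is a 32-bit unsigned type and max_loop_iterations bounds the back-edge count by 999, then max_ni = min (2^32, 1000) = 1000; with FACTOR = 2 the limit is 2000 and the function returns 11, the smallest number of bits that holds 2000 unsigned. A minimal self-contained C model of that computation — an illustrative sketch, not the GCC wide-int implementation:

/* Hypothetical model of wi::min_precision (max_ni * factor, UNSIGNED)
   for values that fit in unsigned long long.  */
static unsigned
min_prec_model (unsigned long long max_ni, unsigned factor)
{
  unsigned long long limit = max_ni * factor;
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec ? prec : 1;	/* min_prec_model (1000, 2) == 11.  */
}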
@@ -977,27 +1008,15 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
return false;
- /* Get the maximum number of iterations that is representable
- in the counter type. */
- tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
- widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
-
- /* Get a more refined estimate for the number of iterations. */
- widest_int max_back_edges;
- if (max_loop_iterations (loop, &max_back_edges))
- max_ni = wi::smin (max_ni, max_back_edges + 1);
-
- /* Account for rgroup masks, in which each bit is replicated N times. */
- max_ni *= max_nscalars_per_iter;
-
/* Work out how many bits we need to represent the limit. */
- min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+ min_ni_width
+ = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
/* Find a scalar mode for which WHILE_ULT is supported. */
opt_scalar_int_mode cmp_mode_iter;
tree cmp_type = NULL_TREE;
tree iv_type = NULL_TREE;
- widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+ widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
unsigned int iv_precision = UINT_MAX;
if (iv_limit != -1)
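For reference, IFN_WHILE_ULT — the internal function whose support is probed here — produces a mask in which lane I is active iff BASE + I < LIMIT, roughly:

  mask[i] = (base + i < limit)   for each lane i

so CMP_TYPE must be a scalar integer type wide enough for the IV limit computed above. (A sketch of the documented while_ult semantics, not code from this patch.)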
@@ -1050,8 +1069,83 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
if (!cmp_type)
return false;
- LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
- LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+ return true;
+}
+
+/* Check whether we can use vector access with length based on precision
+ comparison. So far, to keep it simple, we only allow the case that the
+ precision of the target-supported length is larger than the precision
+ required by loop niters. */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+ if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ return false;
+
+ unsigned int max_nitems_per_iter = 1;
+ unsigned int i;
+ rgroup_controls *rgl;
+ /* Find the maximum number of items per iteration for every rgroup. */
+ FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+ {
+ unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+ max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+ }
+
+ /* Work out how many bits we need to represent the length limit. */
+ unsigned int min_ni_prec
+ = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+ /* Now use the maximum of the precisions below for one suitable IV type:
+ - the IV's natural precision
+ - the precision needed to hold: the maximum number of scalar
+ iterations multiplied by the scale factor (min_ni_prec above)
+ - the Pmode precision
+
+ If min_ni_prec is less than the precision of the current niters,
+ we prefer to still use the niters type. Prefer to use Pmode and
+ a wider IV to avoid narrow conversions. */
+
+ unsigned int ni_prec
+ = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+ min_ni_prec = MAX (min_ni_prec, ni_prec);
+ min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+ tree iv_type = NULL_TREE;
+ opt_scalar_int_mode tmode_iter;
+ FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+ {
+ scalar_mode tmode = tmode_iter.require ();
+ unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+ /* ??? Do we really want to construct one IV whose precision exceeds
+ BITS_PER_WORD? */
+ if (tbits > BITS_PER_WORD)
+ break;
+
+ /* Find the first available standard integral type. */
+ if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+ {
+ iv_type = build_nonstandard_integer_type (tbits, true);
+ break;
+ }
+ }
+
+ if (!iv_type)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize with length-based partial vectors"
+ " because there is no suitable iv type.\n");
+ return false;
+ }
+
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
return true;
}
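A worked example of the IV-type selection above, with assumed numbers: if the niters bound requires min_ni_prec = 20 bits, the niters type is 32 bits and Pmode is 64 bits, then min_ni_prec becomes max (20, 32, 64) = 64, and the mode walk picks the first supported scalar integer mode of at least 64 bits that does not exceed BITS_PER_WORD — on a typical 64-bit target DImode, giving a 64-bit unsigned IV type.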
@@ -1124,8 +1218,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
j, si)
(void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
- si->kind, si->stmt_info, si->misalign,
- vect_body);
+ si->kind, si->stmt_info, si->vectype,
+ si->misalign, vect_body);
unsigned dummy, body_cost = 0;
finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
destroy_cost_data (target_cost_data);
@@ -1393,6 +1487,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
stmt_info = vect_stmt_to_vectorize (stmt_info);
if ((STMT_VINFO_RELEVANT_P (stmt_info)
@@ -1554,7 +1650,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
&& ! PURE_SLP_STMT (stmt_info))
ok = vectorizable_induction (loop_vinfo,
- stmt_info, NULL, NULL, NULL,
+ stmt_info, NULL, NULL,
&cost_vec);
else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
|| (STMT_VINFO_DEF_TYPE (stmt_info)
@@ -1584,7 +1680,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
gsi_next (&si))
{
gimple *stmt = gsi_stmt (si);
- if (!gimple_clobber_p (stmt))
+ if (!gimple_clobber_p (stmt)
+ && !is_gimple_debug (stmt))
{
opt_result res
= vect_analyze_stmt (loop_vinfo,
@@ -1617,6 +1714,27 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
return opt_result::success ();
}
+/* Return true if we know that the iteration count is smaller than the
+ vectorization factor. Return false if it isn't, or if we can't be sure
+ either way. */
+
+static bool
+vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
+{
+ unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+
+ HOST_WIDE_INT max_niter;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
+ else
+ max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+ if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+ return true;
+
+ return false;
+}
+
/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
is worthwhile to vectorize. Return 1 if definitely yes, 0 if
definitely no, or -1 if it's worth retrying. */
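A worked example of vect_known_niters_smaller_than_vf above: with an assumed VF of 4, a loop whose iteration count is known to be 3 returns true; a loop whose count is unknown but bounded by 100 returns false, as does one with no known bound (max_niter == -1).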
@@ -1627,19 +1745,11 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
- /* Only fully-masked loops can have iteration counts less than the
- vectorization factor. */
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ /* Only loops that can handle partially-populated vectors can have iteration
+ counts less than the vectorization factor. */
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
- HOST_WIDE_INT max_niter;
-
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
- max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
- else
- max_niter = max_stmt_executions_int (loop);
-
- if (max_niter != -1
- && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+ if (vect_known_niters_smaller_than_vf (loop_vinfo))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1810,7 +1920,7 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
- vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+ vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
gcc_assert (DR_REF (dr));
@@ -1865,7 +1975,7 @@ determine_peel_for_niter (loop_vec_info loop_vinfo)
th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
(loop_vinfo));
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
/* The main loop handles all iterations. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -2049,9 +2159,13 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
/* Update the vectorization factor based on the SLP decision. */
vect_update_vf_for_slp (loop_vinfo);
+
+ /* Optimize the SLP graph with the vectorization factor fixed. */
+ vect_optimize_slp (loop_vinfo);
}
- bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+ bool saved_can_use_partial_vectors_p
+ = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
/* We don't expect to have to roll back to anything other than an empty
set of rgroups. */
@@ -2099,8 +2213,6 @@ start_over:
/* This pass will decide on using loop versioning and/or loop peeling in
order to enhance the alignment of data references in the loop. */
ok = vect_enhance_data_refs_alignment (loop_vinfo);
- else
- ok = vect_verify_datarefs_alignment (loop_vinfo);
if (!ok)
return ok;
@@ -2133,19 +2245,56 @@ start_over:
return ok;
}
- /* Decide whether to use a fully-masked loop for this vectorization
- factor. */
- LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
- = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
- && vect_verify_full_masking (loop_vinfo));
+ /* For now, we don't expect to mix the masking and length approaches for
+ one loop; disable partial vectors if both are recorded. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize a loop with partial vectors"
+ " because we don't expect to mix different"
+ " approaches with partial vectors for the"
+ " same loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+
+ /* Decide whether to vectorize a loop with partial vectors for
+ this vectorization factor. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ if (param_vect_partial_vector_usage == 0)
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ else if (vect_verify_full_masking (loop_vinfo)
+ || vect_verify_loop_lens (loop_vinfo))
+ {
+ /* The epilogue, and other cases where niters is known to be less than
+ VF, can still use vector accesses with length fully. */
+ if (param_vect_partial_vector_usage == 1
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo))
+ {
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
if (dump_enabled_p ())
{
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
dump_printf_loc (MSG_NOTE, vect_location,
- "using a fully-masked loop.\n");
+ "operating on partial vectors.\n");
else
dump_printf_loc (MSG_NOTE, vect_location,
- "not using a fully-masked loop.\n");
+ "operating only on full vectors.\n");
}
/* If epilog loop is required because of data accesses with gaps,
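The decision chain above can be read as a table keyed on param_vect_partial_vector_usage (the values below reflect how this hunk uses the parameter; see its documentation for the authoritative meaning):

  0  never operate on partial vectors;
  1  use full vectors in the main loop and mark the epilogue to use
     partial vectors, unless this loop already is an epilogue or its
     iteration count is known to be smaller than VF;
  otherwise  operate on partial vectors for this loop, provided
     vect_verify_full_masking or vect_verify_loop_lens succeeded.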
@@ -2153,7 +2302,7 @@ start_over:
enough iterations for vectorization. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
&& LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
@@ -2164,10 +2313,11 @@ start_over:
" support peeling for gaps.\n");
}
- /* If we're vectorizing an epilogue loop, we either need a fully-masked
- loop or a loop that has a lower VF than the main loop. */
+ /* If we're vectorizing an epilogue loop, the vectorized loop either needs
+ to be able to handle fewer than VF scalars, or needs to have a lower VF
+ than the main loop. */
if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
return opt_result::failure_at (vect_location,
@@ -2238,7 +2388,7 @@ start_over:
}
/* Niters for at least one iteration of vectorized loop. */
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
@@ -2342,6 +2492,8 @@ again:
for (gimple_stmt_iterator si = gsi_start_bb (bb);
!gsi_end_p (si); gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
STMT_SLP_TYPE (stmt_info) = loop_vect;
if (STMT_VINFO_IN_PATTERN_P (stmt_info))
@@ -2365,13 +2517,15 @@ again:
LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
/* Reset accumulated rgroup information. */
- release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ = saved_can_use_partial_vectors_p;
goto start_over;
}
@@ -2651,7 +2805,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
lowest_th = ordered_min (lowest_th, th);
}
else
- delete loop_vinfo;
+ {
+ delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
+ }
/* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2676,6 +2833,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
else
{
delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
if (fatal)
{
gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2683,6 +2841,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
}
}
+ /* Handle the case in which the original loop can use partial
+ vectorization, but we want to adopt it only for the epilogue.
+ The retry should be in the same mode as the original. */
+ if (vect_epilogues
+ && loop_vinfo
+ && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis with same vector mode"
+ " %s for epilogue with partial vectors.\n",
+ GET_MODE_NAME (loop_vinfo->vector_mode));
+ continue;
+ }
+
if (mode_i < vector_modes.length ()
&& VECTOR_MODE_P (autodetected_vector_mode)
&& (related_vector_mode (vector_modes[mode_i],
@@ -3299,42 +3474,58 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
return NULL;
}
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
- int *peel_iters_epilogue,
- stmt_vector_for_cost *scalar_cost_vec,
- stmt_vector_for_cost *prologue_cost_vec,
- stmt_vector_for_cost *epilogue_cost_vec)
+/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
+ PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
+ or -1 if not known. */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
{
- int retval = 0;
int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
- if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
{
- *peel_iters_epilogue = assumed_vf / 2;
if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
+ dump_printf_loc (MSG_NOTE, vect_location,
"cost model: epilogue peel iters set to vf/2 "
"because loop iterations are unknown .\n");
-
- /* If peeled iterations are known but number of scalar loop
- iterations are unknown, count a taken branch per peeled loop. */
- retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
- retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
- NULL, 0, vect_epilogue);
+ return assumed_vf / 2;
}
else
{
int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
- peel_iters_prologue = niters < peel_iters_prologue ?
- niters : peel_iters_prologue;
- *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
+ peel_iters_prologue = MIN (niters, peel_iters_prologue);
+ int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
/* If we need to peel for gaps, but no peeling is required, we have to
peel VF iterations. */
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
- *peel_iters_epilogue = assumed_vf;
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
+ peel_iters_epilogue = assumed_vf;
+ return peel_iters_epilogue;
+ }
+}
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+ int *peel_iters_epilogue,
+ stmt_vector_for_cost *scalar_cost_vec,
+ stmt_vector_for_cost *prologue_cost_vec,
+ stmt_vector_for_cost *epilogue_cost_vec)
+{
+ int retval = 0;
+
+ *peel_iters_epilogue
+ = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
+
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ if (peel_iters_prologue > 0)
+ retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_prologue);
+ if (*peel_iters_epilogue > 0)
+ retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_epilogue);
}
stmt_info_for_cost *si;
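A worked example of vect_get_peel_iters_epilogue above: with known niters = 10, peel_iters_prologue = 3 and an assumed VF of 4, the prologue is clamped to MIN (10, 3) = 3 and the epilogue peels (10 - 3) % 4 = 3 iterations; if PEELING_FOR_GAPS were set and that remainder were 0, a full VF (4) would be peeled instead. With unknown niters, or peel_iters_prologue == -1, the estimate falls back to assumed_vf / 2.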
@@ -3404,7 +3595,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
/* FIXME: Make cost depend on complexity of individual check. */
unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
(void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
"cost model: Adding cost of checks for loop "
@@ -3417,12 +3608,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
/* FIXME: Make cost depend on complexity of individual check. */
unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
(void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
if (len)
/* Count LEN - 1 ANDs and LEN comparisons. */
(void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
- scalar_stmt, NULL, 0, vect_prologue);
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
if (len)
{
@@ -3433,7 +3624,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
nstmts += 1;
(void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
- scalar_stmt, NULL, 0, vect_prologue);
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
@@ -3446,7 +3637,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
{
/* FIXME: Make cost depend on complexity of individual check. */
(void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
"cost model: Adding cost of checks for loop "
@@ -3455,7 +3646,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
(void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
/* Count statements in scalar loop. Using this as scalar cost for a single
iteration for now.
@@ -3477,30 +3668,116 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ bool prologue_need_br_taken_cost = false;
+ bool prologue_need_br_not_taken_cost = false;
+
+ /* Calculate peel_iters_prologue. */
+ if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ peel_iters_prologue = 0;
+ else if (npeel < 0)
{
- peel_iters_prologue = 0;
- peel_iters_epilogue = 0;
+ peel_iters_prologue = assumed_vf / 2;
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "cost model: "
+ "prologue peel iters set to vf/2.\n");
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- {
- /* We need to peel exactly one iteration. */
- peel_iters_epilogue += 1;
- stmt_info_for_cost *si;
- int j;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
- j, si)
- (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
- si->kind, si->stmt_info, si->misalign,
- vect_epilogue);
- }
+ /* If peeled iterations are unknown, count a taken branch and a not taken
+ branch per peeled loop. Even if scalar loop iterations are known,
+ vector iterations are not known since peeled prologue iterations are
+ not known. Hence guards remain the same. */
+ prologue_need_br_taken_cost = true;
+ prologue_need_br_not_taken_cost = true;
+ }
+ else
+ {
+ peel_iters_prologue = npeel;
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ prologue_need_br_taken_cost = true;
+ }
+
+ bool epilogue_need_br_taken_cost = false;
+ bool epilogue_need_br_not_taken_cost = false;
+
+ /* Calculate peel_iters_epilogue. */
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ /* We need to peel exactly one iteration for gaps. */
+ peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+ else if (npeel < 0)
+ {
+ /* If peeling for alignment is unknown, loop bound of main loop
+ becomes unknown. */
+ peel_iters_epilogue = assumed_vf / 2;
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "cost model: "
+ "epilogue peel iters set to vf/2 because "
+ "peeling for alignment is unknown.\n");
+
+ /* Same reasoning as for peel_iters_prologue above. */
+ epilogue_need_br_taken_cost = true;
+ epilogue_need_br_not_taken_cost = true;
+ }
+ else
+ {
+ peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ epilogue_need_br_taken_cost = true;
+ }
+
+ stmt_info_for_cost *si;
+ int j;
+ /* Add costs associated with peel_iters_prologue. */
+ if (peel_iters_prologue)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+ {
+ (void) add_stmt_cost (loop_vinfo, target_cost_data,
+ si->count * peel_iters_prologue, si->kind,
+ si->stmt_info, si->vectype, si->misalign,
+ vect_prologue);
+ }
+
+ /* Add costs associated with peel_iters_epilogue. */
+ if (peel_iters_epilogue)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+ {
+ (void) add_stmt_cost (loop_vinfo, target_cost_data,
+ si->count * peel_iters_epilogue, si->kind,
+ si->stmt_info, si->vectype, si->misalign,
+ vect_epilogue);
+ }
+
+ /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
+
+ if (prologue_need_br_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_prologue);
+
+ if (prologue_need_br_not_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+ cond_branch_not_taken, NULL, NULL_TREE, 0,
+ vect_prologue);
+
+ if (epilogue_need_br_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_epilogue);
+
+ if (epilogue_need_br_not_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+ cond_branch_not_taken, NULL, NULL_TREE, 0,
+ vect_epilogue);
+ /* Take care of special costs for rgroup controls of partial vectors. */
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
/* Calculate how many masks we need to generate. */
unsigned int num_masks = 0;
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
unsigned int num_vectors_m1;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
- if (rgm->mask_type)
+ if (rgm->type)
num_masks += num_vectors_m1 + 1;
gcc_assert (num_masks > 0);
@@ -3516,86 +3793,62 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
simpler and safer to use the worst-case cost; if this ends up
being the tie-breaker between vectorizing or not, then it's
probably better not to vectorize. */
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, num_masks, vector_stmt,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, num_masks - 1, vector_stmt,
- NULL, 0, vect_body);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
+ vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
+ vector_stmt, NULL, NULL_TREE, 0, vect_body);
}
- else if (npeel < 0)
+ else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
{
- peel_iters_prologue = assumed_vf / 2;
- if (dump_enabled_p ())
- dump_printf (MSG_NOTE, "cost model: "
- "prologue peel iters set to vf/2.\n");
+ /* Referring to the functions vect_set_loop_condition_partial_vectors
+ and vect_set_loop_controls_directly, we need to generate each
+ length in the prologue and in the loop body if required. Although
+ there are some possible optimizations, we consider the worst case
+ here. */
- /* If peeling for alignment is unknown, loop bound of main loop becomes
- unknown. */
- peel_iters_epilogue = assumed_vf / 2;
- if (dump_enabled_p ())
- dump_printf (MSG_NOTE, "cost model: "
- "epilogue peel iters set to vf/2 because "
- "peeling for alignment is unknown.\n");
+ bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+ bool need_iterate_p
+ = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo));
- /* If peeled iterations are unknown, count a taken branch and a not taken
- branch per peeled loop. Even if scalar loop iterations are known,
- vector iterations are not known since peeled prologue iterations are
- not known. Hence guards remain the same. */
- (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, 1, cond_branch_not_taken,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_epilogue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, 1, cond_branch_not_taken,
- NULL, 0, vect_epilogue);
- stmt_info_for_cost *si;
- int j;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
- {
- (void) add_stmt_cost (loop_vinfo, target_cost_data,
- si->count * peel_iters_prologue,
- si->kind, si->stmt_info, si->misalign,
- vect_prologue);
- (void) add_stmt_cost (loop_vinfo, target_cost_data,
- si->count * peel_iters_epilogue,
- si->kind, si->stmt_info, si->misalign,
- vect_epilogue);
- }
- }
- else
- {
- stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
- stmt_info_for_cost *si;
- int j;
- void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+ /* Calculate how many statements need to be added. */
+ unsigned int prologue_stmts = 0;
+ unsigned int body_stmts = 0;
- prologue_cost_vec.create (2);
- epilogue_cost_vec.create (2);
- peel_iters_prologue = npeel;
+ rgroup_controls *rgc;
+ unsigned int num_vectors_m1;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+ if (rgc->type)
+ {
+ /* May need one SHIFT for nitems_total computation. */
+ unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+ if (nitems != 1 && !niters_known_p)
+ prologue_stmts += 1;
+
+ /* May need one MAX and one MINUS for wrap around. */
+ if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
+ prologue_stmts += 2;
- (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
- &peel_iters_epilogue,
- &LOOP_VINFO_SCALAR_ITERATION_COST
- (loop_vinfo),
- &prologue_cost_vec,
- &epilogue_cost_vec);
+ /* Need one MAX and one MINUS for each batch limit except for
+ the first one. */
+ prologue_stmts += num_vectors_m1 * 2;
- FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
- (void) add_stmt_cost (loop_vinfo,
- data, si->count, si->kind, si->stmt_info,
- si->misalign, vect_prologue);
+ unsigned int num_vectors = num_vectors_m1 + 1;
- FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
- (void) add_stmt_cost (loop_vinfo,
- data, si->count, si->kind, si->stmt_info,
- si->misalign, vect_epilogue);
+ /* Need to set up lengths in the prologue; only one MIN is required
+ for each since the start index is zero. */
+ prologue_stmts += num_vectors;
- prologue_cost_vec.release ();
- epilogue_cost_vec.release ();
+ /* Each may need two MINs and one MINUS to update lengths in body
+ for next iteration. */
+ if (need_iterate_p)
+ body_stmts += 3 * num_vectors;
+ }
+
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
+ scalar_stmt, NULL, NULL_TREE, 0, vect_body);
}
/* FORNOW: The scalar outside cost is incremented in one of the
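Worked numbers for the rgroup-control costing above, under assumptions: a fully-masked loop whose mask rgroups at indices 0 and 1 both have a non-null type yields num_masks = 1 + 2 = 3, charging 3 vector_stmt costs to the prologue and 2 to the body. For the length-based branch, one rgroup with num_vectors = 2 and nitems = 4, unknown niters, an IV that might wrap, and need_iterate_p true gives prologue_stmts = 1 (SHIFT) + 2 (MAX/MINUS) + 2 (batch limits) + 2 (MINs) = 7 and body_stmts = 3 * 2 = 6.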
@@ -3731,8 +3984,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
}
/* ??? The "if" arm is written to handle all cases; see below for what
- we would do for !LOOP_VINFO_FULLY_MASKED_P. */
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Rewriting the condition above in terms of the number of
vector iterations (vniters) rather than the number of
@@ -3759,7 +4012,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
min_vec_niters);
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Now that we know the minimum number of vector iterations,
find the minimum niters for which the scalar cost is larger:
@@ -3810,10 +4063,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
" Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& min_profitable_iters < (assumed_vf + peel_iters_prologue))
/* We want the vectorized loop to execute at least once. */
min_profitable_iters = assumed_vf + peel_iters_prologue;
+ else if (min_profitable_iters < peel_iters_prologue)
+ /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
+ vectorized loop executes at least once. */
+ min_profitable_iters = peel_iters_prologue;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -3831,7 +4088,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (vec_outside_cost <= 0)
min_profitable_estimate = 0;
- else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* This is a repeat of the code above, but with + SOC rather
than - SOC. */
@@ -3843,7 +4100,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (outside_overhead > 0)
min_vec_niters = outside_overhead / saving_per_viter + 1;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
int threshold = (vec_inside_cost * min_vec_niters
+ vec_outside_cost
@@ -4480,7 +4737,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
= as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
- stmt_vec_info prev_phi_info;
tree vectype;
machine_mode mode;
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
@@ -4488,7 +4744,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
tree scalar_dest;
tree scalar_type;
gimple *new_phi = NULL, *phi;
- stmt_vec_info phi_info;
gimple_stmt_iterator exit_gsi;
tree new_temp = NULL_TREE, new_name, new_scalar_dest;
gimple *epilog_stmt = NULL;
@@ -4511,7 +4766,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
tree induction_index = NULL_TREE;
if (slp_node)
- group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ group_size = SLP_TREE_LANES (slp_node);
if (nested_in_vect_loop_p (loop, stmt_info))
{
@@ -4558,15 +4813,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
}
else
{
+ stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
vec_num = 1;
- ncopies = 0;
- phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
- do
- {
- ncopies++;
- phi_info = STMT_VINFO_RELATED_STMT (phi_info);
- }
- while (phi_info);
+ ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
}
/* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
@@ -4588,7 +4837,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
{
if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
{
- gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
+ gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
ccompares.safe_push
(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
@@ -4639,7 +4888,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* Create a vector phi node. */
tree new_phi_tree = make_ssa_name (cr_index_vector_type);
new_phi = create_phi_node (new_phi_tree, loop->header);
- loop_vinfo->add_stmt (new_phi);
add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
loop_preheader_edge (loop), UNKNOWN_LOCATION);
@@ -4666,9 +4914,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
new_phi_tree, indx_before_incr);
}
gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
- stmt_vec_info index_vec_info
- = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
- STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
/* Update the phi with the vec cond. */
induction_index = new_phi_tree;
@@ -4709,29 +4954,26 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
if (double_reduc)
loop = outer_loop;
exit_bb = single_exit (loop)->dest;
- prev_phi_info = NULL;
new_phis.create (slp_node ? vec_num : ncopies);
for (unsigned i = 0; i < vec_num; i++)
{
if (slp_node)
- def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
+ def = vect_get_slp_vect_def (slp_node, i);
else
- def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
+ def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
for (j = 0; j < ncopies; j++)
{
tree new_def = copy_ssa_name (def);
phi = create_phi_node (new_def, exit_bb);
- stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
if (j == 0)
new_phis.quick_push (phi);
else
{
- def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
- STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
+ def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+ new_phis.quick_push (phi);
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
- prev_phi_info = phi_info;
}
}
@@ -4802,15 +5044,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* Likewise if we couldn't use a single defuse cycle. */
else if (ncopies > 1)
{
- gcc_assert (new_phis.length () == 1);
gimple_seq stmts = NULL;
tree first_vect = PHI_RESULT (new_phis[0]);
first_vect = gimple_convert (&stmts, vectype, first_vect);
- stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
for (int k = 1; k < ncopies; ++k)
{
- next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
- tree second_vect = PHI_RESULT (next_phi_info->stmt);
+ tree second_vect = PHI_RESULT (new_phis[k]);
second_vect = gimple_convert (&stmts, vectype, second_vect);
first_vect = gimple_build (&stmts, code, vectype,
first_vect, second_vect);
@@ -5454,10 +5693,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
if (nested_in_vect_loop)
{
- stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
- STMT_VINFO_RELATED_STMT (epilog_stmt_info)
- = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
-
if (!double_reduc)
scalar_results.quick_push (new_temp);
else
@@ -5674,7 +5909,7 @@ static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info,
gimple_stmt_iterator *gsi,
- stmt_vec_info *vec_stmt, slp_tree slp_node,
+ gimple **vec_stmt, slp_tree slp_node,
gimple *reduc_def_stmt,
tree_code code, internal_fn reduc_fn,
tree ops[3], tree vectype_in,
@@ -5682,7 +5917,6 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
- stmt_vec_info new_stmt_info = NULL;
internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
int ncopies;
@@ -5716,10 +5950,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
}
else
{
- tree loop_vec_def0 = vect_get_vec_def_for_operand (loop_vinfo,
- op0, stmt_info);
- vec_oprnds0.create (1);
- vec_oprnds0.quick_push (loop_vec_def0);
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ op0, &vec_oprnds0);
scalar_dest_def_info = stmt_info;
}
@@ -5795,22 +6027,24 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
if (i == vec_num - 1)
{
gimple_set_lhs (new_stmt, scalar_dest);
- new_stmt_info = vect_finish_replace_stmt (loop_vinfo,
- scalar_dest_def_info,
- new_stmt);
+ vect_finish_replace_stmt (loop_vinfo,
+ scalar_dest_def_info,
+ new_stmt);
}
else
- new_stmt_info = vect_finish_stmt_generation (loop_vinfo,
- scalar_dest_def_info,
- new_stmt, gsi);
+ vect_finish_stmt_generation (loop_vinfo,
+ scalar_dest_def_info,
+ new_stmt, gsi);
if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+ else
+ {
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ *vec_stmt = new_stmt;
+ }
}
- if (!slp_node)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
-
return true;
}
@@ -6180,17 +6414,35 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
The last use is the reduction variable. In case of nested cycle this
assumption is not true: we use reduc_index to record the index of the
reduction variable. */
- reduc_def = PHI_RESULT (reduc_def_phi);
+ /* ??? To get at invariant/constant uses on the SLP node we have to
+ get to it here; slp_node is still the reduction PHI. */
+ slp_tree slp_for_stmt_info = NULL;
+ if (slp_node)
+ {
+ slp_for_stmt_info = slp_node_instance->root;
+ /* And then there's reduction chain with a conversion ... */
+ if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
+ slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
+ gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
+ }
+ slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
+ /* We need to skip an extra operand for COND_EXPRs with embedded
+ comparison. */
+ unsigned opno_adjust = 0;
+ if (code == COND_EXPR
+ && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
+ opno_adjust = 1;
for (i = 0; i < op_type; i++)
{
- tree op = gimple_op (stmt, i + 1);
/* The condition of COND_EXPR is checked in vectorizable_condition(). */
if (i == 0 && code == COND_EXPR)
continue;
stmt_vec_info def_stmt_info;
enum vect_def_type dt;
- if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
+ tree op;
+ if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
+ i + opno_adjust, &op, &slp_op[i], &dt, &tem,
&def_stmt_info))
{
if (dump_enabled_p ())
@@ -6571,7 +6823,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
which each SLP statement has its own initial value and in which
that value needs to be repeated for every instance of the
statement within the initial vector. */
- unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+ unsigned int group_size = SLP_TREE_LANES (slp_node);
if (!neutral_op
&& !can_duplicate_and_interleave_p (loop_vinfo, group_size,
TREE_TYPE (vectype_out)))
@@ -6724,6 +6976,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
+ if (slp_node
+ && !(!single_defuse_cycle
+ && code != DOT_PROD_EXPR
+ && code != WIDEN_SUM_EXPR
+ && code != SAD_EXPR
+ && reduction_type != FOLD_LEFT_REDUCTION))
+ for (i = 0; i < op_type; i++)
+ if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
if (slp_node)
vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
else
@@ -6754,7 +7021,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
}
- else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
internal_fn cond_fn = get_conditional_internal_fn (code);
@@ -6767,9 +7034,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because no"
- " conditional operation is available.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (reduction_type == FOLD_LEFT_REDUCTION
&& reduc_fn == IFN_LAST
@@ -6779,9 +7046,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because no"
- " conditional operation is available.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
@@ -6796,13 +7063,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
bool
vect_transform_reduction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
- stmt_vec_info *vec_stmt, slp_tree slp_node)
+ gimple **vec_stmt, slp_tree slp_node)
{
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
int i;
int ncopies;
- int j;
int vec_num;
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -6858,8 +7124,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
/* Transform. */
- stmt_vec_info new_stmt_info = NULL;
- stmt_vec_info prev_stmt_info;
tree new_temp = NULL_TREE;
auto_vec<tree> vec_oprnds0;
auto_vec<tree> vec_oprnds1;
@@ -6894,139 +7158,83 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
tree scalar_dest = gimple_assign_lhs (stmt);
tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
- prev_stmt_info = NULL;
- if (!slp_node)
+ vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
+ single_defuse_cycle && reduc_index == 0
+ ? NULL_TREE : ops[0], &vec_oprnds0,
+ single_defuse_cycle && reduc_index == 1
+ ? NULL_TREE : ops[1], &vec_oprnds1,
+ op_type == ternary_op
+ && !(single_defuse_cycle && reduc_index == 2)
+ ? ops[2] : NULL_TREE, &vec_oprnds2);
+ if (single_defuse_cycle)
{
- vec_oprnds0.create (1);
- vec_oprnds1.create (1);
- if (op_type == ternary_op)
- vec_oprnds2.create (1);
+ gcc_assert (!slp_node);
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ ops[reduc_index],
+ reduc_index == 0 ? &vec_oprnds0
+ : (reduc_index == 1 ? &vec_oprnds1
+ : &vec_oprnds2));
}
- for (j = 0; j < ncopies; j++)
+ FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
{
- /* Handle uses. */
- if (j == 0)
- {
- if (slp_node)
- {
- /* Get vec defs for all the operands except the reduction index,
- ensuring the ordering of the ops in the vector is kept. */
- auto_vec<vec<tree>, 3> vec_defs;
- vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
- vec_oprnds0.safe_splice (vec_defs[0]);
- vec_defs[0].release ();
- vec_oprnds1.safe_splice (vec_defs[1]);
- vec_defs[1].release ();
- if (op_type == ternary_op)
- {
- vec_oprnds2.safe_splice (vec_defs[2]);
- vec_defs[2].release ();
- }
- }
- else
+ gimple *new_stmt;
+ tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+ if (masked_loop_p && !mask_by_cond_expr)
+ {
+ /* Make sure that the reduction accumulator is vop[0]. */
+ if (reduc_index == 1)
{
- vec_oprnds0.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[0], stmt_info));
- vec_oprnds1.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[1], stmt_info));
- if (op_type == ternary_op)
- vec_oprnds2.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[2], stmt_info));
+ gcc_assert (commutative_tree_code (code));
+ std::swap (vop[0], vop[1]);
}
- }
+ tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+ vectype_in, i);
+ gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
+ vop[0], vop[1], vop[0]);
+ new_temp = make_ssa_name (vec_dest, call);
+ gimple_call_set_lhs (call, new_temp);
+ gimple_call_set_nothrow (call, true);
+ vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
+ new_stmt = call;
+ }
else
- {
- if (!slp_node)
- {
- gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
-
- if (single_defuse_cycle && reduc_index == 0)
- vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds0[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds0[0]);
- if (single_defuse_cycle && reduc_index == 1)
- vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds1[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds1[0]);
- if (op_type == ternary_op)
- {
- if (single_defuse_cycle && reduc_index == 2)
- vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds2[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds2[0]);
- }
- }
- }
+ {
+ if (op_type == ternary_op)
+ vop[2] = vec_oprnds2[i];
- FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
- {
- tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
- if (masked_loop_p && !mask_by_cond_expr)
+ if (masked_loop_p && mask_by_cond_expr)
{
- /* Make sure that the reduction accumulator is vop[0]. */
- if (reduc_index == 1)
- {
- gcc_assert (commutative_tree_code (code));
- std::swap (vop[0], vop[1]);
- }
tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
- vectype_in, i * ncopies + j);
- gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
- vop[0], vop[1],
- vop[0]);
- new_temp = make_ssa_name (vec_dest, call);
- gimple_call_set_lhs (call, new_temp);
- gimple_call_set_nothrow (call, true);
- new_stmt_info
- = vect_finish_stmt_generation (loop_vinfo,
- stmt_info, call, gsi);
- }
- else
- {
- if (op_type == ternary_op)
- vop[2] = vec_oprnds2[i];
-
- if (masked_loop_p && mask_by_cond_expr)
- {
- tree mask = vect_get_loop_mask (gsi, masks,
- vec_num * ncopies,
- vectype_in, i * ncopies + j);
- build_vect_cond_expr (code, vop, mask, gsi);
- }
-
- gassign *new_stmt = gimple_build_assign (vec_dest, code,
- vop[0], vop[1], vop[2]);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_assign_set_lhs (new_stmt, new_temp);
- new_stmt_info
- = vect_finish_stmt_generation (loop_vinfo,
- stmt_info, new_stmt, gsi);
+ vectype_in, i);
+ build_vect_cond_expr (code, vop, mask, gsi);
}
- if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
- }
-
- if (slp_node || single_defuse_cycle)
- continue;
+ new_stmt = gimple_build_assign (vec_dest, code,
+ vop[0], vop[1], vop[2]);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+ }
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+ if (slp_node)
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+ else if (single_defuse_cycle
+ && i < ncopies - 1)
+ {
+ if (reduc_index == 0)
+ vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
+ else if (reduc_index == 1)
+ vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
+ else if (reduc_index == 2)
+ vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
+ }
else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
-
- prev_stmt_info = new_stmt_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
- if (single_defuse_cycle && !slp_node)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
return true;
}
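For the fully-masked case above, the conditional internal-function call (cond_fn with operands mask, vop[0], vop[1], vop[0]) keeps the accumulator value in inactive lanes. An illustrative GIMPLE shape for a masked add reduction — assumed SSA names, not output from a real dump:

  vect_sum_1 = .COND_ADD (loop_mask_5, vect_sum_0, vect_data_3, vect_sum_0);

Active lanes compute vect_sum_0 + vect_data_3; inactive lanes pass vect_sum_0 through, which is what keeps the reduction correct when the final iteration operates on a partial vector.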
@@ -7035,14 +7243,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
- stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+ stmt_vec_info stmt_info, gimple **vec_stmt,
slp_tree slp_node, slp_instance slp_node_instance)
{
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
int i;
int ncopies;
- stmt_vec_info prev_phi_info;
int j;
bool nested_cycle = false;
int vec_num;
@@ -7072,9 +7279,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
if (slp_node)
{
/* The size vect_schedule_slp_instance computes is off for us. */
- vec_num = vect_get_num_vectors
- (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
+ vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ * SLP_TREE_LANES (slp_node), vectype_in);
ncopies = 1;
}
else
@@ -7134,14 +7340,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
}
vec_initial_def = build_vector_from_val (vectype_out, induc_val);
+ vec_initial_defs.create (ncopies);
+ for (i = 0; i < ncopies; ++i)
+ vec_initial_defs.quick_push (vec_initial_def);
}
else if (nested_cycle)
{
/* Do not use an adjustment def as that case is not supported
correctly if ncopies is not one. */
- vec_initial_def = vect_get_vec_def_for_operand (loop_vinfo,
- initial_def,
- reduc_stmt_info);
+ vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
+ ncopies, initial_def,
+ &vec_initial_defs);
}
else
{
@@ -7154,13 +7363,13 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
= get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
initial_def, adjustment_defp);
STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
+ vec_initial_defs.create (ncopies);
+ for (i = 0; i < ncopies; ++i)
+ vec_initial_defs.quick_push (vec_initial_def);
}
- vec_initial_defs.create (1);
- vec_initial_defs.quick_push (vec_initial_def);
}
/* Generate the reduction PHIs upfront. */
- prev_phi_info = NULL;
for (i = 0; i < vec_num; i++)
{
tree vec_init_def = vec_initial_defs[i];
@@ -7169,26 +7378,22 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
/* Create the reduction-phi that defines the reduction
operand. */
gphi *new_phi = create_phi_node (vec_dest, loop->header);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
/* Set the loop-entry arg of the reduction-phi. */
if (j != 0 && nested_cycle)
- vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_init_def);
+ vec_init_def = vec_initial_defs[j];
add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
/* The loop-latch arg is set in epilogue processing. */
if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
else
{
if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
- else
- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
- prev_phi_info = new_phi_info;
+ *vec_stmt = new_phi;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
}
}
}
@@ -7200,7 +7405,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
- stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+ stmt_vec_info stmt_info, gimple **vec_stmt,
slp_tree slp_node)
{
if (!loop_vinfo
@@ -7223,43 +7428,22 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
basic_block bb = gimple_bb (stmt_info->stmt);
edge e = single_pred_edge (bb);
tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
- vec<tree> vec_oprnds = vNULL;
- vect_get_vec_defs (loop_vinfo,
- gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
- stmt_info, &vec_oprnds, NULL, slp_node);
- if (slp_node)
- {
- unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
- gcc_assert (vec_oprnds.length () == vec_num);
- for (unsigned i = 0; i < vec_num; i++)
- {
- /* Create the vectorized LC PHI node. */
- gphi *new_phi = create_phi_node (vec_dest, bb);
- add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
- }
- }
- else
- {
- unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
- stmt_vec_info prev_phi_info = NULL;
- for (unsigned i = 0; i < ncopies; i++)
- {
- if (i != 0)
- vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
- /* Create the vectorized LC PHI node. */
- gphi *new_phi = create_phi_node (vec_dest, bb);
- add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
- if (i == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
- else
- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
- prev_phi_info = new_phi_info;
- }
+ auto_vec<tree> vec_oprnds;
+ vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+ !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+ gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+ for (unsigned i = 0; i < vec_oprnds.length (); i++)
+ {
+ /* Create the vectorized LC PHI node. */
+ gphi *new_phi = create_phi_node (vec_dest, bb);
+ add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
+ if (slp_node)
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
}
- vec_oprnds.release ();
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
return true;
}
@@ -7315,8 +7499,7 @@ vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
bool
vectorizable_induction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info,
- gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
- stmt_vec_info *vec_stmt, slp_tree slp_node,
+ gimple **vec_stmt, slp_tree slp_node,
stmt_vector_for_cost *cost_vec)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -7336,11 +7519,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
unsigned i;
tree expr;
gimple_seq stmts;
- imm_use_iterator imm_iter;
- use_operand_p use_p;
- gimple *exit_phi;
- edge latch_e;
- tree loop_arg;
gimple_stmt_iterator si;
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7448,9 +7626,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
- latch_e = loop_latch_edge (iv_loop);
- loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-
step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
gcc_assert (step_expr != NULL_TREE);
tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
@@ -7520,10 +7695,16 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_vec, step_vectype, NULL);
/* Now generate the IVs. */
- unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ unsigned group_size = SLP_TREE_LANES (slp_node);
unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
unsigned elts = const_nunits * nvects;
- unsigned nivs = least_common_multiple (group_size,
+ /* Compute the number of distinct IVs we need.  First reduce
+ group_size if it is a multiple of const_nunits so that, for
+ example, a group_size of 4 with const_nunits 2 needs only one IV.  */
+ unsigned group_sizep = group_size;
+ if (group_sizep % const_nunits == 0)
+ group_sizep = group_sizep / const_nunits;
+ unsigned nivs = least_common_multiple (group_sizep,
const_nunits) / const_nunits;
gcc_assert (elts % group_size == 0);
tree elt = init_expr;
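The distinct-IV computation above can be checked with a minimal standalone sketch (plain C++17, not GCC internals): reduce the group size when it spans whole vectors, then take the least common multiple with the vector element count.

    #include <numeric>
    #include <cassert>

    /* Number of distinct IVs needed for GROUP_SIZE lanes with
       CONST_NUNITS elements per vector, mirroring the logic above.  */
    static unsigned
    distinct_ivs (unsigned group_size, unsigned const_nunits)
    {
      unsigned group_sizep = group_size;
      if (group_sizep % const_nunits == 0)
        group_sizep /= const_nunits;
      return std::lcm (group_sizep, const_nunits) / const_nunits;
    }

    int
    main ()
    {
      assert (distinct_ivs (4, 2) == 1); /* whole-vector group: one IV */
      assert (distinct_ivs (3, 4) == 3); /* lcm (3, 4) / 4 */
      return 0;
    }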
@@ -7551,8 +7732,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* Create the induction-phi that defines the induction-operand. */
vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
induction_phi = create_phi_node (vec_dest, iv_loop->header);
- stmt_vec_info induction_phi_info
- = loop_vinfo->add_stmt (induction_phi);
induc_def = PHI_RESULT (induction_phi);
/* Create the iv update inside the loop */
@@ -7561,7 +7740,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_build (&stmts,
PLUS_EXPR, step_vectype, vec_def, vec_step);
vec_def = gimple_convert (&stmts, vectype, vec_def);
- loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
/* Set the arguments of the phi node: */
@@ -7569,8 +7747,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
- SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
}
+ /* Fill up to the number of vectors we need for the whole group. */
+ nivs = least_common_multiple (group_size,
+ const_nunits) / const_nunits;
+ for (; ivn < nivs; ++ivn)
+ SLP_TREE_VEC_STMTS (slp_node)
+ .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
/* Re-use IVs when we can. */
if (ivn < nvects)
@@ -7595,7 +7779,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
step_vectype, NULL);
for (; ivn < nvects; ++ivn)
{
- gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
+ gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
tree def;
if (gimple_code (iv) == GIMPLE_PHI)
def = gimple_phi_result (iv);
@@ -7613,8 +7797,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
}
- SLP_TREE_VEC_STMTS (slp_node).quick_push
- (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
+ SLP_TREE_VEC_STMTS (slp_node)
+ .quick_push (SSA_NAME_DEF_STMT (def));
}
}
@@ -7627,8 +7811,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* iv_loop is nested in the loop to be vectorized. init_expr had already
been created during vectorization of previous stmts. We obtain it
from the STMT_VINFO_VEC_STMT of the defining stmt. */
- vec_init = vect_get_vec_def_for_operand (loop_vinfo,
- init_expr, stmt_info);
+ auto_vec<tree> vec_inits;
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ init_expr, &vec_inits);
+ vec_init = vec_inits[0];
/* If the initial value is not of proper type, convert it. */
if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
{
@@ -7643,7 +7829,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
new_stmt);
gcc_assert (!new_bb);
- loop_vinfo->add_stmt (new_stmt);
}
}
else
@@ -7749,7 +7934,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* Create the induction-phi that defines the induction-operand. */
vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
induction_phi = create_phi_node (vec_dest, iv_loop->header);
- stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
induc_def = PHI_RESULT (induction_phi);
/* Create the iv update inside the loop */
@@ -7759,14 +7943,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_convert (&stmts, vectype, vec_def);
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
new_stmt = SSA_NAME_DEF_STMT (vec_def);
- stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
/* Set the arguments of the phi node: */
add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+ *vec_stmt = induction_phi;
/* In case that vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
@@ -7777,7 +7961,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (ncopies > 1)
{
gimple_seq seq = NULL;
- stmt_vec_info prev_stmt_vinfo;
/* FORNOW. This restriction should be relaxed. */
gcc_assert (!nested_in_vect_loop);
@@ -7805,7 +7988,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_vec, step_vectype, NULL);
vec_def = induc_def;
- prev_stmt_vinfo = induction_phi_info;
for (i = 1; i < ncopies; i++)
{
/* vec_i = vec_prev + vec_step */
@@ -7817,46 +7999,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
new_stmt = SSA_NAME_DEF_STMT (vec_def);
- new_stmt_info = loop_vinfo->add_stmt (new_stmt);
- STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
- prev_stmt_vinfo = new_stmt_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
}
- if (nested_in_vect_loop)
- {
- /* Find the loop-closed exit-phi of the induction, and record
- the final vector of induction results: */
- exit_phi = NULL;
- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
- {
- gimple *use_stmt = USE_STMT (use_p);
- if (is_gimple_debug (use_stmt))
- continue;
-
- if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
- {
- exit_phi = use_stmt;
- break;
- }
- }
- if (exit_phi)
- {
- stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
- /* FORNOW. Currently not supporting the case that an inner-loop induction
- is not used in the outer-loop (i.e. only outside the outer-loop). */
- gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
- && !STMT_VINFO_LIVE_P (stmt_vinfo));
-
- STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "vector of inductions after inner-loop:%G",
- new_stmt);
- }
- }
-
-
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"transform induction: created def-use cycle: %G%G",
@@ -7908,6 +8054,10 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
all involved stmts together. */
else if (slp_index != 0)
return true;
+ else
+ /* For SLP reductions the meta-info is attached to
+ the representative. */
+ stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
}
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
gcc_assert (reduc_info->is_reduc_info);
@@ -7945,7 +8095,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
{
gcc_assert (slp_index >= 0);
- int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ int num_scalar = SLP_TREE_LANES (slp_node);
int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
/* Get the last occurrence of the scalar index from the concatenation of
@@ -7968,33 +8118,34 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
if (!vec_stmt_p)
{
/* No transformation required. */
- if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
OPTIMIZE_FOR_SPEED))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because "
- "the target doesn't support extract last "
- "reduction.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because the target doesn't support extract "
+ "last reduction.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (slp_node)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because an "
- "SLP statement is live after the loop.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because an SLP statement is live after "
+ "the loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (ncopies > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because"
- " ncopies is greater than 1.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because ncopies is greater than 1.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
{
@@ -8014,9 +8165,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
: gimple_get_lhs (stmt);
lhs_type = TREE_TYPE (lhs);
- bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
- ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
- : TYPE_SIZE (TREE_TYPE (vectype)));
+ bitsize = vector_element_bits_tree (vectype);
vec_bitsize = TYPE_SIZE (vectype);
/* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
@@ -8026,7 +8175,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
/* Get the correct slp vectorized stmt. */
- gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
+ gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
vec_lhs = gimple_phi_result (phi);
else
@@ -8038,14 +8187,8 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
}
else
{
- enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
- vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
- gcc_checking_assert (ncopies == 1
- || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
-
/* For multiple copies, get the last copy. */
- for (int i = 1; i < ncopies; ++i)
- vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
+ vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
/* Get the last lane in the vector. */
bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
@@ -8250,7 +8393,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
gcc_assert (nvectors != 0);
if (masks->length () < nvectors)
masks->safe_grow_cleared (nvectors);
- rgroup_masks *rgm = &(*masks)[nvectors - 1];
+ rgroup_controls *rgm = &(*masks)[nvectors - 1];
/* The number of scalars per iteration and the number of vectors are
both compile-time constants. */
unsigned int nscalars_per_iter
@@ -8266,7 +8409,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
if (rgm->max_nscalars_per_iter < nscalars_per_iter)
{
rgm->max_nscalars_per_iter = nscalars_per_iter;
- rgm->mask_type = truth_type_for (vectype);
+ rgm->type = truth_type_for (vectype);
+ rgm->factor = 1;
}
}
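The rename from rgroup_masks to rgroup_controls reflects that the same bookkeeping now serves both masks and lengths. A simplified standalone model of the fields this file touches (hypothetical stand-in types; GCC's real structure stores tree nodes) could be:

    #include <vector>

    /* Hypothetical, simplified stand-in for GCC's rgroup_controls;
       the real structure stores 'tree' nodes, not ints.  */
    struct rgroup_controls_model
    {
      /* Largest number of scalars per iteration among the statements
         that use this rgroup.  */
      unsigned max_nscalars_per_iter = 0;
      /* For masks: the truth type.  For lengths: the vector type the
         length is measured against (modelled as an id here).  */
      int type = 0;
      /* For lengths: subelements per vector element (1 unless loads
         and stores fall back to a byte-element mode).  */
      unsigned factor = 0;
      /* The mask or length SSA names themselves, created lazily
         (modelled as ids here).  */
      std::vector<int> controls;
    };

    int
    main ()
    {
      rgroup_controls_model rgc;
      rgc.max_nscalars_per_iter = 2;
      rgc.factor = 1;
      return rgc.controls.size (); /* empty until lazily populated */
    }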
@@ -8281,24 +8425,24 @@ tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
unsigned int nvectors, tree vectype, unsigned int index)
{
- rgroup_masks *rgm = &(*masks)[nvectors - 1];
- tree mask_type = rgm->mask_type;
+ rgroup_controls *rgm = &(*masks)[nvectors - 1];
+ tree mask_type = rgm->type;
/* Populate the rgroup's mask array, if this is the first time we've
used it. */
- if (rgm->masks.is_empty ())
+ if (rgm->controls.is_empty ())
{
- rgm->masks.safe_grow_cleared (nvectors);
+ rgm->controls.safe_grow_cleared (nvectors);
for (unsigned int i = 0; i < nvectors; ++i)
{
tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
/* Provide a dummy definition until the real one is available. */
SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
- rgm->masks[i] = mask;
+ rgm->controls[i] = mask;
}
}
- tree mask = rgm->masks[index];
+ tree mask = rgm->controls[index];
if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
TYPE_VECTOR_SUBPARTS (vectype)))
{
@@ -8319,6 +8463,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
return mask;
}
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+ lengths for controlling an operation on VECTYPE. The operation splits
+ each element of VECTYPE into FACTOR separate subelements, measuring the
+ length as a number of these subelements. */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, tree vectype, unsigned int factor)
+{
+ gcc_assert (nvectors != 0);
+ if (lens->length () < nvectors)
+ lens->safe_grow_cleared (nvectors);
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* The number of scalars per iteration, the bytes each scalar
+ occupies and the number of vectors are all compile-time
+ constants.  */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+ if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ /* For now, we only support cases in which all loads and stores fall back
+ to VnQI or none do. */
+ gcc_assert (!rgl->max_nscalars_per_iter
+ || (rgl->factor == 1 && factor == 1)
+ || (rgl->max_nscalars_per_iter * rgl->factor
+ == nscalars_per_iter * factor));
+ rgl->max_nscalars_per_iter = nscalars_per_iter;
+ rgl->type = vectype;
+ rgl->factor = factor;
+ }
+}
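The nscalars_per_iter calculation is the same exact division already used for masks: the total number of vector elements in the rgroup divided by the vectorization factor. A quick standalone check (constant values only; GCC works with poly_ints and exact_div):

    #include <cassert>

    static unsigned
    nscalars_per_iter (unsigned nvectors, unsigned nunits, unsigned vf)
    {
      /* GCC enforces exact divisibility via exact_div.  */
      assert ((nvectors * nunits) % vf == 0);
      return nvectors * nunits / vf;
    }

    int
    main ()
    {
      /* Two 4-element vectors per iteration with VF 4: two scalars
         per scalar iteration.  */
      assert (nscalars_per_iter (2, 4, 4) == 2);
      return 0;
    }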
+
+/* Given a complete set of lengths LENS, extract length number INDEX for an
+ rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, unsigned int index)
+{
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* Populate the rgroup's len array, if this is the first time we've
+ used it. */
+ if (rgl->controls.is_empty ())
+ {
+ rgl->controls.safe_grow_cleared (nvectors);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ gcc_assert (len_type != NULL_TREE);
+ tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+ rgl->controls[i] = len;
+ }
+ }
+
+ return rgl->controls[index];
+}
+
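vect_get_loop_len mirrors vect_get_loop_mask's lazy-population pattern: placeholder SSA names are handed out on first use and given their real definitions later, when the loop control code is generated. A standalone sketch of that pattern (hypothetical names, strings standing in for SSA names):

    #include <string>
    #include <vector>

    /* Hand out stable placeholder names, creating the whole level on
       first access; definitions are filled in by a later phase.  */
    static std::string
    get_control (std::vector<std::string> &controls, unsigned nvectors,
                 unsigned index, const char *prefix)
    {
      if (controls.empty ())
        for (unsigned i = 0; i < nvectors; ++i)
          controls.push_back (std::string (prefix) + "_"
                              + std::to_string (i));
      return controls[index];
    }

    int
    main ()
    {
      std::vector<std::string> lens;
      return get_control (lens, 2, 1, "loop_len") == "loop_len_1" ? 0 : 1;
    }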
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */
@@ -8482,6 +8689,8 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
!gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
{
new_stmt = gsi_stmt (epilogue_gsi);
+ if (is_gimple_debug (new_stmt))
+ continue;
gcc_assert (gimple_uid (new_stmt) > 0);
stmt_vinfo
@@ -8546,7 +8755,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
}
struct data_reference *dr;
- vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
+ vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
orig_stmt = DR_STMT (dr);
@@ -8678,7 +8887,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
if (niters_vector == NULL_TREE)
{
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& known_eq (lowest_vf, vf))
{
niters_vector
@@ -8686,9 +8895,15 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
step_vector = build_one_cst (TREE_TYPE (niters));
}
- else
+ else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
&step_vector, niters_no_overflow);
+ else
+ /* vect_do_peeling subtracted the number of peeled prologue
+ iterations from LOOP_VINFO_NITERS. */
+ vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
+ &niters_vector, &step_vector,
+ niters_no_overflow);
}
/* 1) Make sure the loop header has exactly two entries
@@ -8771,6 +8986,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
}
else
{
+ /* Ignore vector stmts created in the outer loop. */
stmt_info = loop_vinfo->lookup_stmt (stmt);
/* vector stmts created in the outer-loop during vectorization of
@@ -8846,7 +9062,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
/* True if the final iteration might not handle a full vector's
worth of scalar iterations. */
- bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+ bool final_iter_may_be_partial
+ = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
/* The minimum number of iterations performed by the epilogue. This
is 1 when peeling for gaps because we always need a final scalar
iteration. */
@@ -8857,7 +9074,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
int bias_for_lowest = 1 - min_epilogue_iters;
int bias_for_assumed = bias_for_lowest;
int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* When the amount of peeling is known at compile time, the first
iteration will have exactly alignment_npeels active elements.
@@ -9149,12 +9366,13 @@ optimize_mask_stores (class loop *loop)
}
/* Decide whether it is possible to use a zero-based induction variable
- when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
- return the value that the induction variable must be able to hold
- in order to ensure that the loop ends with an all-false mask.
+ when vectorizing LOOP_VINFO with partial vectors. If it is, return
+ the value that the induction variable must be able to hold in order
+ to ensure that the rgroups eventually have no active vector elements.
Return -1 otherwise. */
+
widest_int
-vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -9189,3 +9407,25 @@ vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
return iv_limit;
}
+/* For the given rgroup_controls RGC, check whether an induction variable
+ would ever hit a value that produces a set of all-false masks or zero
+ lengths before wrapping around.  Return true if it's possible to wrap
+ around before hitting the desired value, otherwise return false.  */
+
+bool
+vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
+{
+ widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+
+ if (iv_limit == -1)
+ return true;
+
+ tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ unsigned int compare_precision = TYPE_PRECISION (compare_type);
+ unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+
+ if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
+ return true;
+
+ return false;
+}
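The wrap check reduces to bit arithmetic: the IV might wrap if representing iv_limit * nitems takes more bits than the compare type provides. A standalone sketch (64-bit math in place of GCC's wide ints):

    #include <cassert>
    #include <cstdint>

    /* Bits needed to represent VALUE as an unsigned number,
       like wi::min_precision (..., UNSIGNED).  */
    static unsigned
    min_precision_u (uint64_t value)
    {
      unsigned bits = 0;
      for (; value; value >>= 1)
        ++bits;
      return bits;
    }

    static bool
    rgroup_iv_might_wrap_p (uint64_t iv_limit, unsigned nitems,
                            unsigned compare_precision)
    {
      return min_precision_u (iv_limit * nitems) > compare_precision;
    }

    int
    main ()
    {
      /* 2^31 iterations times 4 items needs 34 bits, so a 32-bit
         compare type could wrap.  */
      assert (rgroup_iv_might_wrap_p (uint64_t (1) << 31, 4, 32));
      assert (!rgroup_iv_might_wrap_p (1000, 4, 32));
      return 0;
    }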