Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 1201 |
1 file changed, 678 insertions, 523 deletions
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 3e973e7..c9dcc64 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_ALGORITHM #include "config.h" #include "system.h" #include "coretypes.h" @@ -813,7 +814,7 @@ bb_in_loop_p (const_basic_block bb, const void *data) stmt_vec_info structs for all the stmts in LOOP_IN. */ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) - : vec_info (vec_info::loop, init_cost (loop_in), shared), + : vec_info (vec_info::loop, init_cost (loop_in, false), shared), loop (loop_in), bbs (XCNEWVEC (basic_block, loop->num_nodes)), num_itersm1 (NULL_TREE), @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) th (0), versioning_threshold (0), vectorization_factor (0), + main_loop_edge (nullptr), + skip_main_loop_edge (nullptr), + skip_this_loop_edge (nullptr), + reusable_accumulators (), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -836,6 +841,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) single_scalar_iteration_cost (0), vec_outside_cost (0), vec_inside_cost (0), + inner_loop_cost_factor (param_vect_inner_loop_cost_factor), vectorizable (false), can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), using_partial_vectors_p (false), @@ -1237,7 +1243,7 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) /* FORNOW. */ innerloop_iters = 1; if (loop->inner) - innerloop_iters = 50; /* FIXME */ + innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); for (i = 0; i < nbbs; i++) { @@ -1278,24 +1284,28 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) else kind = scalar_stmt; + /* We are using vect_prologue here to avoid scaling twice + by the inner loop factor. */ record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), factor, kind, stmt_info, 0, vect_prologue); } } /* Now accumulate cost. */ - void *target_cost_data = init_cost (loop); + void *target_cost_data = init_cost (loop, true); stmt_info_for_cost *si; int j; FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count, si->kind, si->stmt_info, si->vectype, - si->misalign, vect_body); - unsigned dummy, body_cost = 0; - finish_cost (target_cost_data, &dummy, &body_cost, &dummy); + si->misalign, si->where); + unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0; + finish_cost (target_cost_data, &prologue_cost, &body_cost, + &epilogue_cost); destroy_cost_data (target_cost_data); - LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost; + LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) + = prologue_cost + body_cost + epilogue_cost; } @@ -1509,6 +1519,13 @@ vect_analyze_loop_form (class loop *loop, vec_info_shared *shared) stmt_vec_info inner_loop_cond_info = loop_vinfo->lookup_stmt (inner_loop_cond); STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; + /* If we have an estimate on the number of iterations of the inner + loop use that to limit the scale for costing, otherwise use + --param vect-inner-loop-cost-factor literally. 
*/ + widest_int nit; + if (estimated_stmt_executions (loop->inner, &nit)) + LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo) + = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi (); } gcc_assert (!loop->aux); @@ -2723,7 +2740,7 @@ again: /* Reset target cost data. */ destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) - = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); + = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false); /* Reset accumulated rgroup information. */ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo)); release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); @@ -2766,7 +2783,15 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, /* Limit the VFs to what is likely to be the maximum number of iterations, to handle cases in which at least one loop_vinfo is fully-masked. */ - HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_max_niter; + loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo); + unsigned HOST_WIDE_INT main_vf; + if (main_loop + && LOOP_VINFO_NITERS_KNOWN_P (main_loop) + && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf)) + estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf; + else + estimated_max_niter = likely_max_stmt_executions_int (loop); if (estimated_max_niter != -1) { if (known_le (estimated_max_niter, new_vf)) @@ -3058,7 +3083,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) = opt_loop_vec_info::success (main_loop_vinfo); } else - delete main_loop_vinfo; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** No longer preferring vector" + " mode %s after reanalyzing the loop" + " as a main loop\n", + GET_MODE_NAME + (main_loop_vinfo->vector_mode)); + delete main_loop_vinfo; + } } } @@ -3208,7 +3242,7 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn) Return FALSE if CODE currently cannot be vectorized as reduction. */ -static bool +bool reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) { switch (code) @@ -3247,23 +3281,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) } } -/* If there is a neutral value X such that SLP reduction NODE would not - be affected by the introduction of additional X elements, return that X, - otherwise return null. CODE is the code of the reduction and VECTOR_TYPE - is the vector type that would hold element X. REDUC_CHAIN is true if - the SLP statements perform a single reduction, false if each statement - performs an independent reduction. */ +/* If there is a neutral value X such that a reduction would not be affected + by the introduction of additional X elements, return that X, otherwise + return null. CODE is the code of the reduction and SCALAR_TYPE is type + of the scalar elements. If the reduction has just a single initial value + then INITIAL_VALUE is that value, otherwise it is null. 
*/ static tree -neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, - tree_code code, bool reduc_chain) +neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; - tree scalar_type = TREE_TYPE (vector_type); - class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; - gcc_assert (loop); - switch (code) { case WIDEN_SUM_EXPR: @@ -3283,13 +3309,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, case MAX_EXPR: case MIN_EXPR: - /* For MIN/MAX the initial values are neutral. A reduction chain - has only a single initial value, so that value is neutral for - all statements. */ - if (reduc_chain) - return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, - loop_preheader_edge (loop)); - return NULL_TREE; + return initial_value; default: return NULL_TREE; @@ -4465,7 +4485,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, vect_reduction_type reduction_type, int ncopies, stmt_vector_for_cost *cost_vec) { - int prologue_cost = 0, epilogue_cost = 0, inside_cost; + int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0; enum tree_code code; optab optab; tree vectype; @@ -4620,64 +4640,58 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, prologue_cost, epilogue_cost); } +/* SEQ is a sequence of instructions that initialize the reduction + described by REDUC_INFO. Emit them in the appropriate place. */ +static void +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, gimple *seq) +{ + if (reduc_info->reused_accumulator) + { + /* When reusing an accumulator from the main loop, we only need + initialization instructions if the main loop can be skipped. + In that case, emit the initialization instructions at the end + of the guard block that does the skip. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + gcc_assert (skip_edge); + gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src); + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); + } + else + { + /* The normal case: emit the initialization instructions on the + preheader edge. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); + } +} /* Function get_initial_def_for_reduction Input: - STMT_VINFO - a stmt that performs a reduction operation in the loop. + REDUC_INFO - the info_for_reduction INIT_VAL - the initial value of the reduction variable + NEUTRAL_OP - a value that has no effect on the reduction, as per + neutral_op_for_reduction Output: - ADJUSTMENT_DEF - a tree that holds a value to be added to the final result - of the reduction (used for adjusting the epilog - see below). Return a vector variable, initialized according to the operation that STMT_VINFO performs. This vector will be used as the initial value of the vector of partial results. - Option1 (adjust in epilog): Initialize the vector as follows: - add/bit or/xor: [0,0,...,0,0] - mult/bit and: [1,1,...,1,1] - min/max/cond_expr: [init_val,init_val,..,init_val,init_val] - and when necessary (e.g. add/mult case) let the caller know - that it needs to adjust the result by init_val. - - Option2: Initialize the vector as follows: - add/bit or/xor: [init_val,0,0,...,0] - mult/bit and: [init_val,1,1,...,1] - min/max/cond_expr: [init_val,init_val,...,init_val] - and no adjustments are needed. 
- - For example, for the following code: - - s = init_val; - for (i=0;i<n;i++) - s = s + a[i]; - - STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'. - For a vector of 4 units, we want to return either [0,0,0,init_val], - or [0,0,0,0] and let the caller know that it needs to adjust - the result at the end by 'init_val'. - - FORNOW, we are using the 'adjust in epilog' scheme, because this way the - initialization vector is simpler (same element in all entries), if - ADJUSTMENT_DEF is not NULL, and Option2 otherwise. - - A cost model should help decide between these two schemes. */ + The value we need is a vector in which element 0 has value INIT_VAL + and every other element has value NEUTRAL_OP. */ static tree get_initial_def_for_reduction (loop_vec_info loop_vinfo, - stmt_vec_info stmt_vinfo, - enum tree_code code, tree init_val, - tree *adjustment_def) + stmt_vec_info reduc_info, + tree init_val, tree neutral_op) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree scalar_type = TREE_TYPE (init_val); tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); - tree def_for_init; tree init_def; - REAL_VALUE_TYPE real_init_val = dconst0; - int int_init_val = 0; gimple_seq stmts = NULL; gcc_assert (vectype); @@ -4685,115 +4699,64 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo, gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) || SCALAR_FLOAT_TYPE_P (scalar_type)); - gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) - || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); + gcc_assert (nested_in_vect_loop_p (loop, reduc_info) + || loop == (gimple_bb (reduc_info->stmt))->loop_father); - /* ADJUSTMENT_DEF is NULL when called from - vect_create_epilog_for_reduction to vectorize double reduction. */ - if (adjustment_def) - *adjustment_def = NULL; - - switch (code) + if (operand_equal_p (init_val, neutral_op)) { - case WIDEN_SUM_EXPR: - case DOT_PROD_EXPR: - case SAD_EXPR: - case PLUS_EXPR: - case MINUS_EXPR: - case BIT_IOR_EXPR: - case BIT_XOR_EXPR: - case MULT_EXPR: - case BIT_AND_EXPR: - { - if (code == MULT_EXPR) - { - real_init_val = dconst1; - int_init_val = 1; - } - - if (code == BIT_AND_EXPR) - int_init_val = -1; - - if (SCALAR_FLOAT_TYPE_P (scalar_type)) - def_for_init = build_real (scalar_type, real_init_val); - else - def_for_init = build_int_cst (scalar_type, int_init_val); - - if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) - { - /* Option1: the first element is '0' or '1' as well. */ - if (!operand_equal_p (def_for_init, init_val, 0)) - *adjustment_def = init_val; - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - } - else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) - { - /* Option2 (variable length): the first element is INIT_VAL. */ - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, - vectype, init_def, init_val); - } - else - { - /* Option2: the first element is INIT_VAL. */ - tree_vector_builder elts (vectype, 1, 2); - elts.quick_push (init_val); - elts.quick_push (def_for_init); - init_def = gimple_build_vector (&stmts, &elts); - } - } - break; - - case MIN_EXPR: - case MAX_EXPR: - case COND_EXPR: - { - init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); - init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); - } - break; - - default: - gcc_unreachable (); + /* If both elements are equal then the vector described above is + just a splat. 
*/ + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op); + } + else + { + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) + { + /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into + element 0. */ + init_def = gimple_build_vector_from_val (&stmts, vectype, + neutral_op); + init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, + vectype, init_def, init_val); + } + else + { + /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */ + tree_vector_builder elts (vectype, 1, 2); + elts.quick_push (init_val); + elts.quick_push (neutral_op); + init_def = gimple_build_vector (&stmts, &elts); + } } if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts); return init_def; } -/* Get at the initial defs for the reduction PHIs in SLP_NODE. - NUMBER_OF_VECTORS is the number of vector defs to create. - If NEUTRAL_OP is nonnull, introducing extra elements of that - value will not change the result. */ +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, + which performs a reduction involving GROUP_SIZE scalar statements. + NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP + is nonnull, introducing extra elements of that value will not change the + result. */ static void -get_initial_defs_for_reduction (vec_info *vinfo, - slp_tree slp_node, +get_initial_defs_for_reduction (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, vec<tree> *vec_oprnds, unsigned int number_of_vectors, - bool reduc_chain, tree neutral_op) + unsigned int group_size, tree neutral_op) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; + vec<tree> &initial_values = reduc_info->reduc_initial_values; unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; - tree vector_type; - unsigned int group_size = stmts.length (); + tree vector_type = STMT_VINFO_VECTYPE (reduc_info); unsigned int i; - class loop *loop; - - vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); - - gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); - loop = (gimple_bb (stmt_vinfo->stmt))->loop_father; - gcc_assert (loop); - edge pe = loop_preheader_edge (loop); - - gcc_assert (!reduc_chain || neutral_op); + gcc_assert (group_size == initial_values.length () || neutral_op); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. @@ -4823,18 +4786,13 @@ get_initial_defs_for_reduction (vec_info *vinfo, { tree op; i = j % group_size; - stmt_vinfo = stmts[i]; /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ - if (reduc_chain) - op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); - else if (((vec_oprnds->length () + 1) * nunits - - number_of_places_left_in_vector >= group_size) - && neutral_op) + if (i >= initial_values.length () || (j > i && neutral_op)) op = neutral_op; else - op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); + op = initial_values[i]; /* Create 'vect_ = {op0,op1,...,opn}'. 
*/ number_of_places_left_in_vector--; @@ -4870,8 +4828,8 @@ get_initial_defs_for_reduction (vec_info *vinfo, { /* First time round, duplicate ELTS to fill the required number of vectors. */ - duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, - number_of_vectors, *vec_oprnds); + duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, + elts, number_of_vectors, *vec_oprnds); break; } vec_oprnds->quick_push (init); @@ -4883,7 +4841,7 @@ get_initial_defs_for_reduction (vec_info *vinfo, } } if (ctor_seq != NULL) - gsi_insert_seq_on_edge_immediate (pe, ctor_seq); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq); } /* For a statement STMT_INFO taking part in a reduction operation return @@ -4905,15 +4863,200 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) } else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) { - edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); - stmt_vec_info info - = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); + stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) stmt_info = info; } return stmt_info; } +/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that + REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise + return false. */ + +static bool +vect_find_reusable_accumulator (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info) +{ + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + if (!main_loop_vinfo) + return false; + + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) + return false; + + unsigned int num_phis = reduc_info->reduc_initial_values.length (); + auto_vec<tree, 16> main_loop_results (num_phis); + auto_vec<tree, 16> initial_values (num_phis); + if (edge main_loop_edge = loop_vinfo->main_loop_edge) + { + /* The epilogue loop can be entered either from the main loop or + from an earlier guard block. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + for (tree incoming_value : reduc_info->reduc_initial_values) + { + /* Look for: + + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), + INITIAL_VALUE(guard block)>. */ + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); + + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); + gcc_assert (gimple_bb (phi) == main_loop_edge->dest); + + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); + + main_loop_results.quick_push (from_main_loop); + initial_values.quick_push (from_skip); + } + } + else + /* The main loop dominates the epilogue loop. */ + main_loop_results.splice (reduc_info->reduc_initial_values); + + /* See if the main loop has the kind of accumulator we need. */ + vect_reusable_accumulator *accumulator + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); + if (!accumulator + || num_phis != accumulator->reduc_info->reduc_scalar_results.length () + || !std::equal (main_loop_results.begin (), main_loop_results.end (), + accumulator->reduc_info->reduc_scalar_results.begin ())) + return false; + + /* Handle the case where we can reduce wider vectors to narrower ones. 
*/ + tree vectype = STMT_VINFO_VECTYPE (reduc_info); + tree old_vectype = TREE_TYPE (accumulator->reduc_input); + if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype), + TYPE_VECTOR_SUBPARTS (vectype))) + return false; + + /* Non-SLP reductions might apply an adjustment after the reduction + operation, in order to simplify the initialization of the accumulator. + If the epilogue loop carries on from where the main loop left off, + it should apply the same adjustment to the final reduction result. + + If the epilogue loop can also be entered directly (rather than via + the main loop), we need to be able to handle that case in the same way, + with the same adjustment. (In principle we could add a PHI node + to select the correct adjustment, but in practice that shouldn't be + necessary.) */ + tree main_adjustment + = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); + if (loop_vinfo->main_loop_edge && main_adjustment) + { + gcc_assert (num_phis == 1); + tree initial_value = initial_values[0]; + /* Check that we can use INITIAL_VALUE as the adjustment and + initialize the accumulator with a neutral value instead. */ + if (!operand_equal_p (initial_value, main_adjustment)) + return false; + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), + code, initial_value); + } + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; + reduc_info->reduc_initial_values.truncate (0); + reduc_info->reduc_initial_values.splice (initial_values); + reduc_info->reused_accumulator = accumulator; + return true; +} + +/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation + CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */ + +static tree +vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code, + gimple_seq *seq) +{ + unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant (); + unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + tree stype = TREE_TYPE (vectype); + tree new_temp = vec_def; + while (nunits > nunits1) + { + nunits /= 2; + tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), + stype, nunits); + unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); + + /* The target has to make sure we support lowpart/highpart + extraction, either via direct vector extract or through + an integer mode punning. */ + tree dst1, dst2; + gimple *epilog_stmt; + if (convert_optab_handler (vec_extract_optab, + TYPE_MODE (TREE_TYPE (new_temp)), + TYPE_MODE (vectype1)) + != CODE_FOR_nothing) + { + /* Extract sub-vectors directly once vec_extract becomes + a conversion optab. */ + dst1 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst1, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (0))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst2 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst2, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (bitsize))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + else + { + /* Extract via punning to appropriately sized integer mode + vector. 
*/ + tree eltype = build_nonstandard_integer_type (bitsize, 1); + tree etype = build_vector_type (eltype, 2); + gcc_assert (convert_optab_handler (vec_extract_optab, + TYPE_MODE (etype), + TYPE_MODE (eltype)) + != CODE_FOR_nothing); + tree tem = make_ssa_name (etype); + epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + etype, new_temp)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + new_temp = tem; + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (0))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst1 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (bitsize))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst2 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + + new_temp = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + + return new_temp; +} + /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -5004,15 +5147,18 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; gimple *use_stmt; - bool nested_in_vect_loop = false; - auto_vec<gimple *> new_phis; + auto_vec<tree> reduc_inputs; int j, i; - auto_vec<tree> scalar_results; + vec<tree> &scalar_results = reduc_info->reduc_scalar_results; unsigned int group_size = 1, k; auto_vec<gimple *> phis; - bool slp_reduc = false; + /* SLP reduction without reduction chain, e.g., + # a1 = phi <a2, a0> + # b1 = phi <b2, b0> + a2 = operation (a1) + b2 = operation (b1) */ + bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); bool direct_slp_reduc; - tree new_phi_result; tree induction_index = NULL_TREE; if (slp_node) @@ -5022,38 +5168,39 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { outer_loop = loop; loop = loop->inner; - nested_in_vect_loop = true; - gcc_assert (!slp_node); + gcc_assert (!slp_node && double_reduc); } - gcc_assert (!nested_in_vect_loop || double_reduc); vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); gcc_assert (vectype); mode = TYPE_MODE (vectype); - tree initial_def = NULL; tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) ; else { - /* Get at the scalar def before the loop, that defines the initial value - of the reduction variable. */ - initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, - loop_preheader_edge (loop)); /* Optimize: for induction condition reduction, if we can't use zero for induc_val, use initial_def. 
*/ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); else if (double_reduc) ; - else if (nested_in_vect_loop) - ; else adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); } + stmt_vec_info single_live_out_stmt[] = { stmt_info }; + array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; + if (slp_reduc) + /* All statements produce live-out values. */ + live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); + else if (slp_node) + /* The last statement in the reduction chain produces the live-out + value. */ + single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; + unsigned vec_num; int ncopies; if (slp_node) @@ -5204,31 +5351,28 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (double_reduc) loop = outer_loop; exit_bb = single_exit (loop)->dest; - new_phis.create (slp_node ? vec_num : ncopies); + exit_gsi = gsi_after_labels (exit_bb); + reduc_inputs.create (slp_node ? vec_num : ncopies); for (unsigned i = 0; i < vec_num; i++) { + gimple_seq stmts = NULL; if (slp_node) def = vect_get_slp_vect_def (slp_node, i); else def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); for (j = 0; j < ncopies; j++) - { + { tree new_def = copy_ssa_name (def); - phi = create_phi_node (new_def, exit_bb); - if (j == 0) - new_phis.quick_push (phi); - else - { - def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); - new_phis.quick_push (phi); - } - - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - } + phi = create_phi_node (new_def, exit_bb); + if (j) + def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + new_def = gimple_convert (&stmts, vectype, new_def); + reduc_inputs.quick_push (new_def); + } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); } - exit_gsi = gsi_after_labels (exit_bb); - /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 (i.e. when reduc_fn is not available) and in the final adjustment code (if needed). Also get the original scalar reduction variable as @@ -5246,19 +5390,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); } - scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); + scalar_dest = gimple_get_lhs (orig_stmt_info->stmt); scalar_type = TREE_TYPE (scalar_dest); scalar_results.create (group_size); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* SLP reduction without reduction chain, e.g., - # a1 = phi <a2, a0> - # b1 = phi <b2, b0> - a2 = operation (a1) - b2 = operation (b1) */ - slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); - /* True if we should implement SLP_REDUC using native reduction operations instead of scalar operations. */ direct_slp_reduc = (reduc_fn != IFN_LAST @@ -5270,52 +5407,60 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, a2 = operation (a1) a3 = operation (a2), - we may end up with more than one vector result. Here we reduce them to - one vector. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) + we may end up with more than one vector result. Here we reduce them + to one vector. + + The same is true if we couldn't use a single defuse cycle. 
*/ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) + || direct_slp_reduc + || ncopies > 1) { gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (k = 1; k < new_phis.length (); k++) - { - gimple *next_phi = new_phis[k]; - tree second_vect = PHI_RESULT (next_phi); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } + tree single_input = reduc_inputs[0]; + for (k = 1; k < reduc_inputs.length (); k++) + single_input = gimple_build (&stmts, code, vectype, + single_input, reduc_inputs[k]); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + reduc_inputs.truncate (0); + reduc_inputs.safe_push (single_input); } - /* Likewise if we couldn't use a single defuse cycle. */ - else if (ncopies > 1) + + tree orig_reduc_input = reduc_inputs[0]; + + /* If this loop is an epilogue loop that can be skipped after the + main loop, we can only share a reduction operation between the + main loop and the epilogue if we put it at the target of the + skip edge. + + We can still reuse accumulators if this check fails. Doing so has + the minor(?) benefit of making the epilogue loop's scalar result + independent of the main loop's scalar result. */ + bool unify_with_main_loop_p = false; + if (reduc_info->reused_accumulator + && loop_vinfo->skip_this_loop_edge + && single_succ_p (exit_bb) + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) { - gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (int k = 1; k < ncopies; ++k) - { - tree second_vect = PHI_RESULT (new_phis[k]); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + unify_with_main_loop_p = true; + + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; + reduc_inputs[0] = make_ssa_name (vectype); + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), + UNKNOWN_LOCATION); + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); + exit_gsi = gsi_after_labels (reduc_block); } - else - new_phi_result = PHI_RESULT (new_phis[0]); + + /* Shouldn't be used beyond this point. */ + exit_bb = nullptr; if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { - /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing + /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing various data values where the condition matched and another vector (INDUCTION_INDEX) containing all the indexes of those matches. We need to extract the last matching index (which will be the index with @@ -5345,10 +5490,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Vector of {0, 0, 0,...}. 
*/ tree zero_vec = build_zero_cst (vectype); - gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - /* Find maximum value from the vector of found indexes. */ tree max_index = make_ssa_name (index_scalar_type); gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, @@ -5366,7 +5507,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes with the vector (INDUCTION_INDEX) of found indexes, choosing values - from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) + from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) otherwise. Only one value should match, resulting in a vector (VEC_COND) with one data value and the rest zeros. In the case where the loop never made any matches, every index will @@ -5385,7 +5526,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, zero. */ tree vec_cond = make_ssa_name (vectype); gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, - vec_compare, new_phi_result, + vec_compare, + reduc_inputs[0], zero_vec); gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); @@ -5415,7 +5557,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Convert the reduced value back to the result type and set as the result. */ - stmts = NULL; + gimple_seq stmts = NULL; new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, data_reduc); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5433,7 +5575,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, val = data_reduc[i], idx_val = induction_index[i]; return val; */ - tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); + tree data_eltype = TREE_TYPE (vectype); tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); @@ -5457,7 +5599,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, build3 (BIT_FIELD_REF, data_eltype, - new_phi_result, + reduc_inputs[0], bitsize_int (el_size), bitsize_int (off))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); @@ -5509,10 +5651,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, "Reduce using direct vector reduction.\n"); gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); + vec_elem_type = TREE_TYPE (vectype); new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), - vec_elem_type, new_phi_result); + vec_elem_type, reduc_inputs[0]); new_temp = gimple_convert (&stmts, scalar_type, new_temp); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5525,6 +5666,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5542,12 +5684,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, neutral value. We can then do a normal reduction on each vector. */ /* Enforced by vectorizable_reduction. 
*/ - gcc_assert (new_phis.length () == 1); + gcc_assert (reduc_inputs.length () == 1); gcc_assert (pow2p_hwi (group_size)); - slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; - vec<stmt_vec_info> orig_phis - = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); gimple_seq seq = NULL; /* Build a vector {0, 1, 2, ...}, with the same number of elements @@ -5570,10 +5709,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree neutral_op = NULL_TREE; if (slp_node) { - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); - neutral_op - = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, - vectype, code, first != NULL); + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + initial_value = reduc_info->reduc_initial_values[0]; + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, + initial_value); } if (neutral_op) vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5585,9 +5725,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for MIN and MAX reduction, for example. */ if (!neutral_op) { - tree scalar_value - = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, - loop_preheader_edge (loop)); + tree scalar_value = reduc_info->reduc_initial_values[i]; scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), scalar_value); vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5598,7 +5736,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, sel[j] = (index[j] == i); - which selects the elements of NEW_PHI_RESULT that should + which selects the elements of REDUC_INPUTS[0] that should be included in the result. */ tree compare_val = build_int_cst (index_elt_type, i); compare_val = build_vector_from_val (index_type, compare_val); @@ -5607,11 +5745,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Calculate the equivalent of: - vec = seq ? new_phi_result : vector_identity; + vec = seq ? reduc_inputs[0] : vector_identity; VEC is now suitable for a full vector reduction. */ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, - sel, new_phi_result, vector_identity); + sel, reduc_inputs[0], vector_identity); /* Do the reduction and convert it to the appropriate type. */ tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), @@ -5626,7 +5764,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, bool reduce_with_shift; tree vec_temp; - gcc_assert (slp_reduc || new_phis.length () == 1); + gcc_assert (slp_reduc || reduc_inputs.length () == 1); /* See if the target wants to do the final (shift) reduction in a vector mode of smaller size and first reduce upper/lower @@ -5636,7 +5774,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); unsigned nunits1 = nunits; if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode - && new_phis.length () == 1) + && reduc_inputs.length () == 1) { nunits1 = GET_MODE_NUNITS (mode1).to_constant (); /* For SLP reductions we have to make sure lanes match up, but @@ -5668,87 +5806,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* First reduce the vector to the desired vector size we should do shift reduction on by combining upper and lower halves. 
*/ - new_temp = new_phi_result; - while (nunits > nunits1) - { - nunits /= 2; - vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), - stype, nunits); - unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); - - /* The target has to make sure we support lowpart/highpart - extraction, either via direct vector extract or through - an integer mode punning. */ - tree dst1, dst2; - if (convert_optab_handler (vec_extract_optab, - TYPE_MODE (TREE_TYPE (new_temp)), - TYPE_MODE (vectype1)) - != CODE_FOR_nothing) - { - /* Extract sub-vectors directly once vec_extract becomes - a conversion optab. */ - dst1 = make_ssa_name (vectype1); - epilog_stmt - = gimple_build_assign (dst1, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, vectype1, - new_temp, TYPE_SIZE (vectype1), - bitsize_int (0))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst2 = make_ssa_name (vectype1); - epilog_stmt - = gimple_build_assign (dst2, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, vectype1, - new_temp, TYPE_SIZE (vectype1), - bitsize_int (bitsize))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } - else - { - /* Extract via punning to appropriately sized integer mode - vector. */ - tree eltype = build_nonstandard_integer_type (bitsize, 1); - tree etype = build_vector_type (eltype, 2); - gcc_assert (convert_optab_handler (vec_extract_optab, - TYPE_MODE (etype), - TYPE_MODE (eltype)) - != CODE_FOR_nothing); - tree tem = make_ssa_name (etype); - epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - etype, new_temp)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_temp = tem; - tem = make_ssa_name (eltype); - epilog_stmt - = gimple_build_assign (tem, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, eltype, - new_temp, TYPE_SIZE (eltype), - bitsize_int (0))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst1 = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - vectype1, tem)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - tem = make_ssa_name (eltype); - epilog_stmt - = gimple_build_assign (tem, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, eltype, - new_temp, TYPE_SIZE (eltype), - bitsize_int (bitsize))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst2 = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - vectype1, tem)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } - - new_temp = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_phis[0] = epilog_stmt; - } + gimple_seq stmts = NULL; + new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1, + code, &stmts); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + reduc_inputs[0] = new_temp; if (reduce_with_shift && !slp_reduc) { @@ -5828,13 +5890,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, int element_bitsize = tree_to_uhwi (bitsize); tree compute_type = TREE_TYPE (vectype); gimple_seq stmts = NULL; - FOR_EACH_VEC_ELT (new_phis, i, new_phi) + FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) { int bit_offset; - if (gimple_code (new_phi) == GIMPLE_PHI) - vec_temp = PHI_RESULT (new_phi); - else - vec_temp = gimple_assign_lhs (new_phi); new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, vec_temp, bitsize, bitsize_zero_node); @@ -5881,6 +5939,7 @@ 
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, first_res, res); scalar_results[j % group_size] = new_res; } + scalar_results.truncate (group_size); for (k = 0; k < group_size; k++) scalar_results[k] = gimple_convert (&stmts, scalar_type, scalar_results[k]); @@ -5904,6 +5963,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tree tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5922,13 +5982,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { gcc_assert (!slp_reduc); gimple_seq stmts = NULL; - if (nested_in_vect_loop) + if (double_reduc) { - new_phi = new_phis[0]; gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); new_temp = gimple_build (&stmts, code, vectype, - PHI_RESULT (new_phi), adjustment_def); + reduc_inputs[0], adjustment_def); } else { @@ -5941,21 +6000,16 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_seq_last_stmt (stmts); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - if (nested_in_vect_loop) - { - if (!double_reduc) - scalar_results.quick_push (new_temp); - else - scalar_results[0] = new_temp; - } - else - scalar_results[0] = new_temp; - - new_phis[0] = epilog_stmt; + scalar_results[0] = new_temp; } + /* Record this operation if it could be reused by the epilogue loop. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION) + loop_vinfo->reusable_accumulators.put (scalar_results[0], + { orig_reduc_input, reduc_info }); + if (double_reduc) - loop = loop->inner; + loop = outer_loop; /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit phis with new adjusted scalar results, i.e., replace use <s_out0> @@ -5982,47 +6036,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, use <s_out4> use <s_out4> */ - - /* In SLP reduction chain we reduce vector results into one vector if - necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the - LHS of the last stmt in the reduction chain, since we are looking for - the loop exit phi node. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - { - stmt_vec_info dest_stmt_info - = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]); - scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt); - group_size = 1; - } - - /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in - case that REDUC_GROUP_SIZE is greater than vectorization factor). - Therefore, we need to match SCALAR_RESULTS with corresponding statements. - The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results - correspond to the first vector stmt, etc. - (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ - if (group_size > new_phis.length ()) - gcc_assert (!(group_size % new_phis.length ())); - - for (k = 0; k < group_size; k++) + gcc_assert (live_out_stmts.size () == scalar_results.length ()); + for (k = 0; k < live_out_stmts.size (); k++) { - if (slp_reduc) - { - stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; - - orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); - /* SLP statements can't participate in patterns. 
*/ - gcc_assert (!orig_stmt_info); - scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); - } - - if (nested_in_vect_loop) - { - if (double_reduc) - loop = outer_loop; - else - gcc_unreachable (); - } + stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]); + scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); phis.create (3); /* Find the loop-closed-use at the loop exit of the original scalar @@ -6057,6 +6075,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); + + /* Look for a single use at the target of the skip edge. */ + if (unify_with_main_loop_p) + { + use_operand_p use_p; + gimple *user; + if (!single_imm_use (orig_name, &use_p, &user)) + gcc_unreachable (); + orig_name = gimple_get_lhs (user); + } + scalar_result = scalar_results[k]; FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { @@ -6515,33 +6544,31 @@ vectorizable_reduction (loop_vec_info loop_vinfo, stmt_vec_info orig_stmt_of_analysis = stmt_info; stmt_vec_info phi_info = stmt_info; - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def - || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + if (!is_a <gphi *> (stmt_info->stmt)) { - if (!is_a <gphi *> (stmt_info->stmt)) - { - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; - return true; - } - if (slp_node) - { - slp_node_instance->reduc_phis = slp_node; - /* ??? We're leaving slp_node to point to the PHIs, we only - need it to get at the number of vector stmts which wasn't - yet initialized for the instance root. */ - } - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) - stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); - else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ - { - use_operand_p use_p; - gimple *use_stmt; - bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), - &use_p, &use_stmt); - gcc_assert (res); - phi_info = loop_vinfo->lookup_stmt (use_stmt); - stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); - } + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; + } + if (slp_node) + { + slp_node_instance->reduc_phis = slp_node; + /* ??? We're leaving slp_node to point to the PHIs, we only + need it to get at the number of vector stmts which wasn't + yet initialized for the instance root. */ + } + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); + else + { + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) + == vect_double_reduction_def); + use_operand_p use_p; + gimple *use_stmt; + bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), + &use_p, &use_stmt); + gcc_assert (res); + phi_info = loop_vinfo->lookup_stmt (use_stmt); + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); } /* PHIs should not participate in patterns. 
*/ @@ -6662,6 +6689,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool lane_reduc_code_p = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); int op_type = TREE_CODE_LENGTH (code); + enum optab_subtype optab_query_kind = optab_vector; + if (code == DOT_PROD_EXPR + && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt))) + != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt)))) + optab_query_kind = optab_vector_mixed_sign; + scalar_dest = gimple_assign_lhs (stmt); scalar_type = TREE_TYPE (scalar_dest); @@ -6831,10 +6864,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, else if (cond_reduc_dt == vect_constant_def) { enum vect_def_type cond_initial_dt; - tree cond_initial_val - = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); - - gcc_assert (cond_reduc_val != NULL_TREE); + tree cond_initial_val = vect_phi_initial_value (reduc_def_phi); vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); if (cond_initial_dt == vect_constant_def && types_compatible_p (TREE_TYPE (cond_initial_val), @@ -7027,9 +7057,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, /* For SLP reductions, see if there is a neutral value we can use. */ tree neutral_op = NULL_TREE; if (slp_node) - neutral_op = neutral_op_for_slp_reduction - (slp_node_instance->reduc_phis, vectype_out, orig_code, - REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); + { + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) + initial_value = vect_phi_initial_value (reduc_def_phi); + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), + orig_code, initial_value); + } if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) { @@ -7189,7 +7223,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool ok = true; /* 4.1. check support for the operation in the loop */ - optab optab = optab_for_tree_code (code, vectype_in, optab_vector); + optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind); if (!optab) { if (dump_enabled_p ()) @@ -7204,23 +7238,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) - || !vect_worthwhile_without_simd_p (loop_vinfo, code)) + || !vect_can_vectorize_without_simd_p (code)) ok = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); } - /* Worthwhile without SIMD support? */ - if (ok - && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) - && !vect_worthwhile_without_simd_p (loop_vinfo, code)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not worthwhile without SIMD support.\n"); - ok = false; - } + if (vect_emulated_vector_p (vectype_in) + && !vect_can_vectorize_without_simd_p (code)) + { + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using word mode not possible.\n"); + return false; + } /* lane-reducing operations have to go through vect_transform_reduction. For the other cases try without the single cycle optimization. */ @@ -7579,7 +7610,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, vectype_out); /* Get the loop-entry arguments. 
*/ - tree vec_initial_def; + tree vec_initial_def = NULL_TREE; auto_vec<tree> vec_initial_defs; if (slp_node) { @@ -7593,22 +7624,40 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, else { gcc_assert (slp_node == slp_node_instance->reduc_phis); - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); - tree neutral_op - = neutral_op_for_slp_reduction (slp_node, vectype_out, - STMT_VINFO_REDUC_CODE (reduc_info), - first != NULL); - get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - first != NULL, neutral_op); + vec<tree> &initial_values = reduc_info->reduc_initial_values; + vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); + + unsigned int num_phis = stmts.length (); + if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) + num_phis = 1; + initial_values.reserve (num_phis); + for (unsigned int i = 0; i < num_phis; ++i) + { + gphi *this_phi = as_a<gphi *> (stmts[i]->stmt); + initial_values.quick_push (vect_phi_initial_value (this_phi)); + } + if (vec_num == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!initial_values.is_empty ()) + { + tree initial_value + = (num_phis == 1 ? initial_values[0] : NULL_TREE); + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (vectype_out), + code, initial_value); + get_initial_defs_for_reduction (loop_vinfo, reduc_info, + &vec_initial_defs, vec_num, + stmts.length (), neutral_op); + } } } else { /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ - tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, - loop_preheader_edge (loop)); + tree initial_def = vect_phi_initial_value (phi); + reduc_info->reduc_initial_values.safe_push (initial_def); /* Optimize: if initial_def is for REDUC_MAX smaller than the base and we can't use zero for induc_val, use initial_def. Similarly for REDUC_MIN and initial_def larger than the base. */ @@ -7628,9 +7677,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; } vec_initial_def = build_vector_from_val (vectype_out, induc_val); - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); } else if (nested_cycle) { @@ -7640,23 +7686,106 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, ncopies, initial_def, &vec_initial_defs); } + else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION + || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + /* Fill the initial vector with the initial scalar value. 
*/ + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, + initial_def, initial_def); else { - tree adjustment_def = NULL_TREE; - tree *adjustment_defp = &adjustment_def; - enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) - adjustment_defp = NULL; - vec_initial_def - = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code, - initial_def, adjustment_defp); - STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); + if (ncopies == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!reduc_info->reduc_initial_values.is_empty ()) + { + initial_def = reduc_info->reduc_initial_values[0]; + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (initial_def), + code, initial_def); + gcc_assert (neutral_op); + /* Try to simplify the vector initialization by applying an + adjustment after the reduction has been performed. */ + if (!reduc_info->reused_accumulator + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + && !operand_equal_p (neutral_op, initial_def)) + { + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) + = initial_def; + initial_def = neutral_op; + } + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_info, + initial_def, neutral_op); + } } } + if (vec_initial_def) + { + vec_initial_defs.create (ncopies); + for (i = 0; i < ncopies; ++i) + vec_initial_defs.quick_push (vec_initial_def); + } + + if (auto *accumulator = reduc_info->reused_accumulator) + { + tree def = accumulator->reduc_input; + if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) + { + unsigned int nreduc; + bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS + (TREE_TYPE (def)), + TYPE_VECTOR_SUBPARTS (vectype_out), + &nreduc); + gcc_assert (res); + gimple_seq stmts = NULL; + /* Reduce the single vector to a smaller one. */ + if (nreduc != 1) + { + /* Perform the reduction in the appropriate type. */ + tree rvectype = vectype_out; + if (!useless_type_conversion_p (TREE_TYPE (vectype_out), + TREE_TYPE (TREE_TYPE (def)))) + rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)), + TYPE_VECTOR_SUBPARTS + (vectype_out)); + def = vect_create_partial_epilog (def, rvectype, + STMT_VINFO_REDUC_CODE + (reduc_info), + &stmts); + } + if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) + def = gimple_convert (&stmts, vectype_out, def); + /* Adjust the input so we pick up the partially reduced value + for the skip edge in vect_create_epilog_for_reduction. */ + accumulator->reduc_input = def; + if (loop_vinfo->main_loop_edge) + { + /* While we'd like to insert on the edge this will split + blocks and disturb bookkeeping, we also will eventually + need this on the skip edge. Rely on sinking to + fixup optimal placement and insert in the pred. */ + gimple_stmt_iterator gsi + = gsi_last_bb (loop_vinfo->main_loop_edge->src); + /* Insert before a cond that eventually skips the + epilogue. 
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      tree def = accumulator->reduc_input;
+      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
+        {
+          unsigned int nreduc;
+          bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
+                                            (TREE_TYPE (def)),
+                                          TYPE_VECTOR_SUBPARTS (vectype_out),
+                                          &nreduc);
+          gcc_assert (res);
+          gimple_seq stmts = NULL;
+          /* Reduce the single vector to a smaller one.  */
+          if (nreduc != 1)
+            {
+              /* Perform the reduction in the appropriate type.  */
+              tree rvectype = vectype_out;
+              if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
+                                              TREE_TYPE (TREE_TYPE (def))))
+                rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (vectype_out));
+              def = vect_create_partial_epilog (def, rvectype,
+                                                STMT_VINFO_REDUC_CODE
+                                                  (reduc_info),
+                                                &stmts);
+            }
+          if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
+            def = gimple_convert (&stmts, vectype_out, def);
+          /* Adjust the input so we pick up the partially reduced value
+             for the skip edge in vect_create_epilog_for_reduction.  */
+          accumulator->reduc_input = def;
+          if (loop_vinfo->main_loop_edge)
+            {
+              /* While we'd like to insert on the edge itself, doing so
+                 would split blocks and disturb bookkeeping, and we will
+                 eventually need the sequence on the skip edge as well.
+                 Rely on sinking to fix up the optimal placement and
+                 insert in the predecessor instead.  */
+              gimple_stmt_iterator gsi
+                = gsi_last_bb (loop_vinfo->main_loop_edge->src);
+              /* Insert before a cond that eventually skips the
+                 epilogue.  */
+              if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
+                gsi_prev (&gsi);
+              gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
+            }
+          else
+            gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
+                                              stmts);
+        }
+      if (loop_vinfo->main_loop_edge)
+        vec_initial_defs[0]
+          = vect_get_main_loop_result (loop_vinfo, def,
+                                       vec_initial_defs[0]);
+      else
+        vec_initial_defs.safe_push (def);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
@@ -7826,47 +7955,39 @@ vectorizable_phi (vec_info *,
   return true;
 }
 
+/* Return true if VECTYPE represents a vector that requires lowering
+   by the vector lowering pass.  */
 
-/* Function vect_min_worthwhile_factor.
+bool
+vect_emulated_vector_p (tree vectype)
+{
+  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
+          && (!VECTOR_BOOLEAN_TYPE_P (vectype)
+              || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
+}
 
-   For a loop where we could vectorize the operation indicated by CODE,
-   return the minimum vectorization factor that makes it worthwhile
-   to use generic vectors.  */
-static unsigned int
-vect_min_worthwhile_factor (enum tree_code code)
+/* Return true if we can emulate CODE on an integer mode representation
+   of a vector.  */
+
+bool
+vect_can_vectorize_without_simd_p (tree_code code)
 {
   switch (code)
     {
     case PLUS_EXPR:
     case MINUS_EXPR:
     case NEGATE_EXPR:
-      return 4;
-
     case BIT_AND_EXPR:
     case BIT_IOR_EXPR:
     case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
-      return 2;
+      return true;
     default:
-      return INT_MAX;
+      return false;
    }
 }
 
-/* Return true if VINFO indicates we are doing loop vectorization and if
-   it is worth decomposing CODE operations into scalar operations for
-   that loop's vectorization factor.  */
-
-bool
-vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
-{
-  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
-  unsigned HOST_WIDE_INT value;
-  return (loop_vinfo
-          && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
-          && value >= vect_min_worthwhile_factor (code));
-}
-
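
The two new predicates split what vect_worthwhile_without_simd_p conflated:
whether a vector type is merely emulated (its mode is not a vector mode), and
whether a given operation is safe to perform on such an integer-mode
representation.  The bitwise cases are lane-local by construction; PLUS_EXPR,
MINUS_EXPR and NEGATE_EXPR work too once the inter-lane carries are
suppressed.  A self-contained C illustration of both points, using standard
SWAR folklore rather than code from the patch:

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    /* Eight 8-bit lanes packed into one 64-bit integer.  */
    uint64_t x = 0x0102030405060708ull;
    uint64_t y = 0x00ff00ff00ff00ffull;

    /* BIT_AND_EXPR / BIT_IOR_EXPR / BIT_XOR_EXPR are lane-local as-is:
       one scalar instruction processes all eight lanes.  */
    uint64_t lane_and = x & y;
    uint64_t lane_xor = x ^ y;

    /* PLUS_EXPR needs the carry out of each lane suppressed: add the
       low 7 bits of every lane, then fix up the top bits modulo 2.  */
    uint64_t h = 0x8080808080808080ull;   /* per-lane sign bits  */
    uint64_t lane_add = ((x & ~h) + (y & ~h)) ^ ((x ^ y) & h);

    printf ("%016llx\n%016llx\n%016llx\n",
            (unsigned long long) lane_and,
            (unsigned long long) lane_xor,
            (unsigned long long) lane_add); /* 0101030305050707  */
    return 0;
  }
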
 /* Function vectorizable_induction
 
    Check if STMT_INFO performs an induction computation that can be
    vectorized.
@@ -8202,11 +8323,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
           /* Fill up to the number of vectors we need for the whole group.  */
           nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
+          vec_steps.reserve (nivs - ivn);
           for (; ivn < nivs; ++ivn)
             {
               SLP_TREE_VEC_STMTS (slp_node)
                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
-              vec_steps.safe_push (vec_steps[0]);
+              vec_steps.quick_push (vec_steps[0]);
             }
         }
@@ -8253,8 +8375,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                     loop_preheader_edge (iv_loop));
+  init_expr = vect_phi_initial_value (phi);
 
   gimple_seq stmts = NULL;
   if (!nested_in_vect_loop)
@@ -9148,6 +9269,7 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
          && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && STMT_VINFO_RELEVANT_P (phi_info)
          && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
          && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
          && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
@@ -9674,7 +9796,10 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            !gsi_end_p (gsi); gsi_next (&gsi))
         {
           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
-          if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
+          if (!call || !gimple_call_internal_p (call))
+            continue;
+          internal_fn ifn = gimple_call_internal_fn (call);
+          if (ifn == IFN_MASK_LOAD)
             {
               tree lhs = gimple_get_lhs (call);
               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
@@ -9684,6 +9809,17 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                   gsi_replace (&gsi, new_stmt, true);
                 }
             }
+          else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
+            {
+              tree lhs = gimple_get_lhs (call);
+              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+                {
+                  tree else_arg
+                    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
+                  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
+                  gsi_replace (&gsi, new_stmt, true);
+                }
+            }
         }
     }                           /* BBs in loop */
@@ -9723,12 +9859,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   /* In these calculations the "- 1" converts loop iteration counts
      back to latch counts.  */
   if (loop->any_upper_bound)
-    loop->nb_iterations_upper_bound
-      = (final_iter_may_be_partial
-         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
-                          lowest_vf) - 1
-         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
-                           lowest_vf) - 1);
+    {
+      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      loop->nb_iterations_upper_bound
+        = (final_iter_may_be_partial
+           ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
+                            lowest_vf) - 1
+           : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
+                             lowest_vf) - 1);
+      if (main_vinfo)
+        {
+          unsigned int bound;
+          poly_uint64 main_iters
+            = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
+                           LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
+          main_iters
+            = upper_bound (main_iters,
+                           LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
+          if (can_div_away_from_zero_p (main_iters,
+                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                        &bound))
+            loop->nb_iterations_upper_bound
+              = wi::umin ((widest_int) (bound - 1),
+                          loop->nb_iterations_upper_bound);
+        }
+    }
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
       = (final_iter_may_be_partial
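
The new bound above can be read in scalar terms: whenever the epilogue loop
executes, the number of scalar iterations reaching it is bounded by the
larger of the main loop's vectorization factor and its cost/versioning
thresholds, so the epilogue's vector latch count is at most
ceil (main_iters / epilogue_vf) - 1, which is then min'ed with the bound
already computed.  A tiny C sketch of that arithmetic, with illustrative
names that are not GCC internals:

  #include <stdio.h>

  /* Illustrative only: upper bound on the epilogue loop's latch count,
     given that at most MAIN_ITERS scalar iterations can reach it.  */
  static unsigned
  epilogue_latch_bound (unsigned main_iters, unsigned epilogue_vf)
  {
    unsigned iters = (main_iters + epilogue_vf - 1) / epilogue_vf; /* ceil  */
    return iters - 1;           /* iteration count -> latch count  */
  }

  int main (void)
  {
    /* E.g. a main loop with VF 16 and cost threshold 20: an epilogue
       with VF 8 runs at most ceil (20 / 8) - 1 = 2 latch iterations.  */
    printf ("%u\n", epilogue_latch_bound (20, 8));
    return 0;
  }
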