Diffstat (limited to 'gcc/tree-vect-loop.c')
 -rw-r--r--  gcc/tree-vect-loop.c | 836
 1 file changed, 601 insertions(+), 235 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e42f327..d6f1ffc 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -666,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
unsigned i;
FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
- if (STMT_VINFO_IN_PATTERN_P (first))
- {
- stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
- while (next)
- {
- if (! STMT_VINFO_IN_PATTERN_P (next)
- || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
- break;
- next = REDUC_GROUP_NEXT_ELEMENT (next);
- }
- /* If not all stmt in the chain are patterns or if we failed
- to update STMT_VINFO_REDUC_IDX try to handle the chain
- without patterns. */
- if (! next
- && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
- {
- vect_fixup_reduc_chain (first);
- LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
- = STMT_VINFO_RELATED_STMT (first);
- }
- }
+ {
+ stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+ while (next)
+ {
+ if ((STMT_VINFO_IN_PATTERN_P (next)
+ != STMT_VINFO_IN_PATTERN_P (first))
+ || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+ break;
+ next = REDUC_GROUP_NEXT_ELEMENT (next);
+ }
+ /* If all reduction chain members are well-formed patterns, adjust
+ the group to group the pattern stmts instead. */
+ if (! next
+ && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+ {
+ if (STMT_VINFO_IN_PATTERN_P (first))
+ {
+ vect_fixup_reduc_chain (first);
+ LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+ = STMT_VINFO_RELATED_STMT (first);
+ }
+ }
+ /* If not all stmts in the chain are patterns or if we failed
+ to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+ it as a regular reduction instead. */
+ else
+ {
+ stmt_vec_info vinfo = first;
+ stmt_vec_info last = NULL;
+ while (vinfo)
+ {
+ next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+ REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+ REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+ last = vinfo;
+ vinfo = next;
+ }
+ STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+ = vect_internal_def;
+ loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+ LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+ --i;
+ }
+ }
}
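
Illustrative sketch, not part of the patch: a reduction chain in this sense is a group of statements that all feed the same accumulator within one iteration. In a loop like the one below, pattern recognition may rewrite only some of the chain members; the fixup above then dissolves the chain and handles the accumulator as a regular reduction. Names are arbitrary.

/* Illustrative only: 'sum' is updated by two chained statements per
   iteration, forming a reduction chain of group size two.  */
int
chained_sum (const short *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    {
      sum += a[2 * i];      /* first chain element */
      sum += a[2 * i + 1];  /* second chain element */
    }
  return sum;
}
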
/* Function vect_get_loop_niters.
@@ -1804,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
}
}
+ /* If using the "very cheap" model, reject cases in which we'd keep
+ a copy of the scalar code (even if we might be able to vectorize it). */
+ if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+ && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "some scalar iterations would need to be peeled\n");
+ return 0;
+ }
+
int min_profitable_iters, min_profitable_estimate;
vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
&min_profitable_estimate);
@@ -1862,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
min_profitable_estimate = min_profitable_iters;
}
+ /* If the vector loop needs multiple iterations to be beneficial then
+ things are probably too close to call, and the conservative thing
+ would be to stick with the scalar code. */
+ if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+ && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "one iteration of the vector loop would be"
+ " more expensive than the equivalent number of"
+ " iterations of the scalar loop\n");
+ return 0;
+ }
+
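
Illustrative sketch, not GCC code: the two rejection rules added for the very-cheap cost model boil down to the check below; the boolean and integer parameters are hypothetical stand-ins for the loop_vec_info queries used above.

#include <stdbool.h>

/* Sketch of the very-cheap policy above: no scalar iterations may be
   peeled, and the vector loop must pay off within one vector
   iteration's worth of scalar iterations.  */
static bool
very_cheap_model_ok (bool peel_for_alignment, bool peel_for_gaps,
                     bool peel_for_niter, int min_profitable_estimate,
                     int vf_for_cost)
{
  if (peel_for_alignment || peel_for_gaps || peel_for_niter)
    return false;
  return min_profitable_estimate <= vf_for_cost;
}
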
HOST_WIDE_INT estimated_niter;
/* If we are vectorizing an epilogue then we know the maximum number of
@@ -2275,6 +2325,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
/* Optimize the SLP graph with the vectorization factor fixed. */
vect_optimize_slp (loop_vinfo);
+
+ /* Gather the loads reachable from the SLP graph entries. */
+ vect_gather_slp_loads (loop_vinfo);
}
bool saved_can_use_partial_vectors_p
@@ -2342,6 +2395,79 @@ start_over:
"unsupported SLP instances\n");
goto again;
}
+
+ /* Check whether any load in ALL SLP instances is possibly permuted. */
+ slp_tree load_node, slp_root;
+ unsigned i, x;
+ slp_instance instance;
+ bool can_use_lanes = true;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+ {
+ slp_root = SLP_INSTANCE_TREE (instance);
+ int group_size = SLP_TREE_LANES (slp_root);
+ tree vectype = SLP_TREE_VECTYPE (slp_root);
+ bool loads_permuted = false;
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+ {
+ if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+ continue;
+ unsigned j;
+ stmt_vec_info load_info;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+ if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+ {
+ loads_permuted = true;
+ break;
+ }
+ }
+
+ /* If the loads and stores can be handled with load/store-lane
+ instructions record it and move on to the next instance. */
+ if (loads_permuted
+ && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+ && vect_store_lanes_supported (vectype, group_size, false))
+ {
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+ {
+ stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+ (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+ /* Use SLP for strided accesses (or if we can't
+ load-lanes). */
+ if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+ || ! vect_load_lanes_supported
+ (STMT_VINFO_VECTYPE (stmt_vinfo),
+ DR_GROUP_SIZE (stmt_vinfo), false))
+ break;
+ }
+
+ can_use_lanes
+ = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+ if (can_use_lanes && dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP instance %p can use load/store-lanes\n",
+ instance);
+ }
+ else
+ {
+ can_use_lanes = false;
+ break;
+ }
+ }
+
+ /* If all SLP instances can use load/store-lanes, abort SLP and try again
+ with SLP disabled. */
+ if (can_use_lanes)
+ {
+ ok = opt_result::failure_at (vect_location,
+ "Built SLP cancelled: can use "
+ "load/store-lanes\n");
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Built SLP cancelled: all SLP instances support "
+ "load/store-lanes\n");
+ goto again;
+ }
}
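
Illustrative sketch, not part of the patch: the situation this check targets is interleaved accesses whose SLP form needs load permutations but which load/store-lane instructions (e.g. AArch64 ld2/st2) handle directly. When every instance looks like the loop below, the code above cancels the SLP build and retries without SLP.

/* Illustrative only: both loads are permuted relative to the store
   order, so SLP needs load permutations while load/store-lanes can
   de-interleave and re-interleave directly.  */
void
swap_pairs (int *restrict dst, const int *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    {
      dst[2 * i]     = src[2 * i + 1];
      dst[2 * i + 1] = src[2 * i];
    }
}
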
/* Dissolve SLP-only groups. */
@@ -2572,9 +2698,13 @@ again:
STMT_SLP_TYPE (stmt_info) = loop_vect;
if (STMT_VINFO_IN_PATTERN_P (stmt_info))
{
+ stmt_vec_info pattern_stmt_info
+ = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (STMT_VINFO_SLP_VECT_ONLY (pattern_stmt_info))
+ STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
+
gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
- stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
- STMT_SLP_TYPE (stmt_info) = loop_vect;
+ STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
!gsi_end_p (pi); gsi_next (&pi))
STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
@@ -2643,43 +2773,56 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
/* Check whether the (fractional) cost per scalar iteration is lower
or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
- poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
- * poly_widest_int (old_vf));
- poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
- * poly_widest_int (new_vf));
- if (maybe_lt (rel_old, rel_new))
- {
- /* When old_loop_vinfo uses a variable vectorization factor,
- we know that it has a lower cost for at least one runtime VF.
- However, we don't know how likely that VF is.
-
- One option would be to compare the costs for the estimated VFs.
- The problem is that that can put too much pressure on the cost
- model. E.g. if the estimated VF is also the lowest possible VF,
- and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
- for the estimated VF, we'd then choose new_loop_vinfo even
- though (a) new_loop_vinfo might not actually be better than
- old_loop_vinfo for that VF and (b) it would be significantly
- worse at larger VFs.
-
- Here we go for a hacky compromise: pick new_loop_vinfo if it is
- no more expensive than old_loop_vinfo even after doubling the
- estimated old_loop_vinfo VF. For all but trivial loops, this
- ensures that we only pick new_loop_vinfo if it is significantly
- better than old_loop_vinfo at the estimated VF. */
- if (rel_new.is_constant ())
- return false;
+ poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
+ poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
+
+ HOST_WIDE_INT est_rel_new_min
+ = estimated_poly_value (rel_new, POLY_VALUE_MIN);
+ HOST_WIDE_INT est_rel_new_max
+ = estimated_poly_value (rel_new, POLY_VALUE_MAX);
+
+ HOST_WIDE_INT est_rel_old_min
+ = estimated_poly_value (rel_old, POLY_VALUE_MIN);
+ HOST_WIDE_INT est_rel_old_max
+ = estimated_poly_value (rel_old, POLY_VALUE_MAX);
+
+ /* Check first if we can make out an unambiguous total order from the minimum
+ and maximum estimates. */
+ if (est_rel_new_min < est_rel_old_min
+ && est_rel_new_max < est_rel_old_max)
+ return true;
+ else if (est_rel_old_min < est_rel_new_min
+ && est_rel_old_max < est_rel_new_max)
+ return false;
+ /* When old_loop_vinfo uses a variable vectorization factor,
+ we know that it has a lower cost for at least one runtime VF.
+ However, we don't know how likely that VF is.
+
+ One option would be to compare the costs for the estimated VFs.
+ The problem is that that can put too much pressure on the cost
+ model. E.g. if the estimated VF is also the lowest possible VF,
+ and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+ for the estimated VF, we'd then choose new_loop_vinfo even
+ though (a) new_loop_vinfo might not actually be better than
+ old_loop_vinfo for that VF and (b) it would be significantly
+ worse at larger VFs.
+
+ Here we go for a hacky compromise: pick new_loop_vinfo if it is
+ no more expensive than old_loop_vinfo even after doubling the
+ estimated old_loop_vinfo VF. For all but trivial loops, this
+ ensures that we only pick new_loop_vinfo if it is significantly
+ better than old_loop_vinfo at the estimated VF. */
- HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
- HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
- widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
- * widest_int (old_estimated_vf));
- widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
- * widest_int (new_estimated_vf));
- return estimated_rel_new * 2 <= estimated_rel_old;
+ if (est_rel_old_min != est_rel_new_min
+ || est_rel_old_max != est_rel_new_max)
+ {
+ HOST_WIDE_INT est_rel_new_likely
+ = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
+ HOST_WIDE_INT est_rel_old_likely
+ = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
+
+ return est_rel_new_likely * 2 <= est_rel_old_likely;
}
- if (known_lt (rel_new, rel_old))
- return true;
/* If there's nothing to choose between the loop bodies, see whether
there's a difference in the prologue and epilogue costs. */
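
A standalone sketch of the rewritten comparison, with plain integers standing in for the poly_int estimates and all names hypothetical: first look for an unambiguous order between the min and max estimates of new_cost * old_vf and old_cost * new_vf, then fall back to the doubled-cost rule on the likely estimates, and leave equal body costs to the prologue/epilogue comparison below.

#include <stdbool.h>

/* The min/max/likely values are estimates of new_cost * old_vf
   resp. old_cost * new_vf.  */
static bool
new_loop_cheaper (long new_min, long new_max, long new_likely,
                  long old_min, long old_max, long old_likely)
{
  /* Unambiguous total order from the min and max estimates.  */
  if (new_min < old_min && new_max < old_max)
    return true;
  if (old_min < new_min && old_max < new_max)
    return false;
  /* Ambiguous order: require the new loop body to be at most half as
     expensive at the likely estimate.  */
  if (old_min != new_min || old_max != new_max)
    return new_likely * 2 <= old_likely;
  /* Equal body costs: the caller compares prologue/epilogue costs.  */
  return false;
}
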
@@ -3230,14 +3373,17 @@ pop:
fail = true;
break;
}
- /* Check there's only a single stmt the op is used on inside
- of the loop. */
+ /* Check there's only a single stmt the op is used on. For the
+ not value-changing tail and the last stmt allow out-of-loop uses.
+ ??? We could relax this and handle arbitrary live stmts by
+ forcing a scalar epilogue for example. */
imm_use_iterator imm_iter;
gimple *op_use_stmt;
unsigned cnt = 0;
FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
if (!is_gimple_debug (op_use_stmt)
- && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+ && (*code != ERROR_MARK
+ || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
{
/* We want to allow x + x but not x < 1 ? x : 2. */
if (is_gimple_assign (op_use_stmt)
@@ -4420,34 +4566,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
}
-/* Function vect_model_induction_cost.
-
- Models cost for induction operations. */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
- stmt_vector_for_cost *cost_vec)
-{
- unsigned inside_cost, prologue_cost;
-
- if (PURE_SLP_STMT (stmt_info))
- return;
-
- /* loop cost for vec_loop. */
- inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
- stmt_info, 0, vect_body);
-
- /* prologue cost for vec_init and vec_step. */
- prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
- stmt_info, 0, vect_prologue);
-
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "vect_model_induction_cost: inside_cost = %d, "
- "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
/* Function get_initial_def_for_reduction
@@ -5160,8 +5278,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
int scalar_precision
= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
- tree vectype_unsigned = build_vector_type
- (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
+ tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
+ vectype);
/* First we need to create a vector (ZERO_VEC) of zeros and another
vector (MAX_INDEX_VEC) filled with the last matching index, which we
@@ -6313,9 +6431,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
{
if (is_a <gphi *> (stmt_info->stmt))
- /* Analysis for double-reduction is done on the outer
- loop PHI, nested cycles have no further restrictions. */
- STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+ {
+ if (slp_node)
+ {
+ /* We eventually need to set a vector type on invariant
+ arguments. */
+ unsigned j;
+ slp_tree child;
+ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+ if (!vect_maybe_update_slp_op_vectype
+ (child, SLP_TREE_VECTYPE (slp_node)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for "
+ "invariants\n");
+ return false;
+ }
+ }
+ /* Analysis for double-reduction is done on the outer
+ loop PHI, nested cycles have no further restrictions. */
+ STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+ }
else
STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
return true;
@@ -6805,8 +6942,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
int scalar_precision
= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
cr_index_scalar_type = make_unsigned_type (scalar_precision);
- cr_index_vector_type = build_vector_type (cr_index_scalar_type,
- nunits_out);
+ cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
+ vectype_out);
if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
OPTIMIZE_FOR_SPEED))
@@ -7377,15 +7514,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
if (slp_node)
{
vec_initial_defs.reserve (vec_num);
- gcc_assert (slp_node == slp_node_instance->reduc_phis);
- stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
- tree neutral_op
- = neutral_op_for_slp_reduction (slp_node, vectype_out,
- STMT_VINFO_REDUC_CODE (reduc_info),
- first != NULL);
- get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
- &vec_initial_defs, vec_num,
- first != NULL, neutral_op);
+ if (nested_cycle)
+ {
+ unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+ vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+ &vec_initial_defs);
+ }
+ else
+ {
+ gcc_assert (slp_node == slp_node_instance->reduc_phis);
+ stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+ tree neutral_op
+ = neutral_op_for_slp_reduction (slp_node, vectype_out,
+ STMT_VINFO_REDUC_CODE (reduc_info),
+ first != NULL);
+ get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+ &vec_initial_defs, vec_num,
+ first != NULL, neutral_op);
+ }
}
else
{
@@ -7491,6 +7637,17 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
if (!vec_stmt) /* transformation not required. */
{
+ /* Deal with copies from externs or constants that are disguised as
+ loop-closed PHI nodes (PR97886). */
+ if (slp_node
+ && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+ SLP_TREE_VECTYPE (slp_node)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
return true;
}
@@ -7520,6 +7677,81 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
return true;
}
+/* Vectorizes PHIs. */
+
+bool
+vectorizable_phi (vec_info *,
+ stmt_vec_info stmt_info, gimple **vec_stmt,
+ slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+ if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+ return false;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+ return false;
+
+ tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+ if (!vec_stmt) /* transformation not required. */
+ {
+ slp_tree child;
+ unsigned i;
+ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+ if (!child)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "PHI node with unvectorized backedge def\n");
+ return false;
+ }
+ else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+ record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+ vector_stmt, stmt_info, vectype, 0, vect_body);
+ STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+ return true;
+ }
+
+ tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+ basic_block bb = gimple_bb (stmt_info->stmt);
+ tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ auto_vec<gphi *> new_phis;
+ for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
+ {
+ slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+ /* Skip not yet vectorized defs. */
+ if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+ && SLP_TREE_VEC_STMTS (child).is_empty ())
+ continue;
+
+ auto_vec<tree> vec_oprnds;
+ vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+ if (!new_phis.exists ())
+ {
+ new_phis.create (vec_oprnds.length ());
+ for (unsigned j = 0; j < vec_oprnds.length (); j++)
+ {
+ /* Create the vectorized LC PHI node. */
+ new_phis.quick_push (create_phi_node (vec_dest, bb));
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+ }
+ }
+ edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+ for (unsigned j = 0; j < vec_oprnds.length (); j++)
+ add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
+ }
+ /* We should have at least one already vectorized child. */
+ gcc_assert (new_phis.exists ());
+
+ return true;
+}
+
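
A way to picture the transform, using GCC's generic vector extension rather than internal types (illustrative only): a scalar merge x = PHI <a (e1), b (e2)> becomes a vector merge vx = PHI <va (e1), vb (e2)>, where each argument is the vectorized def of that edge or a broadcast of an invariant with the vector type checked above.

/* Illustrative only: a vector PHI selects whole vectors by incoming
   edge; the lanes of each argument stay together, no lane-wise
   blending is introduced.  */
typedef int v4si __attribute__ ((vector_size (16)));

static v4si
model_vector_phi (int came_from_e1, v4si va, v4si vb)
{
  return came_from_e1 ? va : vb;
}
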
/* Function vect_min_worthwhile_factor.
@@ -7590,7 +7822,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
unsigned i;
tree expr;
- gimple_seq stmts;
gimple_stmt_iterator si;
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7630,10 +7861,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
return false;
}
- /* FORNOW: outer loop induction with SLP not supported. */
- if (STMT_SLP_TYPE (stmt_info))
- return false;
-
exit_phi = NULL;
latch_e = loop_latch_edge (loop->inner);
loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7672,7 +7899,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (slp_node && !nunits.is_constant ())
{
- /* The current SLP code creates the initial value element-by-element. */
+ /* The current SLP code creates the step value element-by-element. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"SLP induction not supported for variable-length"
@@ -7682,9 +7909,50 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (!vec_stmt) /* transformation not required. */
{
+ unsigned inside_cost = 0, prologue_cost = 0;
+ if (slp_node)
+ {
+ /* We eventually need to set a vector type on invariant
+ arguments. */
+ unsigned j;
+ slp_tree child;
+ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+ if (!vect_maybe_update_slp_op_vectype
+ (child, SLP_TREE_VECTYPE (slp_node)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for "
+ "invariants\n");
+ return false;
+ }
+ /* loop cost for vec_loop. */
+ inside_cost
+ = record_stmt_cost (cost_vec,
+ SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+ vector_stmt, stmt_info, 0, vect_body);
+ /* prologue cost for vec_init (if not nested) and step. */
+ prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+ scalar_to_vec,
+ stmt_info, 0, vect_prologue);
+ }
+ else /* if (!slp_node) */
+ {
+ /* loop cost for vec_loop. */
+ inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+ stmt_info, 0, vect_body);
+ /* prologue cost for vec_init and vec_step. */
+ prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+ stmt_info, 0, vect_prologue);
+ }
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_model_induction_cost: inside_cost = %d, "
+ "prologue_cost = %d .\n", inside_cost,
+ prologue_cost);
+
STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
DUMP_VECT_SCOPE ("vectorizable_induction");
- vect_model_induction_cost (stmt_info, ncopies, cost_vec);
return true;
}
@@ -7703,164 +7971,195 @@ vectorizable_induction (loop_vec_info loop_vinfo,
tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
pe = loop_preheader_edge (iv_loop);
- init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
- loop_preheader_edge (iv_loop));
-
- stmts = NULL;
- if (!nested_in_vect_loop)
- {
- /* Convert the initial value to the IV update type. */
- tree new_type = TREE_TYPE (step_expr);
- init_expr = gimple_convert (&stmts, new_type, init_expr);
-
- /* If we are using the loop mask to "peel" for alignment then we need
- to adjust the start value here. */
- tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
- if (skip_niters != NULL_TREE)
- {
- if (FLOAT_TYPE_P (vectype))
- skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
- skip_niters);
- else
- skip_niters = gimple_convert (&stmts, new_type, skip_niters);
- tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
- skip_niters, step_expr);
- init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
- init_expr, skip_step);
- }
- }
-
- if (stmts)
- {
- new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
- gcc_assert (!new_bb);
- }
-
/* Find the first insertion point in the BB. */
basic_block bb = gimple_bb (phi);
si = gsi_after_labels (bb);
/* For SLP induction we have to generate several IVs as for example
- with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
- [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
- [VF*S, VF*S, VF*S, VF*S] for all. */
+ with group size 3 we need
+ [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+ [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
if (slp_node)
{
/* Enforced above. */
unsigned int const_nunits = nunits.to_constant ();
- /* Generate [VF*S, VF*S, ... ]. */
- if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+ /* The initial values are vectorized, but any lanes > group_size
+ need adjustment. */
+ slp_tree init_node
+ = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+ /* Gather steps. Since we do not vectorize inductions as
+ cycles we have to reconstruct the step from SCEV data. */
+ unsigned group_size = SLP_TREE_LANES (slp_node);
+ tree *steps = XALLOCAVEC (tree, group_size);
+ tree *inits = XALLOCAVEC (tree, group_size);
+ stmt_vec_info phi_info;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
{
- expr = build_int_cst (integer_type_node, vf);
- expr = fold_convert (TREE_TYPE (step_expr), expr);
+ steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+ if (!init_node)
+ inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+ pe->dest_idx);
}
- else
- expr = build_int_cst (TREE_TYPE (step_expr), vf);
- new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
- expr, step_expr);
- if (! CONSTANT_CLASS_P (new_name))
- new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
- TREE_TYPE (step_expr), NULL);
- new_vec = build_vector_from_val (step_vectype, new_name);
- vec_step = vect_init_vector (loop_vinfo, stmt_info,
- new_vec, step_vectype, NULL);
/* Now generate the IVs. */
- unsigned group_size = SLP_TREE_LANES (slp_node);
unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
- unsigned elts = const_nunits * nvects;
- /* Compute the number of distinct IVs we need. First reduce
- group_size if it is a multiple of const_nunits so we get
- one IV for a group_size of 4 but const_nunits 2. */
- unsigned group_sizep = group_size;
- if (group_sizep % const_nunits == 0)
- group_sizep = group_sizep / const_nunits;
- unsigned nivs = least_common_multiple (group_sizep,
- const_nunits) / const_nunits;
- gcc_assert (elts % group_size == 0);
- tree elt = init_expr;
+ gcc_assert ((const_nunits * nvects) % group_size == 0);
+ unsigned nivs;
+ if (nested_in_vect_loop)
+ nivs = nvects;
+ else
+ {
+ /* Compute the number of distinct IVs we need. First reduce
+ group_size if it is a multiple of const_nunits so we get
+ one IV for a group_size of 4 but const_nunits 2. */
+ unsigned group_sizep = group_size;
+ if (group_sizep % const_nunits == 0)
+ group_sizep = group_sizep / const_nunits;
+ nivs = least_common_multiple (group_sizep,
+ const_nunits) / const_nunits;
+ }
+ tree stept = TREE_TYPE (step_vectype);
+ tree lupdate_mul = NULL_TREE;
+ if (!nested_in_vect_loop)
+ {
+ /* The number of iterations covered in one vector iteration. */
+ unsigned lup_mul = (nvects * const_nunits) / group_size;
+ lupdate_mul
+ = build_vector_from_val (step_vectype,
+ SCALAR_FLOAT_TYPE_P (stept)
+ ? build_real_from_wide (stept, lup_mul,
+ UNSIGNED)
+ : build_int_cstu (stept, lup_mul));
+ }
+ tree peel_mul = NULL_TREE;
+ gimple_seq init_stmts = NULL;
+ if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+ {
+ if (SCALAR_FLOAT_TYPE_P (stept))
+ peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+ LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+ else
+ peel_mul = gimple_convert (&init_stmts, stept,
+ LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+ peel_mul = gimple_build_vector_from_val (&init_stmts,
+ step_vectype, peel_mul);
+ }
unsigned ivn;
+ auto_vec<tree> vec_steps;
for (ivn = 0; ivn < nivs; ++ivn)
{
- tree_vector_builder elts (step_vectype, const_nunits, 1);
- stmts = NULL;
+ tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+ tree_vector_builder init_elts (vectype, const_nunits, 1);
+ tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
{
- if (ivn*const_nunits + eltn >= group_size
- && (ivn * const_nunits + eltn) % group_size == 0)
- elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
- elt, step_expr);
- elts.quick_push (elt);
- }
- vec_init = gimple_build_vector (&stmts, &elts);
- vec_init = gimple_convert (&stmts, vectype, vec_init);
- if (stmts)
- {
- new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
- gcc_assert (!new_bb);
+ /* The scalar steps of the IVs. */
+ tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+ elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+ step_elts.quick_push (elt);
+ if (!init_node)
+ {
+ /* The scalar inits of the IVs if not vectorized. */
+ elt = inits[(ivn*const_nunits + eltn) % group_size];
+ if (!useless_type_conversion_p (TREE_TYPE (vectype),
+ TREE_TYPE (elt)))
+ elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+ TREE_TYPE (vectype), elt);
+ init_elts.quick_push (elt);
+ }
+ /* The number of steps to add to the initial values. */
+ unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+ mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+ ? build_real_from_wide (stept,
+ mul_elt, UNSIGNED)
+ : build_int_cstu (stept, mul_elt));
}
+ vec_step = gimple_build_vector (&init_stmts, &step_elts);
+ vec_steps.safe_push (vec_step);
+ tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+ if (peel_mul)
+ step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+ step_mul, peel_mul);
+ if (!init_node)
+ vec_init = gimple_build_vector (&init_stmts, &init_elts);
/* Create the induction-phi that defines the induction-operand. */
- vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+ vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+ "vec_iv_");
induction_phi = create_phi_node (vec_dest, iv_loop->header);
induc_def = PHI_RESULT (induction_phi);
/* Create the iv update inside the loop */
+ tree up = vec_step;
+ if (lupdate_mul)
+ up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+ vec_step, lupdate_mul);
gimple_seq stmts = NULL;
vec_def = gimple_convert (&stmts, step_vectype, induc_def);
vec_def = gimple_build (&stmts,
- PLUS_EXPR, step_vectype, vec_def, vec_step);
+ PLUS_EXPR, step_vectype, vec_def, up);
vec_def = gimple_convert (&stmts, vectype, vec_def);
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+ UNKNOWN_LOCATION);
+
+ if (init_node)
+ vec_init = vect_get_slp_vect_def (init_node, ivn);
+ if (!nested_in_vect_loop
+ && !integer_zerop (step_mul))
+ {
+ vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+ up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+ vec_step, step_mul);
+ vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+ vec_def, up);
+ vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+ }
/* Set the arguments of the phi node: */
add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
- add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
- UNKNOWN_LOCATION);
SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
}
- /* Fill up to the number of vectors we need for the whole group. */
- nivs = least_common_multiple (group_size,
- const_nunits) / const_nunits;
- for (; ivn < nivs; ++ivn)
- SLP_TREE_VEC_STMTS (slp_node)
- .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+ if (!nested_in_vect_loop)
+ {
+ /* Fill up to the number of vectors we need for the whole group. */
+ nivs = least_common_multiple (group_size,
+ const_nunits) / const_nunits;
+ for (; ivn < nivs; ++ivn)
+ {
+ SLP_TREE_VEC_STMTS (slp_node)
+ .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+ vec_steps.safe_push (vec_steps[0]);
+ }
+ }
- /* Re-use IVs when we can. */
+ /* Re-use IVs when we can. We are generating further vector
+ stmts by adding VF' * stride to the IVs generated above. */
if (ivn < nvects)
{
unsigned vfp
= least_common_multiple (group_size, const_nunits) / group_size;
- /* Generate [VF'*S, VF'*S, ... ]. */
- if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
- {
- expr = build_int_cst (integer_type_node, vfp);
- expr = fold_convert (TREE_TYPE (step_expr), expr);
- }
- else
- expr = build_int_cst (TREE_TYPE (step_expr), vfp);
- new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
- expr, step_expr);
- if (! CONSTANT_CLASS_P (new_name))
- new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
- TREE_TYPE (step_expr), NULL);
- new_vec = build_vector_from_val (step_vectype, new_name);
- vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
- step_vectype, NULL);
+ tree lupdate_mul
+ = build_vector_from_val (step_vectype,
+ SCALAR_FLOAT_TYPE_P (stept)
+ ? build_real_from_wide (stept,
+ vfp, UNSIGNED)
+ : build_int_cstu (stept, vfp));
for (; ivn < nvects; ++ivn)
{
gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
- tree def;
- if (gimple_code (iv) == GIMPLE_PHI)
- def = gimple_phi_result (iv);
- else
- def = gimple_assign_lhs (iv);
+ tree def = gimple_get_lhs (iv);
+ if (ivn < 2*nivs)
+ vec_steps[ivn - nivs]
+ = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+ vec_steps[ivn - nivs], lupdate_mul);
gimple_seq stmts = NULL;
def = gimple_convert (&stmts, step_vectype, def);
- def = gimple_build (&stmts,
- PLUS_EXPR, step_vectype, def, vec_step);
+ def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+ def, vec_steps[ivn % nivs]);
def = gimple_convert (&stmts, vectype, def);
if (gimple_code (iv) == GIMPLE_PHI)
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -7874,9 +8173,45 @@ vectorizable_induction (loop_vec_info loop_vinfo,
}
}
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+ gcc_assert (!new_bb);
+
return true;
}
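
Standalone model, not the GCC implementation: lane k of the concatenated SLP induction vectors is init[k % group_size] + (k / group_size) * step[k % group_size]. For group size 3 and 4 lanes per vector the program below prints exactly the [i0, i1, i2, i0 + S0] ... layout from the updated comment.

#include <stdio.h>

/* Print the symbolic lane values of the concatenated SLP induction
   vectors for group size 3, 4 lanes per vector, 3 vectors.  */
int
main (void)
{
  const int group_size = 3, nunits = 4, nvects = 3;

  for (int v = 0; v < nvects; v++)
    {
      printf ("vector %d:", v);
      for (int e = 0; e < nunits; e++)
        {
          int k = v * nunits + e;
          printf ("  i%d + %d*S%d", k % group_size, k / group_size,
                  k % group_size);
        }
      printf ("\n");
    }
  return 0;
}
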
+ init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+ loop_preheader_edge (iv_loop));
+
+ gimple_seq stmts = NULL;
+ if (!nested_in_vect_loop)
+ {
+ /* Convert the initial value to the IV update type. */
+ tree new_type = TREE_TYPE (step_expr);
+ init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+ /* If we are using the loop mask to "peel" for alignment then we need
+ to adjust the start value here. */
+ tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ if (skip_niters != NULL_TREE)
+ {
+ if (FLOAT_TYPE_P (vectype))
+ skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+ skip_niters);
+ else
+ skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+ tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+ skip_niters, step_expr);
+ init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+ init_expr, skip_step);
+ }
+ }
+
+ if (stmts)
+ {
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+
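
A small numeric illustration of the start-value adjustment just above, not GCC code: when loop masking is used to "peel" for alignment, the IV start is moved back by skip_niters steps and the lanes for the skipped iterations are masked off in the first vector iteration, so the first active lane still sees the original initial value.

#include <stdio.h>

/* init = 0, step = 1, VF = 4, skip_niters = 2: the adjusted start is
   init - skip_niters * step and the first skip_niters lanes are
   inactive, so lane 2 of the first vector iteration holds init.  */
int
main (void)
{
  int init = 0, step = 1, vf = 4, skip = 2;
  int start = init - skip * step;

  for (int lane = 0; lane < vf; lane++)
    printf ("lane %d: %d%s\n", lane, start + lane * step,
            lane < skip ? " (masked)" : "");
  return 0;
}
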
/* Create the vector that holds the initial_value of the induction. */
if (nested_in_vect_loop)
{
@@ -8376,8 +8711,19 @@ vectorizable_live_operation (vec_info *vinfo,
gimple_seq stmts = NULL;
new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
&stmts, true, NULL_TREE);
-
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ if (TREE_CODE (new_tree) == SSA_NAME
+ && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+ SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+ if (is_a <gphi *> (vec_stmt))
+ {
+ gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+ gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+ }
+ else
+ {
+ gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+ gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
+ }
/* Replace use of lhs with newly computed result. If the use stmt is a
single arg PHI, just replace all uses of PHI result. It's necessary
@@ -8414,6 +8760,24 @@ vectorizable_live_operation (vec_info *vinfo,
"def\n");
continue;
}
+ /* ??? It can also happen that we end up pulling a def into
+ a loop where replacing out-of-loop uses would require
+ a new LC SSA PHI node. Retain the original scalar in
+ those cases as well. PR98064. */
+ if (TREE_CODE (new_tree) == SSA_NAME
+ && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+ && (gimple_bb (use_stmt)->loop_father
+ != gimple_bb (vec_stmt)->loop_father)
+ && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+ gimple_bb (use_stmt)->loop_father))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Using original scalar computation for "
+ "live lane because there is an out-of-loop "
+ "definition for it\n");
+ continue;
+ }
FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
SET_USE (use_p, new_tree);
update_stmt (use_stmt);
@@ -8734,7 +9098,7 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
stmt_vec_info. */
-static void
+static bool
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
@@ -8750,7 +9114,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
if (!STMT_VINFO_RELEVANT_P (stmt_info)
&& !STMT_VINFO_LIVE_P (stmt_info))
- return;
+ return false;
if (STMT_VINFO_VECTYPE (stmt_info))
{
@@ -8767,13 +9131,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
/* Pure SLP statements have already been vectorized. We still need
to apply loop vectorization to hybrid SLP statements. */
if (PURE_SLP_STMT (stmt_info))
- return;
+ return false;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
*seen_store = stmt_info;
+
+ return true;
}
/* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -9199,17 +9565,17 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
}
stmt_vec_info pat_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info);
- vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
- &seen_store);
- maybe_set_vectorized_backedge_value (loop_vinfo,
- pat_stmt_info);
+ if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+ &si, &seen_store))
+ maybe_set_vectorized_backedge_value (loop_vinfo,
+ pat_stmt_info);
}
else
{
- vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
- &seen_store);
- maybe_set_vectorized_backedge_value (loop_vinfo,
- stmt_info);
+ if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+ &seen_store))
+ maybe_set_vectorized_backedge_value (loop_vinfo,
+ stmt_info);
}
}
gsi_next (&si);