author     Giuliano Belinassi <giuliano.belinassi@usp.br>  2020-08-22 17:43:43 -0300
committer  Giuliano Belinassi <giuliano.belinassi@usp.br>  2020-08-22 17:43:43 -0300
commit     a926878ddbd5a98b272c22171ce58663fc04c3e0 (patch)
tree       86af256e5d9a9c06263c00adc90e5fe348008c43 /gcc/tree-vect-loop.c
parent     542730f087133690b47e036dfd43eb0db8a650ce (diff)
parent     07cbaed8ba7d1b6e4ab3a9f44175502a4e1ecdb1 (diff)
Merge branch 'autopar_rebase2' into autopar_devel  (devel/autopar_devel)
Quickly commit changes in the rebase branch.
Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c | 1306
1 file changed, 773 insertions(+), 533 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c4c3cc9..dba230f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -342,6 +342,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
opt_result res
= vect_determine_vf_for_stmt (loop_vinfo,
@@ -800,7 +802,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorization_factor (0),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
- mask_compare_type (NULL_TREE),
+ rgroup_compare_type (NULL_TREE),
simd_if_cond (NULL_TREE),
unaligned_dr (NULL),
peeling_for_alignment (0),
@@ -812,8 +814,9 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vec_outside_cost (0),
vec_inside_cost (0),
vectorizable (false),
- can_fully_mask_p (true),
- fully_masked_p (false),
+ can_use_partial_vectors_p (true),
+ using_partial_vectors_p (false),
+ epil_using_partial_vectors_p (false),
peeling_for_gaps (false),
peeling_for_niter (false),
no_data_dependencies (false),
@@ -847,6 +850,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
{
gimple *stmt = gsi_stmt (si);
gimple_set_uid (stmt, 0);
+ if (is_gimple_debug (stmt))
+ continue;
add_stmt (stmt);
/* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
third argument is the #pragma omp simd if (x) condition, when 0,
@@ -874,16 +879,16 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
epilogue_vinfos.create (6);
}
-/* Free all levels of MASKS. */
+/* Free all levels of rgroup CONTROLS. */
void
-release_vec_loop_masks (vec_loop_masks *masks)
+release_vec_loop_controls (vec<rgroup_controls> *controls)
{
- rgroup_masks *rgm;
+ rgroup_controls *rgc;
unsigned int i;
- FOR_EACH_VEC_ELT (*masks, i, rgm)
- rgm->masks.release ();
- masks->release ();
+ FOR_EACH_VEC_ELT (*controls, i, rgc)
+ rgc->controls.release ();
+ controls->release ();
}
/* Free all memory used by the _loop_vec_info, as well as all the
@@ -893,7 +898,8 @@ _loop_vec_info::~_loop_vec_info ()
{
free (bbs);
- release_vec_loop_masks (&masks);
+ release_vec_loop_controls (&masks);
+ release_vec_loop_controls (&lens);
delete ivexpr_map;
delete scan_map;
epilogue_vinfos.release ();
@@ -934,12 +940,12 @@ cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
unsigned int i;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
- if (rgm->mask_type != NULL_TREE
+ if (rgm->type != NULL_TREE
&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
- cmp_type, rgm->mask_type,
+ cmp_type, rgm->type,
OPTIMIZE_FOR_SPEED))
return false;
return true;
@@ -953,20 +959,45 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
unsigned int res = 1;
unsigned int i;
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
res = MAX (res, rgm->max_nscalars_per_iter);
return res;
}
+/* Calculate the minimum precision necessary to represent:
+
+ MAX_NITERS * FACTOR
+
+ as an unsigned integer, where MAX_NITERS is the maximum number of
+ loop header iterations for the original scalar form of LOOP_VINFO. */
+
+static unsigned
+vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
+{
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+ /* Get the maximum number of iterations that is representable
+ in the counter type. */
+ tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+ widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+ /* Get a more refined estimate for the number of iterations. */
+ widest_int max_back_edges;
+ if (max_loop_iterations (loop, &max_back_edges))
+ max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+ /* Work out how many bits we need to represent the limit. */
+ return wi::min_precision (max_ni * factor, UNSIGNED);
+}
+
/* Each statement in LOOP_VINFO can be masked where necessary. Check
whether we can actually generate the masks required. Return true if so,
- storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
+ storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
- class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int min_ni_width;
unsigned int max_nscalars_per_iter
= vect_get_max_nscalars_per_iter (loop_vinfo);
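A worked example of vect_min_prec_for_max_niters above, with assumed numbers: if the niters counter is a 32-bit unsigned type and max_loop_iterations bounds the back-edge count by 999, then max_ni = min (2^32, 1000) = 1000; with FACTOR = 2 the limit is 2000 and the function returns 11, the smallest number of bits that holds 2000 unsigned. A minimal self-contained C model of that computation — an illustrative sketch, not the GCC wide-int implementation:

/* Hypothetical model of wi::min_precision (max_ni * factor, UNSIGNED)
   for values that fit in unsigned long long.  */
static unsigned
min_prec_model (unsigned long long max_ni, unsigned factor)
{
  unsigned long long limit = max_ni * factor;
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec ? prec : 1;	/* min_prec_model (1000, 2) == 11.  */
}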
@@ -977,27 +1008,15 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
return false;
- /* Get the maximum number of iterations that is representable
- in the counter type. */
- tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
- widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
-
- /* Get a more refined estimate for the number of iterations. */
- widest_int max_back_edges;
- if (max_loop_iterations (loop, &max_back_edges))
- max_ni = wi::smin (max_ni, max_back_edges + 1);
-
- /* Account for rgroup masks, in which each bit is replicated N times. */
- max_ni *= max_nscalars_per_iter;
-
/* Work out how many bits we need to represent the limit. */
- min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+ min_ni_width
+ = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
/* Find a scalar mode for which WHILE_ULT is supported. */
opt_scalar_int_mode cmp_mode_iter;
tree cmp_type = NULL_TREE;
tree iv_type = NULL_TREE;
- widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+ widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
unsigned int iv_precision = UINT_MAX;
if (iv_limit != -1)
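For reference, IFN_WHILE_ULT — the internal function whose support is probed here — produces a mask in which lane I is active iff BASE + I < LIMIT, roughly:

  mask[i] = (base + i < limit)   for each lane i

so CMP_TYPE must be a scalar integer type wide enough for the IV limit computed above. (A sketch of the documented while_ult semantics, not code from this patch.)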
@@ -1050,8 +1069,83 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
if (!cmp_type)
return false;
- LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
- LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+ return true;
+}
+
+/* Check whether we can use vector access with length based on precision
+ comparison. So far, to keep it simple, we only allow the case that the
+ precision of the target-supported length is larger than the precision
+ required by loop niters. */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+ if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ return false;
+
+ unsigned int max_nitems_per_iter = 1;
+ unsigned int i;
+ rgroup_controls *rgl;
+ /* Find the maximum number of items per iteration for every rgroup. */
+ FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+ {
+ unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+ max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+ }
+
+ /* Work out how many bits we need to represent the length limit. */
+ unsigned int min_ni_prec
+ = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+ /* Now use the maximum of the precisions below for one suitable IV type:
+ - the IV's natural precision
+ - the precision needed to hold: the maximum number of scalar
+ iterations multiplied by the scale factor (min_ni_prec above)
+ - the Pmode precision
+
+ If min_ni_prec is less than the precision of the current niters,
+ we prefer to still use the niters type. Prefer to use Pmode and
+ a wider IV to avoid narrow conversions. */
+
+ unsigned int ni_prec
+ = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+ min_ni_prec = MAX (min_ni_prec, ni_prec);
+ min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+ tree iv_type = NULL_TREE;
+ opt_scalar_int_mode tmode_iter;
+ FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+ {
+ scalar_mode tmode = tmode_iter.require ();
+ unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+ /* ??? Do we really want to construct one IV whose precision exceeds
+ BITS_PER_WORD? */
+ if (tbits > BITS_PER_WORD)
+ break;
+
+ /* Find the first available standard integral type. */
+ if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+ {
+ iv_type = build_nonstandard_integer_type (tbits, true);
+ break;
+ }
+ }
+
+ if (!iv_type)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize with length-based partial vectors"
+ " because there is no suitable iv type.\n");
+ return false;
+ }
+
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
return true;
}
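A worked example of the IV-type selection above, with assumed numbers: if the niters bound requires min_ni_prec = 20 bits, the niters type is 32 bits and Pmode is 64 bits, then min_ni_prec becomes max (20, 32, 64) = 64, and the mode walk picks the first supported scalar integer mode of at least 64 bits that does not exceed BITS_PER_WORD — on a typical 64-bit target DImode, giving a 64-bit unsigned IV type.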
@@ -1124,8 +1218,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
j, si)
(void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
- si->kind, si->stmt_info, si->misalign,
- vect_body);
+ si->kind, si->stmt_info, si->vectype,
+ si->misalign, vect_body);
unsigned dummy, body_cost = 0;
finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
destroy_cost_data (target_cost_data);
@@ -1393,6 +1487,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
stmt_info = vect_stmt_to_vectorize (stmt_info);
if ((STMT_VINFO_RELEVANT_P (stmt_info)
@@ -1554,7 +1650,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
&& ! PURE_SLP_STMT (stmt_info))
ok = vectorizable_induction (loop_vinfo,
- stmt_info, NULL, NULL, NULL,
+ stmt_info, NULL, NULL,
&cost_vec);
else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
|| (STMT_VINFO_DEF_TYPE (stmt_info)
@@ -1584,7 +1680,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
gsi_next (&si))
{
gimple *stmt = gsi_stmt (si);
- if (!gimple_clobber_p (stmt))
+ if (!gimple_clobber_p (stmt)
+ && !is_gimple_debug (stmt))
{
opt_result res
= vect_analyze_stmt (loop_vinfo,
@@ -1617,6 +1714,27 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
return opt_result::success ();
}
+/* Return true if we know that the iteration count is smaller than the
+ vectorization factor. Return false if it isn't, or if we can't be sure
+ either way. */
+
+static bool
+vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
+{
+ unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+
+ HOST_WIDE_INT max_niter;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
+ else
+ max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+ if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+ return true;
+
+ return false;
+}
+
/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
is worthwhile to vectorize. Return 1 if definitely yes, 0 if
definitely no, or -1 if it's worth retrying. */
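A worked example of vect_known_niters_smaller_than_vf above: with an assumed VF of 4, a loop whose iteration count is known to be 3 returns true; a loop whose count is unknown but bounded by 100 returns false, as does one with no known bound (max_niter == -1).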
@@ -1627,19 +1745,11 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
- /* Only fully-masked loops can have iteration counts less than the
- vectorization factor. */
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ /* Only loops that can handle partially-populated vectors can have iteration
+ counts less than the vectorization factor. */
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
- HOST_WIDE_INT max_niter;
-
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
- max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
- else
- max_niter = max_stmt_executions_int (loop);
-
- if (max_niter != -1
- && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+ if (vect_known_niters_smaller_than_vf (loop_vinfo))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1810,7 +1920,7 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
- vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+ vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
gcc_assert (DR_REF (dr));
@@ -1865,7 +1975,7 @@ determine_peel_for_niter (loop_vec_info loop_vinfo)
th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
(loop_vinfo));
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
/* The main loop handles all iterations. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -2049,9 +2159,13 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
/* Update the vectorization factor based on the SLP decision. */
vect_update_vf_for_slp (loop_vinfo);
+
+ /* Optimize the SLP graph with the vectorization factor fixed. */
+ vect_optimize_slp (loop_vinfo);
}
- bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+ bool saved_can_use_partial_vectors_p
+ = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
/* We don't expect to have to roll back to anything other than an empty
set of rgroups. */
@@ -2099,8 +2213,6 @@ start_over:
/* This pass will decide on using loop versioning and/or loop peeling in
order to enhance the alignment of data references in the loop. */
ok = vect_enhance_data_refs_alignment (loop_vinfo);
- else
- ok = vect_verify_datarefs_alignment (loop_vinfo);
if (!ok)
return ok;
@@ -2133,19 +2245,56 @@ start_over:
return ok;
}
- /* Decide whether to use a fully-masked loop for this vectorization
- factor. */
- LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
- = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
- && vect_verify_full_masking (loop_vinfo));
+ /* For now, we don't expect to mix the masking and length approaches for
+ one loop; disable partial vectors if both are recorded. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize a loop with partial vectors"
+ " because we don't expect to mix different"
+ " approaches with partial vectors for the"
+ " same loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+
+ /* Decide whether to vectorize a loop with partial vectors for
+ this vectorization factor. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ if (param_vect_partial_vector_usage == 0)
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ else if (vect_verify_full_masking (loop_vinfo)
+ || vect_verify_loop_lens (loop_vinfo))
+ {
+ /* The epilogue, and other cases where niters is known to be less than
+ VF, can still use vector accesses with length fully. */
+ if (param_vect_partial_vector_usage == 1
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo))
+ {
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
if (dump_enabled_p ())
{
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
dump_printf_loc (MSG_NOTE, vect_location,
- "using a fully-masked loop.\n");
+ "operating on partial vectors.\n");
else
dump_printf_loc (MSG_NOTE, vect_location,
- "not using a fully-masked loop.\n");
+ "operating only on full vectors.\n");
}
/* If epilog loop is required because of data accesses with gaps,
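The decision chain above can be read as a table keyed on param_vect_partial_vector_usage (the values below reflect how this hunk uses the parameter; see its documentation for the authoritative meaning):

  0  never operate on partial vectors;
  1  use full vectors in the main loop and mark the epilogue to use
     partial vectors, unless this loop already is an epilogue or its
     iteration count is known to be smaller than VF;
  otherwise  operate on partial vectors for this loop, provided
     vect_verify_full_masking or vect_verify_loop_lens succeeded.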
@@ -2153,7 +2302,7 @@ start_over:
enough iterations for vectorization. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
&& LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
@@ -2164,10 +2313,11 @@ start_over:
" support peeling for gaps.\n");
}
- /* If we're vectorizing an epilogue loop, we either need a fully-masked
- loop or a loop that has a lower VF than the main loop. */
+ /* If we're vectorizing an epilogue loop, the vectorized loop either needs
+ to be able to handle fewer than VF scalars, or needs to have a lower VF
+ than the main loop. */
if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
return opt_result::failure_at (vect_location,
@@ -2238,7 +2388,7 @@ start_over:
}
/* Niters for at least one iteration of vectorized loop. */
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
@@ -2342,6 +2492,8 @@ again:
for (gimple_stmt_iterator si = gsi_start_bb (bb);
!gsi_end_p (si); gsi_next (&si))
{
+ if (is_gimple_debug (gsi_stmt (si)))
+ continue;
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
STMT_SLP_TYPE (stmt_info) = loop_vect;
if (STMT_VINFO_IN_PATTERN_P (stmt_info))
@@ -2365,13 +2517,15 @@ again:
LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
/* Reset accumulated rgroup information. */
- release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ = saved_can_use_partial_vectors_p;
goto start_over;
}
@@ -2651,7 +2805,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
lowest_th = ordered_min (lowest_th, th);
}
else
- delete loop_vinfo;
+ {
+ delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
+ }
/* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2676,6 +2833,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
else
{
delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
if (fatal)
{
gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2683,6 +2841,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
}
}
+ /* Handle the case in which the original loop can use partial
+ vectorization, but we want to adopt it only for the epilogue.
+ The retry should be in the same mode as the original. */
+ if (vect_epilogues
+ && loop_vinfo
+ && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis with same vector mode"
+ " %s for epilogue with partial vectors.\n",
+ GET_MODE_NAME (loop_vinfo->vector_mode));
+ continue;
+ }
+
if (mode_i < vector_modes.length ()
&& VECTOR_MODE_P (autodetected_vector_mode)
&& (related_vector_mode (vector_modes[mode_i],
@@ -3299,42 +3474,58 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
return NULL;
}
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
- int *peel_iters_epilogue,
- stmt_vector_for_cost *scalar_cost_vec,
- stmt_vector_for_cost *prologue_cost_vec,
- stmt_vector_for_cost *epilogue_cost_vec)
+/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
+ PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
+ or -1 if not known. */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
{
- int retval = 0;
int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
- if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
{
- *peel_iters_epilogue = assumed_vf / 2;
if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
+ dump_printf_loc (MSG_NOTE, vect_location,
"cost model: epilogue peel iters set to vf/2 "
"because loop iterations are unknown .\n");
-
- /* If peeled iterations are known but number of scalar loop
- iterations are unknown, count a taken branch per peeled loop. */
- retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
- retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
- NULL, 0, vect_epilogue);
+ return assumed_vf / 2;
}
else
{
int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
- peel_iters_prologue = niters < peel_iters_prologue ?
- niters : peel_iters_prologue;
- *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
+ peel_iters_prologue = MIN (niters, peel_iters_prologue);
+ int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
/* If we need to peel for gaps, but no peeling is required, we have to
peel VF iterations. */
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
- *peel_iters_epilogue = assumed_vf;
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
+ peel_iters_epilogue = assumed_vf;
+ return peel_iters_epilogue;
+ }
+}
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+ int *peel_iters_epilogue,
+ stmt_vector_for_cost *scalar_cost_vec,
+ stmt_vector_for_cost *prologue_cost_vec,
+ stmt_vector_for_cost *epilogue_cost_vec)
+{
+ int retval = 0;
+
+ *peel_iters_epilogue
+ = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
+
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ if (peel_iters_prologue > 0)
+ retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_prologue);
+ if (*peel_iters_epilogue > 0)
+ retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_epilogue);
}
stmt_info_for_cost *si;
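A worked example of vect_get_peel_iters_epilogue above: with known niters = 10, peel_iters_prologue = 3 and an assumed VF of 4, the prologue is clamped to MIN (10, 3) = 3 and the epilogue peels (10 - 3) % 4 = 3 iterations; if PEELING_FOR_GAPS were set and that remainder were 0, a full VF (4) would be peeled instead. With unknown niters, or peel_iters_prologue == -1, the estimate falls back to assumed_vf / 2.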
@@ -3404,7 +3595,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
/* FIXME: Make cost depend on complexity of individual check. */
unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
(void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
"cost model: Adding cost of checks for loop "
@@ -3417,12 +3608,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
/* FIXME: Make cost depend on complexity of individual check. */
unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
(void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
if (len)
/* Count LEN - 1 ANDs and LEN comparisons. */
(void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
- scalar_stmt, NULL, 0, vect_prologue);
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
if (len)
{
@@ -3433,7 +3624,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
nstmts += 1;
(void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
- scalar_stmt, NULL, 0, vect_prologue);
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
@@ -3446,7 +3637,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
{
/* FIXME: Make cost depend on complexity of individual check. */
(void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
if (dump_enabled_p ())
dump_printf (MSG_NOTE,
"cost model: Adding cost of checks for loop "
@@ -3455,7 +3646,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
(void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
+ NULL, NULL_TREE, 0, vect_prologue);
/* Count statements in scalar loop. Using this as scalar cost for a single
iteration for now.
@@ -3477,30 +3668,116 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ bool prologue_need_br_taken_cost = false;
+ bool prologue_need_br_not_taken_cost = false;
+
+ /* Calculate peel_iters_prologue. */
+ if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ peel_iters_prologue = 0;
+ else if (npeel < 0)
{
- peel_iters_prologue = 0;
- peel_iters_epilogue = 0;
+ peel_iters_prologue = assumed_vf / 2;
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "cost model: "
+ "prologue peel iters set to vf/2.\n");
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- {
- /* We need to peel exactly one iteration. */
- peel_iters_epilogue += 1;
- stmt_info_for_cost *si;
- int j;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
- j, si)
- (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
- si->kind, si->stmt_info, si->misalign,
- vect_epilogue);
- }
+ /* If peeled iterations are unknown, count a taken branch and a not taken
+ branch per peeled loop. Even if scalar loop iterations are known,
+ vector iterations are not known since peeled prologue iterations are
+ not known. Hence guards remain the same. */
+ prologue_need_br_taken_cost = true;
+ prologue_need_br_not_taken_cost = true;
+ }
+ else
+ {
+ peel_iters_prologue = npeel;
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ prologue_need_br_taken_cost = true;
+ }
+
+ bool epilogue_need_br_taken_cost = false;
+ bool epilogue_need_br_not_taken_cost = false;
+
+ /* Calculate peel_iters_epilogue. */
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ /* We need to peel exactly one iteration for gaps. */
+ peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+ else if (npeel < 0)
+ {
+ /* If peeling for alignment is unknown, loop bound of main loop
+ becomes unknown. */
+ peel_iters_epilogue = assumed_vf / 2;
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "cost model: "
+ "epilogue peel iters set to vf/2 because "
+ "peeling for alignment is unknown.\n");
+
+ /* Same reasoning as for peel_iters_prologue above. */
+ epilogue_need_br_taken_cost = true;
+ epilogue_need_br_not_taken_cost = true;
+ }
+ else
+ {
+ peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ epilogue_need_br_taken_cost = true;
+ }
+
+ stmt_info_for_cost *si;
+ int j;
+ /* Add costs associated with peel_iters_prologue. */
+ if (peel_iters_prologue)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+ {
+ (void) add_stmt_cost (loop_vinfo, target_cost_data,
+ si->count * peel_iters_prologue, si->kind,
+ si->stmt_info, si->vectype, si->misalign,
+ vect_prologue);
+ }
+
+ /* Add costs associated with peel_iters_epilogue. */
+ if (peel_iters_epilogue)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+ {
+ (void) add_stmt_cost (loop_vinfo, target_cost_data,
+ si->count * peel_iters_epilogue, si->kind,
+ si->stmt_info, si->vectype, si->misalign,
+ vect_epilogue);
+ }
+
+ /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
+
+ if (prologue_need_br_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_prologue);
+
+ if (prologue_need_br_not_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+ cond_branch_not_taken, NULL, NULL_TREE, 0,
+ vect_prologue);
+
+ if (epilogue_need_br_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+ NULL, NULL_TREE, 0, vect_epilogue);
+
+ if (epilogue_need_br_not_taken_cost)
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+ cond_branch_not_taken, NULL, NULL_TREE, 0,
+ vect_epilogue);
+ /* Take care of special costs for rgroup controls of partial vectors. */
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
/* Calculate how many masks we need to generate. */
unsigned int num_masks = 0;
- rgroup_masks *rgm;
+ rgroup_controls *rgm;
unsigned int num_vectors_m1;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
- if (rgm->mask_type)
+ if (rgm->type)
num_masks += num_vectors_m1 + 1;
gcc_assert (num_masks > 0);
@@ -3516,86 +3793,62 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
simpler and safer to use the worst-case cost; if this ends up
being the tie-breaker between vectorizing or not, then it's
probably better not to vectorize. */
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, num_masks, vector_stmt,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, num_masks - 1, vector_stmt,
- NULL, 0, vect_body);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
+ vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
+ vector_stmt, NULL, NULL_TREE, 0, vect_body);
}
- else if (npeel < 0)
+ else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
{
- peel_iters_prologue = assumed_vf / 2;
- if (dump_enabled_p ())
- dump_printf (MSG_NOTE, "cost model: "
- "prologue peel iters set to vf/2.\n");
+ /* Referring to the functions vect_set_loop_condition_partial_vectors
+ and vect_set_loop_controls_directly, we need to generate each
+ length in the prologue and in the loop body if required. Although
+ there are some possible optimizations, we consider the worst case
+ here. */
- /* If peeling for alignment is unknown, loop bound of main loop becomes
- unknown. */
- peel_iters_epilogue = assumed_vf / 2;
- if (dump_enabled_p ())
- dump_printf (MSG_NOTE, "cost model: "
- "epilogue peel iters set to vf/2 because "
- "peeling for alignment is unknown.\n");
+ bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+ bool need_iterate_p
+ = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo));
- /* If peeled iterations are unknown, count a taken branch and a not taken
- branch per peeled loop. Even if scalar loop iterations are known,
- vector iterations are not known since peeled prologue iterations are
- not known. Hence guards remain the same. */
- (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, 1, cond_branch_not_taken,
- NULL, 0, vect_prologue);
- (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
- NULL, 0, vect_epilogue);
- (void) add_stmt_cost (loop_vinfo,
- target_cost_data, 1, cond_branch_not_taken,
- NULL, 0, vect_epilogue);
- stmt_info_for_cost *si;
- int j;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
- {
- (void) add_stmt_cost (loop_vinfo, target_cost_data,
- si->count * peel_iters_prologue,
- si->kind, si->stmt_info, si->misalign,
- vect_prologue);
- (void) add_stmt_cost (loop_vinfo, target_cost_data,
- si->count * peel_iters_epilogue,
- si->kind, si->stmt_info, si->misalign,
- vect_epilogue);
- }
- }
- else
- {
- stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
- stmt_info_for_cost *si;
- int j;
- void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+ /* Calculate how many statements need to be added. */
+ unsigned int prologue_stmts = 0;
+ unsigned int body_stmts = 0;
- prologue_cost_vec.create (2);
- epilogue_cost_vec.create (2);
- peel_iters_prologue = npeel;
+ rgroup_controls *rgc;
+ unsigned int num_vectors_m1;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+ if (rgc->type)
+ {
+ /* May need one SHIFT for nitems_total computation. */
+ unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+ if (nitems != 1 && !niters_known_p)
+ prologue_stmts += 1;
+
+ /* May need one MAX and one MINUS for wrap around. */
+ if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
+ prologue_stmts += 2;
- (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
- &peel_iters_epilogue,
- &LOOP_VINFO_SCALAR_ITERATION_COST
- (loop_vinfo),
- &prologue_cost_vec,
- &epilogue_cost_vec);
+ /* Need one MAX and one MINUS for each batch limit except for
+ the first one. */
+ prologue_stmts += num_vectors_m1 * 2;
- FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
- (void) add_stmt_cost (loop_vinfo,
- data, si->count, si->kind, si->stmt_info,
- si->misalign, vect_prologue);
+ unsigned int num_vectors = num_vectors_m1 + 1;
- FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
- (void) add_stmt_cost (loop_vinfo,
- data, si->count, si->kind, si->stmt_info,
- si->misalign, vect_epilogue);
+ /* Need to set up lengths in the prologue; only one MIN is required
+ for each since the start index is zero. */
+ prologue_stmts += num_vectors;
- prologue_cost_vec.release ();
- epilogue_cost_vec.release ();
+ /* Each may need two MINs and one MINUS to update lengths in body
+ for next iteration. */
+ if (need_iterate_p)
+ body_stmts += 3 * num_vectors;
+ }
+
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
+ scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
+ scalar_stmt, NULL, NULL_TREE, 0, vect_body);
}
/* FORNOW: The scalar outside cost is incremented in one of the
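Worked numbers for the rgroup-control costing above, under assumptions: a fully-masked loop whose mask rgroups at indices 0 and 1 both have a non-null type yields num_masks = 1 + 2 = 3, charging 3 vector_stmt costs to the prologue and 2 to the body. For the length-based branch, one rgroup with num_vectors = 2 and nitems = 4, unknown niters, an IV that might wrap, and need_iterate_p true gives prologue_stmts = 1 (SHIFT) + 2 (MAX/MINUS) + 2 (batch limits) + 2 (MINs) = 7 and body_stmts = 3 * 2 = 6.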
@@ -3731,8 +3984,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
}
/* ??? The "if" arm is written to handle all cases; see below for what
- we would do for !LOOP_VINFO_FULLY_MASKED_P. */
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Rewriting the condition above in terms of the number of
vector iterations (vniters) rather than the number of
@@ -3759,7 +4012,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
min_vec_niters);
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* Now that we know the minimum number of vector iterations,
find the minimum niters for which the scalar cost is larger:
@@ -3810,10 +4063,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
" Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& min_profitable_iters < (assumed_vf + peel_iters_prologue))
/* We want the vectorized loop to execute at least once. */
min_profitable_iters = assumed_vf + peel_iters_prologue;
+ else if (min_profitable_iters < peel_iters_prologue)
+ /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
+ vectorized loop executes at least once. */
+ min_profitable_iters = peel_iters_prologue;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -3831,7 +4088,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (vec_outside_cost <= 0)
min_profitable_estimate = 0;
- else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* This is a repeat of the code above, but with + SOC rather
than - SOC. */
@@ -3843,7 +4100,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
if (outside_overhead > 0)
min_vec_niters = outside_overhead / saving_per_viter + 1;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
int threshold = (vec_inside_cost * min_vec_niters
+ vec_outside_cost
@@ -4480,7 +4737,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
= as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
- stmt_vec_info prev_phi_info;
tree vectype;
machine_mode mode;
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
@@ -4488,7 +4744,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
tree scalar_dest;
tree scalar_type;
gimple *new_phi = NULL, *phi;
- stmt_vec_info phi_info;
gimple_stmt_iterator exit_gsi;
tree new_temp = NULL_TREE, new_name, new_scalar_dest;
gimple *epilog_stmt = NULL;
@@ -4511,7 +4766,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
tree induction_index = NULL_TREE;
if (slp_node)
- group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ group_size = SLP_TREE_LANES (slp_node);
if (nested_in_vect_loop_p (loop, stmt_info))
{
@@ -4558,15 +4813,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
}
else
{
+ stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
vec_num = 1;
- ncopies = 0;
- phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
- do
- {
- ncopies++;
- phi_info = STMT_VINFO_RELATED_STMT (phi_info);
- }
- while (phi_info);
+ ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
}
/* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
@@ -4588,7 +4837,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
{
if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
{
- gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
+ gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
ccompares.safe_push
(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
@@ -4639,7 +4888,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* Create a vector phi node. */
tree new_phi_tree = make_ssa_name (cr_index_vector_type);
new_phi = create_phi_node (new_phi_tree, loop->header);
- loop_vinfo->add_stmt (new_phi);
add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
loop_preheader_edge (loop), UNKNOWN_LOCATION);
@@ -4666,9 +4914,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
new_phi_tree, indx_before_incr);
}
gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
- stmt_vec_info index_vec_info
- = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
- STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
/* Update the phi with the vec cond. */
induction_index = new_phi_tree;
@@ -4709,29 +4954,26 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
if (double_reduc)
loop = outer_loop;
exit_bb = single_exit (loop)->dest;
- prev_phi_info = NULL;
new_phis.create (slp_node ? vec_num : ncopies);
for (unsigned i = 0; i < vec_num; i++)
{
if (slp_node)
- def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
+ def = vect_get_slp_vect_def (slp_node, i);
else
- def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
+ def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
for (j = 0; j < ncopies; j++)
{
tree new_def = copy_ssa_name (def);
phi = create_phi_node (new_def, exit_bb);
- stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
if (j == 0)
new_phis.quick_push (phi);
else
{
- def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
- STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
+ def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+ new_phis.quick_push (phi);
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
- prev_phi_info = phi_info;
}
}
@@ -4802,15 +5044,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* Likewise if we couldn't use a single defuse cycle. */
else if (ncopies > 1)
{
- gcc_assert (new_phis.length () == 1);
gimple_seq stmts = NULL;
tree first_vect = PHI_RESULT (new_phis[0]);
first_vect = gimple_convert (&stmts, vectype, first_vect);
- stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
for (int k = 1; k < ncopies; ++k)
{
- next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
- tree second_vect = PHI_RESULT (next_phi_info->stmt);
+ tree second_vect = PHI_RESULT (new_phis[k]);
second_vect = gimple_convert (&stmts, vectype, second_vect);
first_vect = gimple_build (&stmts, code, vectype,
first_vect, second_vect);
@@ -5454,10 +5693,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
if (nested_in_vect_loop)
{
- stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
- STMT_VINFO_RELATED_STMT (epilog_stmt_info)
- = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
-
if (!double_reduc)
scalar_results.quick_push (new_temp);
else
@@ -5674,7 +5909,7 @@ static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info,
gimple_stmt_iterator *gsi,
- stmt_vec_info *vec_stmt, slp_tree slp_node,
+ gimple **vec_stmt, slp_tree slp_node,
gimple *reduc_def_stmt,
tree_code code, internal_fn reduc_fn,
tree ops[3], tree vectype_in,
@@ -5682,7 +5917,6 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
- stmt_vec_info new_stmt_info = NULL;
internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
int ncopies;
@@ -5716,10 +5950,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
}
else
{
- tree loop_vec_def0 = vect_get_vec_def_for_operand (loop_vinfo,
- op0, stmt_info);
- vec_oprnds0.create (1);
- vec_oprnds0.quick_push (loop_vec_def0);
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ op0, &vec_oprnds0);
scalar_dest_def_info = stmt_info;
}
@@ -5795,22 +6027,24 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
if (i == vec_num - 1)
{
gimple_set_lhs (new_stmt, scalar_dest);
- new_stmt_info = vect_finish_replace_stmt (loop_vinfo,
- scalar_dest_def_info,
- new_stmt);
+ vect_finish_replace_stmt (loop_vinfo,
+ scalar_dest_def_info,
+ new_stmt);
}
else
- new_stmt_info = vect_finish_stmt_generation (loop_vinfo,
- scalar_dest_def_info,
- new_stmt, gsi);
+ vect_finish_stmt_generation (loop_vinfo,
+ scalar_dest_def_info,
+ new_stmt, gsi);
if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+ else
+ {
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ *vec_stmt = new_stmt;
+ }
}
- if (!slp_node)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
-
return true;
}
@@ -6180,17 +6414,35 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
The last use is the reduction variable. In case of nested cycle this
assumption is not true: we use reduc_index to record the index of the
reduction variable. */
- reduc_def = PHI_RESULT (reduc_def_phi);
+ /* ??? To get at invariant/constant uses on the SLP node we have to
+ get to it here; slp_node is still the reduction PHI. */
+ slp_tree slp_for_stmt_info = NULL;
+ if (slp_node)
+ {
+ slp_for_stmt_info = slp_node_instance->root;
+ /* And then there's reduction chain with a conversion ... */
+ if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
+ slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
+ gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
+ }
+ slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
+ /* We need to skip an extra operand for COND_EXPRs with embedded
+ comparison. */
+ unsigned opno_adjust = 0;
+ if (code == COND_EXPR
+ && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
+ opno_adjust = 1;
for (i = 0; i < op_type; i++)
{
- tree op = gimple_op (stmt, i + 1);
/* The condition of COND_EXPR is checked in vectorizable_condition(). */
if (i == 0 && code == COND_EXPR)
continue;
stmt_vec_info def_stmt_info;
enum vect_def_type dt;
- if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
+ tree op;
+ if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
+ i + opno_adjust, &op, &slp_op[i], &dt, &tem,
&def_stmt_info))
{
if (dump_enabled_p ())
@@ -6571,7 +6823,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
which each SLP statement has its own initial value and in which
that value needs to be repeated for every instance of the
statement within the initial vector. */
- unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+ unsigned int group_size = SLP_TREE_LANES (slp_node);
if (!neutral_op
&& !can_duplicate_and_interleave_p (loop_vinfo, group_size,
TREE_TYPE (vectype_out)))
@@ -6724,6 +6976,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
+ if (slp_node
+ && !(!single_defuse_cycle
+ && code != DOT_PROD_EXPR
+ && code != WIDEN_SUM_EXPR
+ && code != SAD_EXPR
+ && reduction_type != FOLD_LEFT_REDUCTION))
+ for (i = 0; i < op_type; i++)
+ if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
if (slp_node)
vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
else
@@ -6754,7 +7021,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
}
- else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
internal_fn cond_fn = get_conditional_internal_fn (code);
@@ -6767,9 +7034,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because no"
- " conditional operation is available.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (reduction_type == FOLD_LEFT_REDUCTION
&& reduc_fn == IFN_LAST
@@ -6779,9 +7046,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because no"
- " conditional operation is available.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
@@ -6796,13 +7063,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
bool
vect_transform_reduction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
- stmt_vec_info *vec_stmt, slp_tree slp_node)
+ gimple **vec_stmt, slp_tree slp_node)
{
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
int i;
int ncopies;
- int j;
int vec_num;
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -6858,8 +7124,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
/* Transform. */
- stmt_vec_info new_stmt_info = NULL;
- stmt_vec_info prev_stmt_info;
tree new_temp = NULL_TREE;
auto_vec<tree> vec_oprnds0;
auto_vec<tree> vec_oprnds1;
@@ -6894,139 +7158,83 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
tree scalar_dest = gimple_assign_lhs (stmt);
tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
- prev_stmt_info = NULL;
- if (!slp_node)
+ vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
+ single_defuse_cycle && reduc_index == 0
+ ? NULL_TREE : ops[0], &vec_oprnds0,
+ single_defuse_cycle && reduc_index == 1
+ ? NULL_TREE : ops[1], &vec_oprnds1,
+ op_type == ternary_op
+ && !(single_defuse_cycle && reduc_index == 2)
+ ? ops[2] : NULL_TREE, &vec_oprnds2);
+ if (single_defuse_cycle)
{
- vec_oprnds0.create (1);
- vec_oprnds1.create (1);
- if (op_type == ternary_op)
- vec_oprnds2.create (1);
+ gcc_assert (!slp_node);
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ ops[reduc_index],
+ reduc_index == 0 ? &vec_oprnds0
+ : (reduc_index == 1 ? &vec_oprnds1
+ : &vec_oprnds2));
}
- for (j = 0; j < ncopies; j++)
+ FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
{
- /* Handle uses. */
- if (j == 0)
- {
- if (slp_node)
- {
- /* Get vec defs for all the operands except the reduction index,
- ensuring the ordering of the ops in the vector is kept. */
- auto_vec<vec<tree>, 3> vec_defs;
- vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
- vec_oprnds0.safe_splice (vec_defs[0]);
- vec_defs[0].release ();
- vec_oprnds1.safe_splice (vec_defs[1]);
- vec_defs[1].release ();
- if (op_type == ternary_op)
- {
- vec_oprnds2.safe_splice (vec_defs[2]);
- vec_defs[2].release ();
- }
- }
- else
+ gimple *new_stmt;
+ tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+ if (masked_loop_p && !mask_by_cond_expr)
+ {
+ /* Make sure that the reduction accumulator is vop[0]. */
+ if (reduc_index == 1)
{
- vec_oprnds0.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[0], stmt_info));
- vec_oprnds1.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[1], stmt_info));
- if (op_type == ternary_op)
- vec_oprnds2.quick_push
- (vect_get_vec_def_for_operand (loop_vinfo, ops[2], stmt_info));
+ gcc_assert (commutative_tree_code (code));
+ std::swap (vop[0], vop[1]);
}
- }
+ tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+ vectype_in, i);
+ gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
+ vop[0], vop[1], vop[0]);
+ new_temp = make_ssa_name (vec_dest, call);
+ gimple_call_set_lhs (call, new_temp);
+ gimple_call_set_nothrow (call, true);
+ vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
+ new_stmt = call;
+ }
else
- {
- if (!slp_node)
- {
- gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
-
- if (single_defuse_cycle && reduc_index == 0)
- vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds0[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds0[0]);
- if (single_defuse_cycle && reduc_index == 1)
- vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds1[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds1[0]);
- if (op_type == ternary_op)
- {
- if (single_defuse_cycle && reduc_index == 2)
- vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
- else
- vec_oprnds2[0]
- = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_oprnds2[0]);
- }
- }
- }
+ {
+ if (op_type == ternary_op)
+ vop[2] = vec_oprnds2[i];
- FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
- {
- tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
- if (masked_loop_p && !mask_by_cond_expr)
+ if (masked_loop_p && mask_by_cond_expr)
{
- /* Make sure that the reduction accumulator is vop[0]. */
- if (reduc_index == 1)
- {
- gcc_assert (commutative_tree_code (code));
- std::swap (vop[0], vop[1]);
- }
tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
- vectype_in, i * ncopies + j);
- gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
- vop[0], vop[1],
- vop[0]);
- new_temp = make_ssa_name (vec_dest, call);
- gimple_call_set_lhs (call, new_temp);
- gimple_call_set_nothrow (call, true);
- new_stmt_info
- = vect_finish_stmt_generation (loop_vinfo,
- stmt_info, call, gsi);
- }
- else
- {
- if (op_type == ternary_op)
- vop[2] = vec_oprnds2[i];
-
- if (masked_loop_p && mask_by_cond_expr)
- {
- tree mask = vect_get_loop_mask (gsi, masks,
- vec_num * ncopies,
- vectype_in, i * ncopies + j);
- build_vect_cond_expr (code, vop, mask, gsi);
- }
-
- gassign *new_stmt = gimple_build_assign (vec_dest, code,
- vop[0], vop[1], vop[2]);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_assign_set_lhs (new_stmt, new_temp);
- new_stmt_info
- = vect_finish_stmt_generation (loop_vinfo,
- stmt_info, new_stmt, gsi);
+ vectype_in, i);
+ build_vect_cond_expr (code, vop, mask, gsi);
}
- if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
- }
-
- if (slp_node || single_defuse_cycle)
- continue;
+ new_stmt = gimple_build_assign (vec_dest, code,
+ vop[0], vop[1], vop[2]);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+ }
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+ if (slp_node)
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+ else if (single_defuse_cycle
+ && i < ncopies - 1)
+ {
+ if (reduc_index == 0)
+ vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
+ else if (reduc_index == 1)
+ vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
+ else if (reduc_index == 2)
+ vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
+ }
else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
-
- prev_stmt_info = new_stmt_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
- if (single_defuse_cycle && !slp_node)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
return true;
}
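For the fully-masked case above, the conditional internal-function call (cond_fn with operands mask, vop[0], vop[1], vop[0]) keeps the accumulator value in inactive lanes. An illustrative GIMPLE shape for a masked add reduction — assumed SSA names, not output from a real dump:

  vect_sum_1 = .COND_ADD (loop_mask_5, vect_sum_0, vect_data_3, vect_sum_0);

Active lanes compute vect_sum_0 + vect_data_3; inactive lanes pass vect_sum_0 through, which is what keeps the reduction correct when the final iteration operates on a partial vector.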
@@ -7035,14 +7243,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
- stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+ stmt_vec_info stmt_info, gimple **vec_stmt,
slp_tree slp_node, slp_instance slp_node_instance)
{
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
int i;
int ncopies;
- stmt_vec_info prev_phi_info;
int j;
bool nested_cycle = false;
int vec_num;
@@ -7072,9 +7279,8 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
if (slp_node)
{
/* The size vect_schedule_slp_instance computes is off for us. */
- vec_num = vect_get_num_vectors
- (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
+ vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ * SLP_TREE_LANES (slp_node), vectype_in);
ncopies = 1;
}
else
@@ -7134,14 +7340,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
}
vec_initial_def = build_vector_from_val (vectype_out, induc_val);
+ vec_initial_defs.create (ncopies);
+ for (i = 0; i < ncopies; ++i)
+ vec_initial_defs.quick_push (vec_initial_def);
}
else if (nested_cycle)
{
/* Do not use an adjustment def as that case is not supported
correctly if ncopies is not one. */
- vec_initial_def = vect_get_vec_def_for_operand (loop_vinfo,
- initial_def,
- reduc_stmt_info);
+ vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
+ ncopies, initial_def,
+ &vec_initial_defs);
}
else
{
@@ -7154,13 +7363,13 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
= get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
initial_def, adjustment_defp);
STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
+ vec_initial_defs.create (ncopies);
+ for (i = 0; i < ncopies; ++i)
+ vec_initial_defs.quick_push (vec_initial_def);
}
- vec_initial_defs.create (1);
- vec_initial_defs.quick_push (vec_initial_def);
}
/* Generate the reduction PHIs upfront. */
- prev_phi_info = NULL;
for (i = 0; i < vec_num; i++)
{
tree vec_init_def = vec_initial_defs[i];
@@ -7169,26 +7378,22 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
/* Create the reduction-phi that defines the reduction
operand. */
gphi *new_phi = create_phi_node (vec_dest, loop->header);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
/* Set the loop-entry arg of the reduction-phi. */
if (j != 0 && nested_cycle)
- vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
- vec_init_def);
+ vec_init_def = vec_initial_defs[j];
add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
/* The loop-latch arg is set in epilogue processing. */
if (slp_node)
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
else
{
if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
- else
- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
- prev_phi_info = new_phi_info;
+ *vec_stmt = new_phi;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
}
}
}
@@ -7200,7 +7405,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
- stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+ stmt_vec_info stmt_info, gimple **vec_stmt,
slp_tree slp_node)
{
if (!loop_vinfo
@@ -7223,43 +7428,22 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
basic_block bb = gimple_bb (stmt_info->stmt);
edge e = single_pred_edge (bb);
tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
- vec<tree> vec_oprnds = vNULL;
- vect_get_vec_defs (loop_vinfo,
- gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
- stmt_info, &vec_oprnds, NULL, slp_node);
- if (slp_node)
- {
- unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
- gcc_assert (vec_oprnds.length () == vec_num);
- for (unsigned i = 0; i < vec_num; i++)
- {
- /* Create the vectorized LC PHI node. */
- gphi *new_phi = create_phi_node (vec_dest, bb);
- add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
- }
- }
- else
- {
- unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
- stmt_vec_info prev_phi_info = NULL;
- for (unsigned i = 0; i < ncopies; i++)
- {
- if (i != 0)
- vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
- /* Create the vectorized LC PHI node. */
- gphi *new_phi = create_phi_node (vec_dest, bb);
- add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
- if (i == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
- else
- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
- prev_phi_info = new_phi_info;
- }
+ auto_vec<tree> vec_oprnds;
+ vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+ !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+ gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+ for (unsigned i = 0; i < vec_oprnds.length (); i++)
+ {
+ /* Create the vectorized LC PHI node. */
+ gphi *new_phi = create_phi_node (vec_dest, bb);
+ add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
+ if (slp_node)
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
}
- vec_oprnds.release ();
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
return true;
}
@@ -7315,8 +7499,7 @@ vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
bool
vectorizable_induction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info,
- gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
- stmt_vec_info *vec_stmt, slp_tree slp_node,
+ gimple **vec_stmt, slp_tree slp_node,
stmt_vector_for_cost *cost_vec)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -7336,11 +7519,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
unsigned i;
tree expr;
gimple_seq stmts;
- imm_use_iterator imm_iter;
- use_operand_p use_p;
- gimple *exit_phi;
- edge latch_e;
- tree loop_arg;
gimple_stmt_iterator si;
gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7448,9 +7626,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
- latch_e = loop_latch_edge (iv_loop);
- loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-
step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
gcc_assert (step_expr != NULL_TREE);
tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
@@ -7520,10 +7695,16 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_vec, step_vectype, NULL);
/* Now generate the IVs. */
- unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ unsigned group_size = SLP_TREE_LANES (slp_node);
unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
unsigned elts = const_nunits * nvects;
- unsigned nivs = least_common_multiple (group_size,
+ /* Compute the number of distinct IVs we need.  First reduce
+ group_size if it is a multiple of const_nunits so that, for
+ example, a group_size of 4 with const_nunits 2 needs only one IV.  */
+ unsigned group_sizep = group_size;
+ if (group_sizep % const_nunits == 0)
+ group_sizep = group_sizep / const_nunits;
+ unsigned nivs = least_common_multiple (group_sizep,
const_nunits) / const_nunits;
gcc_assert (elts % group_size == 0);
tree elt = init_expr;
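The distinct-IV computation above can be checked with a minimal standalone sketch (plain C++17, not GCC internals): reduce the group size when it spans whole vectors, then take the least common multiple with the vector element count.

    #include <numeric>
    #include <cassert>

    /* Number of distinct IVs needed for GROUP_SIZE lanes with
       CONST_NUNITS elements per vector, mirroring the logic above.  */
    static unsigned
    distinct_ivs (unsigned group_size, unsigned const_nunits)
    {
      unsigned group_sizep = group_size;
      if (group_sizep % const_nunits == 0)
        group_sizep /= const_nunits;
      return std::lcm (group_sizep, const_nunits) / const_nunits;
    }

    int
    main ()
    {
      assert (distinct_ivs (4, 2) == 1); /* whole-vector group: one IV */
      assert (distinct_ivs (3, 4) == 3); /* lcm (3, 4) / 4 */
      return 0;
    }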
@@ -7551,8 +7732,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* Create the induction-phi that defines the induction-operand. */
vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
induction_phi = create_phi_node (vec_dest, iv_loop->header);
- stmt_vec_info induction_phi_info
- = loop_vinfo->add_stmt (induction_phi);
induc_def = PHI_RESULT (induction_phi);
/* Create the iv update inside the loop */
@@ -7561,7 +7740,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_build (&stmts,
PLUS_EXPR, step_vectype, vec_def, vec_step);
vec_def = gimple_convert (&stmts, vectype, vec_def);
- loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
/* Set the arguments of the phi node: */
@@ -7569,8 +7747,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
- SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
}
+ /* Fill up to the number of vectors we need for the whole group. */
+ nivs = least_common_multiple (group_size,
+ const_nunits) / const_nunits;
+ for (; ivn < nivs; ++ivn)
+ SLP_TREE_VEC_STMTS (slp_node)
+ .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
/* Re-use IVs when we can. */
if (ivn < nvects)
@@ -7595,7 +7779,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
step_vectype, NULL);
for (; ivn < nvects; ++ivn)
{
- gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
+ gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
tree def;
if (gimple_code (iv) == GIMPLE_PHI)
def = gimple_phi_result (iv);
@@ -7613,8 +7797,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
}
- SLP_TREE_VEC_STMTS (slp_node).quick_push
- (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
+ SLP_TREE_VEC_STMTS (slp_node)
+ .quick_push (SSA_NAME_DEF_STMT (def));
}
}
@@ -7627,8 +7811,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* iv_loop is nested in the loop to be vectorized. init_expr had already
been created during vectorization of previous stmts. We obtain it
from the STMT_VINFO_VEC_STMT of the defining stmt. */
- vec_init = vect_get_vec_def_for_operand (loop_vinfo,
- init_expr, stmt_info);
+ auto_vec<tree> vec_inits;
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+ init_expr, &vec_inits);
+ vec_init = vec_inits[0];
/* If the initial value is not of proper type, convert it. */
if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
{
@@ -7643,7 +7829,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
new_stmt);
gcc_assert (!new_bb);
- loop_vinfo->add_stmt (new_stmt);
}
}
else
@@ -7749,7 +7934,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
/* Create the induction-phi that defines the induction-operand. */
vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
induction_phi = create_phi_node (vec_dest, iv_loop->header);
- stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
induc_def = PHI_RESULT (induction_phi);
/* Create the iv update inside the loop */
@@ -7759,14 +7943,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
vec_def = gimple_convert (&stmts, vectype, vec_def);
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
new_stmt = SSA_NAME_DEF_STMT (vec_def);
- stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
/* Set the arguments of the phi node: */
add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+ *vec_stmt = induction_phi;
/* In case that vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
@@ -7777,7 +7961,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
if (ncopies > 1)
{
gimple_seq seq = NULL;
- stmt_vec_info prev_stmt_vinfo;
/* FORNOW. This restriction should be relaxed. */
gcc_assert (!nested_in_vect_loop);
@@ -7805,7 +7988,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
new_vec, step_vectype, NULL);
vec_def = induc_def;
- prev_stmt_vinfo = induction_phi_info;
for (i = 1; i < ncopies; i++)
{
/* vec_i = vec_prev + vec_step */
@@ -7817,46 +7999,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
new_stmt = SSA_NAME_DEF_STMT (vec_def);
- new_stmt_info = loop_vinfo->add_stmt (new_stmt);
- STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
- prev_stmt_vinfo = new_stmt_info;
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
}
- if (nested_in_vect_loop)
- {
- /* Find the loop-closed exit-phi of the induction, and record
- the final vector of induction results: */
- exit_phi = NULL;
- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
- {
- gimple *use_stmt = USE_STMT (use_p);
- if (is_gimple_debug (use_stmt))
- continue;
-
- if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
- {
- exit_phi = use_stmt;
- break;
- }
- }
- if (exit_phi)
- {
- stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
- /* FORNOW. Currently not supporting the case that an inner-loop induction
- is not used in the outer-loop (i.e. only outside the outer-loop). */
- gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
- && !STMT_VINFO_LIVE_P (stmt_vinfo));
-
- STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "vector of inductions after inner-loop:%G",
- new_stmt);
- }
- }
-
-
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"transform induction: created def-use cycle: %G%G",
@@ -7908,6 +8054,10 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
all involved stmts together. */
else if (slp_index != 0)
return true;
+ else
+ /* For SLP reductions the meta-info is attached to
+ the representative. */
+ stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
}
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
gcc_assert (reduc_info->is_reduc_info);
@@ -7945,7 +8095,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
{
gcc_assert (slp_index >= 0);
- int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ int num_scalar = SLP_TREE_LANES (slp_node);
int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
/* Get the last occurrence of the scalar index from the concatenation of
@@ -7968,33 +8118,34 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
if (!vec_stmt_p)
{
/* No transformation required. */
- if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
OPTIMIZE_FOR_SPEED))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because "
- "the target doesn't support extract last "
- "reduction.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because the target doesn't support extract "
+ "last reduction.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (slp_node)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because an "
- "SLP statement is live after the loop.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because an SLP statement is live after "
+ "the loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (ncopies > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because"
- " ncopies is greater than 1.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ "can't operate on partial vectors "
+ "because ncopies is greater than 1.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
{
@@ -8014,9 +8165,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
: gimple_get_lhs (stmt);
lhs_type = TREE_TYPE (lhs);
- bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
- ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
- : TYPE_SIZE (TREE_TYPE (vectype)));
+ bitsize = vector_element_bits_tree (vectype);
vec_bitsize = TYPE_SIZE (vectype);
/* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
@@ -8026,7 +8175,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
/* Get the correct slp vectorized stmt. */
- gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
+ gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
vec_lhs = gimple_phi_result (phi);
else
@@ -8038,14 +8187,8 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
}
else
{
- enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
- vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
- gcc_checking_assert (ncopies == 1
- || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
-
/* For multiple copies, get the last copy. */
- for (int i = 1; i < ncopies; ++i)
- vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
+ vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
/* Get the last lane in the vector. */
bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
@@ -8250,7 +8393,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
gcc_assert (nvectors != 0);
if (masks->length () < nvectors)
masks->safe_grow_cleared (nvectors);
- rgroup_masks *rgm = &(*masks)[nvectors - 1];
+ rgroup_controls *rgm = &(*masks)[nvectors - 1];
/* The number of scalars per iteration and the number of vectors are
both compile-time constants. */
unsigned int nscalars_per_iter
@@ -8266,7 +8409,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
if (rgm->max_nscalars_per_iter < nscalars_per_iter)
{
rgm->max_nscalars_per_iter = nscalars_per_iter;
- rgm->mask_type = truth_type_for (vectype);
+ rgm->type = truth_type_for (vectype);
+ rgm->factor = 1;
}
}
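The rename from rgroup_masks to rgroup_controls reflects that the same bookkeeping now serves both masks and lengths. A simplified standalone model of the fields this file touches (hypothetical stand-in types; GCC's real structure stores tree nodes) could be:

    #include <vector>

    /* Hypothetical, simplified stand-in for GCC's rgroup_controls;
       the real structure stores 'tree' nodes, not ints.  */
    struct rgroup_controls_model
    {
      /* Largest number of scalars per iteration among the statements
         that use this rgroup.  */
      unsigned max_nscalars_per_iter = 0;
      /* For masks: the truth type.  For lengths: the vector type the
         length is measured against (modelled as an id here).  */
      int type = 0;
      /* For lengths: subelements per vector element (1 unless loads
         and stores fall back to a byte-element mode).  */
      unsigned factor = 0;
      /* The mask or length SSA names themselves, created lazily
         (modelled as ids here).  */
      std::vector<int> controls;
    };

    int
    main ()
    {
      rgroup_controls_model rgc;
      rgc.max_nscalars_per_iter = 2;
      rgc.factor = 1;
      return rgc.controls.size (); /* empty until lazily populated */
    }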
@@ -8281,24 +8425,24 @@ tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
unsigned int nvectors, tree vectype, unsigned int index)
{
- rgroup_masks *rgm = &(*masks)[nvectors - 1];
- tree mask_type = rgm->mask_type;
+ rgroup_controls *rgm = &(*masks)[nvectors - 1];
+ tree mask_type = rgm->type;
/* Populate the rgroup's mask array, if this is the first time we've
used it. */
- if (rgm->masks.is_empty ())
+ if (rgm->controls.is_empty ())
{
- rgm->masks.safe_grow_cleared (nvectors);
+ rgm->controls.safe_grow_cleared (nvectors);
for (unsigned int i = 0; i < nvectors; ++i)
{
tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
/* Provide a dummy definition until the real one is available. */
SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
- rgm->masks[i] = mask;
+ rgm->controls[i] = mask;
}
}
- tree mask = rgm->masks[index];
+ tree mask = rgm->controls[index];
if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
TYPE_VECTOR_SUBPARTS (vectype)))
{
@@ -8319,6 +8463,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
return mask;
}
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+ lengths for controlling an operation on VECTYPE. The operation splits
+ each element of VECTYPE into FACTOR separate subelements, measuring the
+ length as a number of these subelements. */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, tree vectype, unsigned int factor)
+{
+ gcc_assert (nvectors != 0);
+ if (lens->length () < nvectors)
+ lens->safe_grow_cleared (nvectors);
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* The number of scalars per iteration, the bytes each scalar
+ occupies and the number of vectors are all compile-time
+ constants.  */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+ if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ /* For now, we only support cases in which all loads and stores fall back
+ to VnQI or none do. */
+ gcc_assert (!rgl->max_nscalars_per_iter
+ || (rgl->factor == 1 && factor == 1)
+ || (rgl->max_nscalars_per_iter * rgl->factor
+ == nscalars_per_iter * factor));
+ rgl->max_nscalars_per_iter = nscalars_per_iter;
+ rgl->type = vectype;
+ rgl->factor = factor;
+ }
+}
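The nscalars_per_iter calculation is the same exact division already used for masks: the total number of vector elements in the rgroup divided by the vectorization factor. A quick standalone check (constant values only; GCC works with poly_ints and exact_div):

    #include <cassert>

    static unsigned
    nscalars_per_iter (unsigned nvectors, unsigned nunits, unsigned vf)
    {
      /* GCC enforces exact divisibility via exact_div.  */
      assert ((nvectors * nunits) % vf == 0);
      return nvectors * nunits / vf;
    }

    int
    main ()
    {
      /* Two 4-element vectors per iteration with VF 4: two scalars
         per scalar iteration.  */
      assert (nscalars_per_iter (2, 4, 4) == 2);
      return 0;
    }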
+
+/* Given a complete set of lengths LENS, extract length number INDEX for an
+ rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, unsigned int index)
+{
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* Populate the rgroup's len array, if this is the first time we've
+ used it. */
+ if (rgl->controls.is_empty ())
+ {
+ rgl->controls.safe_grow_cleared (nvectors);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ gcc_assert (len_type != NULL_TREE);
+ tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+ rgl->controls[i] = len;
+ }
+ }
+
+ return rgl->controls[index];
+}
+
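vect_get_loop_len mirrors vect_get_loop_mask's lazy-population pattern: placeholder SSA names are handed out on first use and given their real definitions later, when the loop control code is generated. A standalone sketch of that pattern (hypothetical names, strings standing in for SSA names):

    #include <string>
    #include <vector>

    /* Hand out stable placeholder names, creating the whole level on
       first access; definitions are filled in by a later phase.  */
    static std::string
    get_control (std::vector<std::string> &controls, unsigned nvectors,
                 unsigned index, const char *prefix)
    {
      if (controls.empty ())
        for (unsigned i = 0; i < nvectors; ++i)
          controls.push_back (std::string (prefix) + "_"
                              + std::to_string (i));
      return controls[index];
    }

    int
    main ()
    {
      std::vector<std::string> lens;
      return get_control (lens, 2, 1, "loop_len") == "loop_len_1" ? 0 : 1;
    }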
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */
@@ -8482,6 +8689,8 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
!gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
{
new_stmt = gsi_stmt (epilogue_gsi);
+ if (is_gimple_debug (new_stmt))
+ continue;
gcc_assert (gimple_uid (new_stmt) > 0);
stmt_vinfo
@@ -8546,7 +8755,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
}
struct data_reference *dr;
- vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
+ vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
orig_stmt = DR_STMT (dr);
@@ -8678,7 +8887,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
if (niters_vector == NULL_TREE)
{
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
&& known_eq (lowest_vf, vf))
{
niters_vector
@@ -8686,9 +8895,15 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
step_vector = build_one_cst (TREE_TYPE (niters));
}
- else
+ else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
&step_vector, niters_no_overflow);
+ else
+ /* vect_do_peeling subtracted the number of peeled prologue
+ iterations from LOOP_VINFO_NITERS. */
+ vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
+ &niters_vector, &step_vector,
+ niters_no_overflow);
}
/* 1) Make sure the loop header has exactly two entries
@@ -8771,6 +8986,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
}
else
{
+ /* Ignore vector stmts created in the outer loop. */
stmt_info = loop_vinfo->lookup_stmt (stmt);
/* vector stmts created in the outer-loop during vectorization of
@@ -8846,7 +9062,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
/* True if the final iteration might not handle a full vector's
worth of scalar iterations. */
- bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+ bool final_iter_may_be_partial
+ = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
/* The minimum number of iterations performed by the epilogue. This
is 1 when peeling for gaps because we always need a final scalar
iteration. */
@@ -8857,7 +9074,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
int bias_for_lowest = 1 - min_epilogue_iters;
int bias_for_assumed = bias_for_lowest;
int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
{
/* When the amount of peeling is known at compile time, the first
iteration will have exactly alignment_npeels active elements.
@@ -9149,12 +9366,13 @@ optimize_mask_stores (class loop *loop)
}
/* Decide whether it is possible to use a zero-based induction variable
- when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
- return the value that the induction variable must be able to hold
- in order to ensure that the loop ends with an all-false mask.
+ when vectorizing LOOP_VINFO with partial vectors. If it is, return
+ the value that the induction variable must be able to hold in order
+ to ensure that the rgroups eventually have no active vector elements.
Return -1 otherwise. */
+
widest_int
-vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -9189,3 +9407,25 @@ vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
return iv_limit;
}
+/* For the given rgroup_controls RGC, check whether an induction variable
+ would ever hit a value that produces a set of all-false masks or zero
+ lengths before wrapping around.  Return true if it's possible to wrap
+ around before hitting the desired value, otherwise return false.  */
+
+bool
+vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
+{
+ widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+
+ if (iv_limit == -1)
+ return true;
+
+ tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ unsigned int compare_precision = TYPE_PRECISION (compare_type);
+ unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+
+ if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
+ return true;
+
+ return false;
+}
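The wrap check reduces to bit arithmetic: the IV might wrap if representing iv_limit * nitems takes more bits than the compare type provides. A standalone sketch (64-bit math in place of GCC's wide ints):

    #include <cassert>
    #include <cstdint>

    /* Bits needed to represent VALUE as an unsigned number,
       like wi::min_precision (..., UNSIGNED).  */
    static unsigned
    min_precision_u (uint64_t value)
    {
      unsigned bits = 0;
      for (; value; value >>= 1)
        ++bits;
      return bits;
    }

    static bool
    rgroup_iv_might_wrap_p (uint64_t iv_limit, unsigned nitems,
                            unsigned compare_precision)
    {
      return min_precision_u (iv_limit * nitems) > compare_precision;
    }

    int
    main ()
    {
      /* 2^31 iterations times 4 items needs 34 bits, so a 32-bit
         compare type could wrap.  */
      assert (rgroup_iv_might_wrap_p (uint64_t (1) << 31, 4, 32));
      assert (!rgroup_iv_might_wrap_p (1000, 4, 32));
      return 0;
    }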