Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c  325
1 file changed, 305 insertions(+), 20 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index d679115..5deb780 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1121,12 +1121,15 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
versioning_threshold (0),
vectorization_factor (0),
max_vectorization_factor (0),
+ mask_compare_type (NULL_TREE),
unaligned_dr (NULL),
peeling_for_alignment (0),
ptr_mask (0),
slp_unrolling_factor (1),
single_scalar_iteration_cost (0),
vectorizable (false),
+ can_fully_mask_p (true),
+ fully_masked_p (false),
peeling_for_gaps (false),
peeling_for_niter (false),
operands_swapped (false),
@@ -1168,6 +1171,17 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
gcc_assert (nbbs == loop->num_nodes);
}
+/* Free all levels of MASKS. */
+
+void
+release_vec_loop_masks (vec_loop_masks *masks)
+{
+ rgroup_masks *rgm;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (*masks, i, rgm)
+ rgm->masks.release ();
+ masks->release ();
+}
/* Free all memory used by the _loop_vec_info, as well as all the
stmt_vec_info structs of all the stmts in the loop. */
@@ -1233,9 +1247,98 @@ _loop_vec_info::~_loop_vec_info ()
free (bbs);
+ release_vec_loop_masks (&masks);
+
loop->aux = NULL;
}
+/* Return true if we can use CMP_TYPE as the comparison type to produce
+ all masks required to mask LOOP_VINFO. */
+
+static bool
+can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
+{
+ rgroup_masks *rgm;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+ if (rgm->mask_type != NULL_TREE
+ && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
+ cmp_type, rgm->mask_type,
+ OPTIMIZE_FOR_SPEED))
+ return false;
+ return true;
+}
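
As an illustrative aside (not part of the patch), the semantics that IFN_WHILE_ULT is queried for here can be modelled in scalar C: element I of the resulting mask is set iff the base index plus I is below the limit, compared as unsigned values of CMP_TYPE.

/* Scalar model of WHILE_ULT (illustrative sketch only).  */
static void
while_ult_model (unsigned long base, unsigned long limit,
                 unsigned int nelts, unsigned char *mask)
{
  for (unsigned int i = 0; i < nelts; ++i)
    mask[i] = (base + i < limit) ? 1 : 0;
}
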
+
+/* Calculate the maximum number of scalars per iteration for every
+ rgroup in LOOP_VINFO. */
+
+static unsigned int
+vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
+{
+ unsigned int res = 1;
+ unsigned int i;
+ rgroup_masks *rgm;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+ res = MAX (res, rgm->max_nscalars_per_iter);
+ return res;
+}
+
+/* Each statement in LOOP_VINFO can be masked where necessary. Check
+ whether we can actually generate the masks required. Return true if so,
+ storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
+
+static bool
+vect_verify_full_masking (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ unsigned int min_ni_width;
+
+ /* Get the maximum number of iterations that is representable
+ in the counter type. */
+ tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+ widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+ /* Get a more refined estimate for the number of iterations. */
+ widest_int max_back_edges;
+ if (max_loop_iterations (loop, &max_back_edges))
+ max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+ /* Account for rgroup masks, in which each bit is replicated N times. */
+ max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+
+ /* Work out how many bits we need to represent the limit. */
+ min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+
+ /* Find a scalar mode for which WHILE_ULT is supported. */
+ opt_scalar_int_mode cmp_mode_iter;
+ tree cmp_type = NULL_TREE;
+ FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
+ {
+ unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+ if (cmp_bits >= min_ni_width
+ && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
+ {
+ tree this_type = build_nonstandard_integer_type (cmp_bits, true);
+ if (this_type
+ && can_produce_all_loop_masks_p (loop_vinfo, this_type))
+ {
+ /* Although we could stop as soon as we find a valid mode,
+ it's often better to continue until we hit Pmode, since the
+ operands to the WHILE are more likely to be reusable in
+ address calculations. */
+ cmp_type = this_type;
+ if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
+ break;
+ }
+ }
+ }
+
+ if (!cmp_type)
+ return false;
+
+ LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+ return true;
+}
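
As a worked example of the width computation above (made-up numbers, not part of the patch): if the loop is known to take at most 999 back edges (1000 iterations) and the widest rgroup uses 2 scalars per iteration, the WHILE_ULT comparison must count to 2000, which needs 11 bits; the mode loop would then start from the narrowest supported integer mode of at least 11 bits but keep widening up to Pmode while WHILE_ULT remains supported.

#include <assert.h>

int main (void)
{
  unsigned long long max_back_edges = 999;
  unsigned long long max_ni = (max_back_edges + 1) * 2;   /* rgroup replication */
  unsigned int min_ni_width = 0;
  while ((1ULL << min_ni_width) <= max_ni)                /* min_precision, UNSIGNED */
    min_ni_width++;
  assert (min_ni_width == 11);
  return 0;
}
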
/* Calculate the cost of one scalar iteration of the loop. */
static void
@@ -1980,6 +2083,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
vect_update_vf_for_slp (loop_vinfo);
}
+ bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+
+ /* We don't expect to have to roll back to anything other than an empty
+ set of rgroups. */
+ gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
@@ -2068,11 +2177,47 @@ start_over:
return false;
}
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ {
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because peeling for"
+ " gaps is required.\n");
+ }
+
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+ {
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because peeling for"
+ " alignment is required.\n");
+ }
+
+ /* Decide whether to use a fully-masked loop for this vectorization
+ factor. */
+ LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+ && vect_verify_full_masking (loop_vinfo));
+ if (dump_enabled_p ())
+ {
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "using a fully-masked loop.\n");
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "not using a fully-masked loop.\n");
+ }
+
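
To make the decision above concrete, here is a scalar sketch of what a fully-masked loop does (illustrative only, assuming a 4-lane vector): the per-lane condition plays the role of the loop mask, so the final, partial vector iteration is handled by the main loop itself and no prologue or epilogue peeling is needed.

#include <stddef.h>

/* Scalar model of a fully-masked vector loop (illustrative sketch).  */
static void
masked_add (int *dst, const int *src, size_t n)
{
  const size_t vf = 4;                  /* assumed number of lanes */
  for (size_t i = 0; i < n; i += vf)
    for (size_t lane = 0; lane < vf; ++lane)
      if (i + lane < n)                 /* the lane's loop-mask bit */
        dst[i + lane] += src[i + lane];
}
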
/* If epilog loop is required because of data accesses with gaps,
one additional iteration needs to be peeled. Check if there is
enough iterations for vectorization. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
@@ -2153,8 +2298,11 @@ start_over:
th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
unsigned HOST_WIDE_INT const_vf;
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ /* The main loop handles all iterations. */
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+ else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
{
if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
- LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
@@ -2212,7 +2360,8 @@ start_over:
niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
/* Niters for at least one iteration of vectorized loop. */
- niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
niters_th += 1;
@@ -2315,11 +2464,14 @@ again:
destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
+ /* Reset accumulated rgroup information. */
+ release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
goto start_over;
}
@@ -3523,7 +3675,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
= LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
/* Add additional cost for the peeled instructions in prologue and epilogue
- loop.
+ loop. (For fully-masked loops there will be no peeling.)
FORNOW: If we don't know the value of peel_iters for prologue or epilogue
at compile-time - we assume it's vf/2 (the worst would be vf-1).
@@ -3531,7 +3683,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- if (npeel < 0)
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ peel_iters_prologue = 0;
+ peel_iters_epilogue = 0;
+ }
+ else if (npeel < 0)
{
peel_iters_prologue = assumed_vf / 2;
dump_printf (MSG_NOTE, "cost model: "
@@ -3762,8 +3919,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
" Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- /* We want the vectorized loop to execute at least once. */
- if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && min_profitable_iters < (assumed_vf + peel_iters_prologue))
+ /* We want the vectorized loop to execute at least once. */
min_profitable_iters = assumed_vf + peel_iters_prologue;
if (dump_enabled_p ())
@@ -6737,6 +6895,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
if (!vec_stmt) /* transformation not required. */
{
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop due to "
+ "reduction operation.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ }
+
if (first_p)
vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
@@ -7557,8 +7724,19 @@ vectorizable_live_operation (gimple *stmt,
}
if (!vec_stmt)
- /* No transformation required. */
- return true;
+ {
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because "
+ "a value is live outside the loop.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ }
+
+ /* No transformation required. */
+ return true;
+ }
/* If stmt has a related stmt, then use that for getting the lhs. */
if (is_pattern_stmt_p (stmt_info))
@@ -7573,6 +7751,8 @@ vectorizable_live_operation (gimple *stmt,
: TYPE_SIZE (TREE_TYPE (vectype)));
vec_bitsize = TYPE_SIZE (vectype);
+ gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
/* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
tree vec_lhs, bitstart;
if (slp_node)
@@ -7706,6 +7886,97 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo)
return false;
}
+/* Return a mask type with half the number of elements as TYPE. */
+
+tree
+vect_halve_mask_nunits (tree type)
+{
+ poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
+ return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Return a mask type with twice as many elements as TYPE. */
+
+tree
+vect_double_mask_nunits (tree type)
+{
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
+ return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
+ contain a sequence of NVECTORS masks that each control a vector of type
+ VECTYPE. */
+
+void
+vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
+ unsigned int nvectors, tree vectype)
+{
+ gcc_assert (nvectors != 0);
+ if (masks->length () < nvectors)
+ masks->safe_grow_cleared (nvectors);
+ rgroup_masks *rgm = &(*masks)[nvectors - 1];
+ /* The number of scalars per iteration and the number of vectors are
+ both compile-time constants. */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+ if (rgm->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ rgm->max_nscalars_per_iter = nscalars_per_iter;
+ rgm->mask_type = build_same_sized_truth_vector_type (vectype);
+ }
+}
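
A worked example of the bookkeeping above (made-up numbers, not part of the patch): with a vectorization factor of 16, an rgroup of 2 masks that each control a vector of 16 elements covers 2 * 16 / 16 = 2 scalars per iteration, and is stored at index nvectors - 1 = 1 of the masks vector.

#include <assert.h>

int main (void)
{
  unsigned int vf = 16, nvectors = 2, nunits = 16;
  assert ((nvectors * nunits) % vf == 0);              /* models exact_div */
  unsigned int nscalars_per_iter = nvectors * nunits / vf;
  assert (nscalars_per_iter == 2);
  return 0;
}
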
+
+/* Given a complete set of masks MASKS, extract mask number INDEX
+ for an rgroup that operates on NVECTORS vectors of type VECTYPE,
+ where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
+
+ See the comment above vec_loop_masks for more details about the mask
+ arrangement. */
+
+tree
+vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
+ unsigned int nvectors, tree vectype, unsigned int index)
+{
+ rgroup_masks *rgm = &(*masks)[nvectors - 1];
+ tree mask_type = rgm->mask_type;
+
+ /* Populate the rgroup's mask array, if this is the first time we've
+ used it. */
+ if (rgm->masks.is_empty ())
+ {
+ rgm->masks.safe_grow_cleared (nvectors);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+ rgm->masks[i] = mask;
+ }
+ }
+
+ tree mask = rgm->masks[index];
+ if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
+ TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ /* A loop mask for data type X can be reused for data type Y
+ if X has N times more elements than Y and if Y's elements
+ are N times bigger than X's. In this case each sequence
+ of N elements in the loop mask will be all-zero or all-one.
+ We can then view-convert the mask so that each sequence of
+ N elements is replaced by a single element. */
+ gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
+ TYPE_VECTOR_SUBPARTS (vectype)));
+ gimple_seq seq = NULL;
+ mask_type = build_same_sized_truth_vector_type (vectype);
+ mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ return mask;
+}
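
The reuse case above can be read through an illustrative scalar model (my own sketch, not part of the patch): the stored mask has N times as many elements as the requested VECTYPE needs, the arrangement guarantees each run of N mask elements is uniform, and the view-convert effectively keeps one element per run.

/* Scalar model of the mask reuse via VIEW_CONVERT_EXPR (sketch only).  */
static void
narrow_loop_mask (const unsigned char *wide, unsigned int wide_elts,
                  unsigned int n, unsigned char *narrow)
{
  for (unsigned int i = 0; i < wide_elts / n; ++i)
    narrow[i] = wide[i * n];   /* every element in the run of N is equal */
}
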
+
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */
@@ -7840,9 +8111,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
check_profitability, niters_no_overflow);
+
if (niters_vector == NULL_TREE)
{
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && known_eq (lowest_vf, vf))
{
niters_vector
= build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
@@ -8124,13 +8398,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
a zero NITERS becomes a nonzero NITERS_VECTOR. */
if (integer_onep (step_vector))
niters_no_overflow = true;
- slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
- niters_vector_mult_vf,
- !niters_no_overflow);
+ vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
+ niters_vector_mult_vf, !niters_no_overflow);
unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
scale_profile_for_vect_loop (loop, assumed_vf);
+ /* True if the final iteration might not handle a full vector's
+ worth of scalar iterations. */
+ bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
/* The minimum number of iterations performed by the epilogue. This
is 1 when peeling for gaps because we always need a final scalar
iteration. */
@@ -8143,16 +8419,25 @@ vect_transform_loop (loop_vec_info loop_vinfo)
back to latch counts. */
if (loop->any_upper_bound)
loop->nb_iterations_upper_bound
- = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
- lowest_vf) - 1;
+ = (final_iter_may_be_partial
+ ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+ lowest_vf) - 1
+ : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+ lowest_vf) - 1);
if (loop->any_likely_upper_bound)
loop->nb_iterations_likely_upper_bound
- = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
- lowest_vf) - 1;
+ = (final_iter_may_be_partial
+ ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
+ lowest_vf) - 1
+ : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
+ lowest_vf) - 1);
if (loop->any_estimate)
loop->nb_iterations_estimate
- = wi::udiv_floor (loop->nb_iterations_estimate + bias,
- assumed_vf) - 1;
+ = (final_iter_may_be_partial
+ ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+ assumed_vf) - 1
+ : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+ assumed_vf) - 1);
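
As a worked example of the rounding change above (made-up numbers, ignoring the bias adjustment): 10 scalar iterations at a vectorization factor of 4 need ceil(10/4) = 3 vector iterations when the final iteration may be partial, i.e. a latch bound of 2, whereas the old floor division would understate it as 1.

#include <assert.h>

int main (void)
{
  unsigned int scalar_iters = 10, vf = 4;
  unsigned int masked_latch = (scalar_iters + vf - 1) / vf - 1; /* udiv_ceil  */
  unsigned int normal_latch = scalar_iters / vf - 1;            /* udiv_floor */
  assert (masked_latch == 2 && normal_latch == 1);
  return 0;
}
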
if (dump_enabled_p ())
{