Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c  217
1 file changed, 211 insertions(+), 6 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 0a9be35..e933441 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -816,6 +816,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorizable (false),
can_use_partial_vectors_p (true),
using_partial_vectors_p (false),
+ epil_using_partial_vectors_p (false),
peeling_for_gaps (false),
peeling_for_niter (false),
no_data_dependencies (false),
@@ -898,6 +899,7 @@ _loop_vec_info::~_loop_vec_info ()
free (bbs);
release_vec_loop_controls (&masks);
+ release_vec_loop_controls (&lens);
delete ivexpr_map;
delete scan_map;
epilogue_vinfos.release ();
@@ -1072,6 +1074,81 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
return true;
}
+/* Check whether we can use vector access with length based on precision
+ comparison. So far, to keep it simple, we only allow the case in which the
+ precision of the target-supported length is larger than the precision
+ required by the loop niters. */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+ if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ return false;
+
+ unsigned int max_nitems_per_iter = 1;
+ unsigned int i;
+ rgroup_controls *rgl;
+ /* Find the maximum number of items per iteration across all rgroups. */
+ FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+ {
+ unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+ max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+ }
+
+ /* Work out how many bits we need to represent the length limit. */
+ unsigned int min_ni_prec
+ = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+ /* Now use the maximum of the precisions below to pick one suitable IV type:
+ - the IV's natural precision
+ - the precision needed to hold the maximum number of scalar
+ iterations multiplied by the scale factor (min_ni_prec above)
+ - the Pmode precision
+
+ If min_ni_prec is less than the precision of the current niters,
+ we still prefer to use the niters type. Prefer Pmode and a wider
+ IV to avoid narrow conversions. */
+
+ unsigned int ni_prec
+ = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+ min_ni_prec = MAX (min_ni_prec, ni_prec);
+ min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+ tree iv_type = NULL_TREE;
+ opt_scalar_int_mode tmode_iter;
+ FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+ {
+ scalar_mode tmode = tmode_iter.require ();
+ unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+ /* ??? Do we really want to construct one IV whose precision exceeds
+ BITS_PER_WORD? */
+ if (tbits > BITS_PER_WORD)
+ break;
+
+ /* Find the first available standard integral type. */
+ if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+ {
+ iv_type = build_nonstandard_integer_type (tbits, true);
+ break;
+ }
+ }
+
+ if (!iv_type)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize with length-based partial vectors"
+ " because there is no suitable iv type.\n");
+ return false;
+ }
+
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
+ return true;
+}
+
/* Calculate the cost of one scalar iteration of the loop. */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
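A side note on the mode walk in vect_verify_loop_lens above: it simply picks the narrowest standard integer width that covers min_ni_prec without exceeding a word. A minimal standalone sketch of that selection (illustrative only, not part of the patch, and assuming the target supports the usual 8/16/32/64-bit modes):

/* Illustrative sketch, not part of the patch: return the narrowest
   power-of-two width that is >= MIN_PREC but <= BITS_PER_WORD, or 0 if
   none exists (in which case the caller gives up, as above).  */
static unsigned int
pick_iv_bits (unsigned int min_prec, unsigned int bits_per_word)
{
  for (unsigned int bits = 8; bits <= bits_per_word; bits *= 2)
    if (bits >= min_prec)
      return bits;
  return 0;
}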
@@ -2168,11 +2245,48 @@ start_over:
return ok;
}
- /* Decide whether to use a fully-masked loop for this vectorization
- factor. */
- LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
- = (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && vect_verify_full_masking (loop_vinfo));
+ /* For now, we don't expect to mix the masking and length approaches within one
+ loop, so disable partial vectors if both are recorded. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't vectorize a loop with partial vectors"
+ " because we don't expect to mix different"
+ " approaches with partial vectors for the"
+ " same loop.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+
+ /* Decide whether to vectorize a loop with partial vectors for
+ this vectorization factor. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ if (param_vect_partial_vector_usage == 0)
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ else if (vect_verify_full_masking (loop_vinfo)
+ || vect_verify_loop_lens (loop_vinfo))
+ {
+ /* The epilogue and other cases where niters are known to be
+ less than VF can still use partial vectors fully. */
+ if (param_vect_partial_vector_usage == 1
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !vect_known_niters_smaller_than_vf (loop_vinfo))
+ {
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
if (dump_enabled_p ())
{
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
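The three-way decision above is driven by --param=vect-partial-vector-usage. A compressed sketch with an invented helper name, assuming the param keeps its documented meaning (0 = never use partial vectors, 1 = only where they remove the need to iterate, 2 = wherever the target supports them):

/* Sketch only, not part of the patch.  CAN_USE and VERIFIED stand for
   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P and the result of the masking/length
   verification; the two outputs mirror the flags set above.  */
static void
decide_partial_vectors (bool can_use, int usage, bool verified,
			bool is_epilogue, bool niters_lt_vf,
			bool *use_now, bool *use_for_epilogue)
{
  *use_now = *use_for_epilogue = false;
  if (!can_use || usage == 0 || !verified)
    return;
  if (usage == 1 && !is_epilogue && !niters_lt_vf)
    *use_for_epilogue = true;	/* Keep the main loop full-width.  */
  else
    *use_now = true;
}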
@@ -2404,6 +2518,7 @@ again:
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
/* Reset accumulated rgroup information. */
release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -2690,7 +2805,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
lowest_th = ordered_min (lowest_th, th);
}
else
- delete loop_vinfo;
+ {
+ delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
+ }
/* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2715,6 +2833,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
else
{
delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
if (fatal)
{
gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2722,6 +2841,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
}
}
+ /* Handle the case in which the original loop can use partial
+ vectorization, but we only want to adopt it for the epilogue.
+ The retry should use the same vector mode as the original. */
+ if (vect_epilogues
+ && loop_vinfo
+ && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis with same vector mode"
+ " %s for epilogue with partial vectors.\n",
+ GET_MODE_NAME (loop_vinfo->vector_mode));
+ continue;
+ }
+
if (mode_i < vector_modes.length ()
&& VECTOR_MODE_P (autodetected_vector_mode)
&& (related_vector_mode (vector_modes[mode_i],
@@ -3562,6 +3698,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
target_cost_data, num_masks - 1, vector_stmt,
NULL, NULL_TREE, 0, vect_body);
}
+ else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+ peel_iters_prologue = 0;
+ peel_iters_epilogue = 0;
+ }
else if (npeel < 0)
{
peel_iters_prologue = assumed_vf / 2;
@@ -8194,6 +8335,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
{
rgm->max_nscalars_per_iter = nscalars_per_iter;
rgm->type = truth_type_for (vectype);
+ rgm->factor = 1;
}
}
@@ -8246,6 +8388,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
return mask;
}
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+ lengths for controlling an operation on VECTYPE. The operation splits
+ each element of VECTYPE into FACTOR separate subelements, measuring the
+ length as a number of these subelements. */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, tree vectype, unsigned int factor)
+{
+ gcc_assert (nvectors != 0);
+ if (lens->length () < nvectors)
+ lens->safe_grow_cleared (nvectors);
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* The number of scalars per iteration, the bytes each scalar occupies and
+ the number of vectors are all compile-time constants. */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+ if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ /* For now, we only support cases in which all loads and stores fall back
+ to VnQI or none do. */
+ gcc_assert (!rgl->max_nscalars_per_iter
+ || (rgl->factor == 1 && factor == 1)
+ || (rgl->max_nscalars_per_iter * rgl->factor
+ == nscalars_per_iter * factor));
+ rgl->max_nscalars_per_iter = nscalars_per_iter;
+ rgl->type = vectype;
+ rgl->factor = factor;
+ }
+}
+
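As a concrete illustration of the FACTOR bookkeeping above (numbers made up for the example, not taken from the patch): one V4SI vector per iteration under VF = 4, accessed through byte-sized subelements because the target only offers length-controlled byte vectors.

/* Worked example with made-up numbers: nscalars_per_iter comes out as
   1 * 4 / 4 = 1, and with factor = 4 (four bytes per SI element) the
   recorded limit is 4 subelements per scalar iteration.  */
static unsigned int
example_nitems_per_iter (void)
{
  unsigned int nvectors = 1, subparts = 4, vf = 4, factor = 4;
  unsigned int nscalars_per_iter = nvectors * subparts / vf;
  return nscalars_per_iter * factor;
}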
+/* Given a complete set of length LENS, extract length number INDEX for an
+ rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+ unsigned int nvectors, unsigned int index)
+{
+ rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+ /* Populate the rgroup's len array, if this is the first time we've
+ used it. */
+ if (rgl->controls.is_empty ())
+ {
+ rgl->controls.safe_grow_cleared (nvectors);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ gcc_assert (len_type != NULL_TREE);
+ tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+ rgl->controls[i] = len;
+ }
+ }
+
+ return rgl->controls[index];
+}
+
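A hedged sketch of how a call site might use this helper; the wrapper name is invented here and the real callers live elsewhere in the series:

/* Hypothetical call site: fetch length number I for an rgroup that needs
   NVECTORS controls, ready to feed a length-predicated load/store.  */
static tree
get_len_for_vector (loop_vec_info loop_vinfo, unsigned int nvectors,
		    unsigned int i)
{
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  return vect_get_loop_len (loop_vinfo, lens, nvectors, i);
}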
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */