aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorKyrylo Tkachov <kyrylo.tkachov@arm.com>2020-12-17 18:02:37 +0000
committerKyrylo Tkachov <kyrylo.tkachov@arm.com>2020-12-17 18:04:21 +0000
commit64432b680eab0bddbe9a4ad4798457cf6a14ad60 (patch)
treec88d7613be91dc9c472634980e0de2f9a8897c37 /gcc
parent2d7a40fa60fb8b9870cfd053a37fc67404353ee2 (diff)
downloadgcc-64432b680eab0bddbe9a4ad4798457cf6a14ad60.zip
gcc-64432b680eab0bddbe9a4ad4798457cf6a14ad60.tar.gz
gcc-64432b680eab0bddbe9a4ad4798457cf6a14ad60.tar.bz2
vect, aarch64: Extend SVE vs Advanced SIMD costing decisions in vect_better_loop_vinfo_p
While experimenting with some backend costs for Advanced SIMD and SVE I hit many cases where GCC would pick SVE for VLA auto-vectorisation even when the backend very clearly presented cheaper costs for Advanced SIMD. For a simple float addition loop the SVE costs were: vec.c:9:21: note: Cost model analysis: Vector inside of loop cost: 28 Vector prologue cost: 2 Vector epilogue cost: 0 Scalar iteration cost: 10 Scalar outside cost: 0 Vector outside cost: 2 prologue iterations: 0 epilogue iterations: 0 Minimum number of vector iterations: 1 Calculated minimum iters for profitability: 4 and for Advanced SIMD (Neon) they're: vec.c:9:21: note: Cost model analysis: Vector inside of loop cost: 11 Vector prologue cost: 0 Vector epilogue cost: 0 Scalar iteration cost: 10 Scalar outside cost: 0 Vector outside cost: 0 prologue iterations: 0 epilogue iterations: 0 Calculated minimum iters for profitability: 0 vec.c:9:21: note: Runtime profitability threshold = 4 yet the SVE one was always picked. With guidance from Richard this seems to be due to the vinfo comparisons in vect_better_loop_vinfo_p, in particular the part with the big comment explaining the estimated_rel_new * 2 <= estimated_rel_old heuristic. This patch extends the comparisons by introducing a three-way estimate kind for poly_int values that the backend can distinguish. This allows vect_better_loop_vinfo_p to ask for minimum, maximum and likely estimates and pick Advanced SIMD over SVE when it is clearly cheaper. gcc/ * target.h (enum poly_value_estimate_kind): Define. (estimated_poly_value): Take an estimate kind argument. * target.def (estimated_poly_value): Update definition for the above. * doc/tm.texi: Regenerate. * targhooks.c (estimated_poly_value): Update prototype. * tree-vect-loop.c (vect_better_loop_vinfo_p): Use min, max and likely estimates of VF to pick between vinfos. * config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Use estimated_poly_value instead of aarch64_estimated_poly_value. 
(aarch64_estimated_poly_value): Take a kind argument and handle it.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/aarch64/aarch64.c35
-rw-r--r--gcc/doc/tm.texi7
-rw-r--r--gcc/target.def7
-rw-r--r--gcc/target.h12
-rw-r--r--gcc/targhooks.c2
-rw-r--r--gcc/tree-vect-loop.c85
6 files changed, 96 insertions, 52 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b796301..67b4c91 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17367,8 +17367,6 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
return word_mode;
}
-static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
-
/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
and return whether the SVE mode should be preferred over the
Advanced SIMD one in aarch64_autovectorize_vector_modes. */
@@ -17401,8 +17399,8 @@ aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
return maybe_gt (nunits_sve, nunits_asimd);
/* Otherwise estimate the runtime width of the modes involved. */
- HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
- HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
+ HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
+ HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
/* Preferring SVE means picking it first unless the Advanced SIMD mode
is clearly wider. */
@@ -23195,19 +23193,38 @@ aarch64_speculation_safe_value (machine_mode mode,
/* Implement TARGET_ESTIMATED_POLY_VALUE.
Look into the tuning structure for an estimate.
- VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
- Advanced SIMD 128 bits. */
+ KIND specifies the type of requested estimate: min, max or likely.
+ For cores with a known SVE width all three estimates are the same.
+ For generic SVE tuning we want to distinguish the maximum estimate from
+ the minimum and likely ones.
+ The likely estimate is the same as the minimum in that case to give a
+ conservative behavior of auto-vectorizing with SVE when it is a win
+ even for 128-bit SVE.
+ When SVE width information is available VAL.coeffs[1] is multiplied by
+ the number of VQ chunks over the initial Advanced SIMD 128 bits. */
static HOST_WIDE_INT
-aarch64_estimated_poly_value (poly_int64 val)
+aarch64_estimated_poly_value (poly_int64 val,
+ poly_value_estimate_kind kind
+ = POLY_VALUE_LIKELY)
{
enum aarch64_sve_vector_bits_enum width_source
= aarch64_tune_params.sve_width;
- /* If we still don't have an estimate, use the default. */
+ /* If there is no core-specific information then the minimum and likely
+ values are based on 128-bit vectors and the maximum is based on
+ the architectural maximum of 2048 bits. */
if (width_source == SVE_SCALABLE)
- return default_estimated_poly_value (val);
+ switch (kind)
+ {
+ case POLY_VALUE_MIN:
+ case POLY_VALUE_LIKELY:
+ return val.coeffs[0];
+ case POLY_VALUE_MAX:
+ return val.coeffs[0] + val.coeffs[1] * 15;
+ }
+ /* If the core provides width information, use that. */
HOST_WIDE_INT over_128 = width_source - 128;
return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index d9b855c..900d584 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7005,9 +7005,12 @@ delay slot branches filled using the basic filler is often still desirable
as the delay slot can hide a pipeline bubble.
@end deftypefn
-@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val})
+@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val}, poly_value_estimate_kind @var{kind})
Return an estimate of the runtime value of @var{val}, for use in
-things like cost calculations or profiling frequencies. The default
+things like cost calculations or profiling frequencies. @var{kind} is used
+to ask for the minimum, maximum, and likely estimates of the value through
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and
+@code{POLY_VALUE_LIKELY} values. The default
implementation returns the lowest possible value of @var{val}.
@end deftypefn
diff --git a/gcc/target.def b/gcc/target.def
index acdc694..d5ed0df 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3851,9 +3851,12 @@ default_new_address_profitable_p)
DEFHOOK
(estimated_poly_value,
"Return an estimate of the runtime value of @var{val}, for use in\n\
-things like cost calculations or profiling frequencies. The default\n\
+things like cost calculations or profiling frequencies. @var{kind} is used\n\
+to ask for the minimum, maximum, and likely estimates of the value through\n\
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and\n\
+@code{POLY_VALUE_LIKELY} values. The default\n\
implementation returns the lowest possible value of @var{val}.",
- HOST_WIDE_INT, (poly_int64 val),
+ HOST_WIDE_INT, (poly_int64 val, poly_value_estimate_kind kind),
default_estimated_poly_value)
/* Permit speculative instructions in delay slots during delayed-branch
diff --git a/gcc/target.h b/gcc/target.h
index 9601880..68ef519 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -252,6 +252,13 @@ enum type_context_kind {
TCTX_CAPTURE_BY_COPY
};
+enum poly_value_estimate_kind
+{
+ POLY_VALUE_MIN,
+ POLY_VALUE_MAX,
+ POLY_VALUE_LIKELY
+};
+
extern bool verify_type_context (location_t, type_context_kind, const_tree,
bool = false);
@@ -272,12 +279,13 @@ extern struct gcc_target targetm;
provides a rough guess. */
static inline HOST_WIDE_INT
-estimated_poly_value (poly_int64 x)
+estimated_poly_value (poly_int64 x,
+ poly_value_estimate_kind kind = POLY_VALUE_LIKELY)
{
if (NUM_POLY_INT_COEFFS == 1)
return x.coeffs[0];
else
- return targetm.estimated_poly_value (x);
+ return targetm.estimated_poly_value (x, kind);
}
#ifdef GCC_TM_H
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 6e12e13..d616fb73 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1757,7 +1757,7 @@ default_slow_unaligned_access (machine_mode, unsigned int)
/* The default implementation of TARGET_ESTIMATED_POLY_VALUE. */
HOST_WIDE_INT
-default_estimated_poly_value (poly_int64 x)
+default_estimated_poly_value (poly_int64 x, poly_value_estimate_kind)
{
return x.coeffs[0];
}
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 52757ad..688538a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2773,43 +2773,56 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
/* Check whether the (fractional) cost per scalar iteration is lower
or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
- poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
- * poly_widest_int (old_vf));
- poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
- * poly_widest_int (new_vf));
- if (maybe_lt (rel_old, rel_new))
- {
- /* When old_loop_vinfo uses a variable vectorization factor,
- we know that it has a lower cost for at least one runtime VF.
- However, we don't know how likely that VF is.
-
- One option would be to compare the costs for the estimated VFs.
- The problem is that that can put too much pressure on the cost
- model. E.g. if the estimated VF is also the lowest possible VF,
- and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
- for the estimated VF, we'd then choose new_loop_vinfo even
- though (a) new_loop_vinfo might not actually be better than
- old_loop_vinfo for that VF and (b) it would be significantly
- worse at larger VFs.
-
- Here we go for a hacky compromise: pick new_loop_vinfo if it is
- no more expensive than old_loop_vinfo even after doubling the
- estimated old_loop_vinfo VF. For all but trivial loops, this
- ensures that we only pick new_loop_vinfo if it is significantly
- better than old_loop_vinfo at the estimated VF. */
- if (rel_new.is_constant ())
- return false;
-
- HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
- HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
- widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
- * widest_int (old_estimated_vf));
- widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
- * widest_int (new_estimated_vf));
- return estimated_rel_new * 2 <= estimated_rel_old;
- }
- if (known_lt (rel_new, rel_old))
+ poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
+ poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
+
+ HOST_WIDE_INT est_rel_new_min
+ = estimated_poly_value (rel_new, POLY_VALUE_MIN);
+ HOST_WIDE_INT est_rel_new_max
+ = estimated_poly_value (rel_new, POLY_VALUE_MAX);
+
+ HOST_WIDE_INT est_rel_old_min
+ = estimated_poly_value (rel_old, POLY_VALUE_MIN);
+ HOST_WIDE_INT est_rel_old_max
+ = estimated_poly_value (rel_old, POLY_VALUE_MAX);
+
+ /* Check first if we can make out an unambiguous total order from the minimum
+ and maximum estimates. */
+ if (est_rel_new_min < est_rel_old_min
+ && est_rel_new_max < est_rel_old_max)
return true;
+ else if (est_rel_old_min < est_rel_new_min
+ && est_rel_old_max < est_rel_new_max)
+ return false;
+ /* When old_loop_vinfo uses a variable vectorization factor,
+ we know that it has a lower cost for at least one runtime VF.
+ However, we don't know how likely that VF is.
+
+ One option would be to compare the costs for the estimated VFs.
+ The problem is that that can put too much pressure on the cost
+ model. E.g. if the estimated VF is also the lowest possible VF,
+ and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+ for the estimated VF, we'd then choose new_loop_vinfo even
+ though (a) new_loop_vinfo might not actually be better than
+ old_loop_vinfo for that VF and (b) it would be significantly
+ worse at larger VFs.
+
+ Here we go for a hacky compromise: pick new_loop_vinfo if it is
+ no more expensive than old_loop_vinfo even after doubling the
+ estimated old_loop_vinfo VF. For all but trivial loops, this
+ ensures that we only pick new_loop_vinfo if it is significantly
+ better than old_loop_vinfo at the estimated VF. */
+
+ if (est_rel_old_min != est_rel_new_min
+ || est_rel_old_max != est_rel_new_max)
+ {
+ HOST_WIDE_INT est_rel_new_likely
+ = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
+ HOST_WIDE_INT est_rel_old_likely
+ = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
+
+ return est_rel_new_likely * 2 <= est_rel_old_likely;
+ }
/* If there's nothing to choose between the loop bodies, see whether
there's a difference in the prologue and epilogue costs. */