author     Richard Sandiford <richard.sandiford@arm.com>  2020-10-29 13:38:01 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>  2020-10-29 13:38:01 +0000
commit     6e23549157d671f4f2e61756a0d0924cc59718ab
tree       5b8e002bbfc48786bb13610aeaabfcb5e53c0db4 /gcc
parent     568de14d2e74cfdd600b8995ff6ac08c98ddef48
vect: Fix load costs for SLP permutes
For the following test case (compiled with load/store lanes
disabled locally):
void
f (uint32_t *restrict x, uint8_t *restrict y, int n)
{
  for (int i = 0; i < n; ++i)
    {
      x[i * 2] = x[i * 2] + y[i * 2];
      x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
    }
}
we have a redundant no-op permute on the x[] load node:
node 0x4472350 (max_nunits=8, refcnt=2)
   stmt 0 _5 = *_4;
   stmt 1 _13 = *_12;
   load permutation { 0 1 }
Then, when costing it, we pick a cost of 1, even though we need 4 copies
of the x[] load to match a single y[] load (the x[] elements are four times
wider than the y[] elements, so four x[] vector loads cover the same scalar
iterations as one y[] vector load):
==> examining statement: _5 = *_4;
Vectorizing an unaligned access.
vect_model_load_cost: unaligned supported by hardware.
vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .
The problem is that the code only considers the permutation for
the first scalar iteration, rather than for all VF iterations.
This patch fixes that by making vect_transform_slp_perm_load calculate
the number of vector loads itself and report it back to the costing code.
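To make the counting concrete, here is a standalone sketch (not taken from
the GCC sources): it mirrors the *n_loads loop that the patch adds to
vect_transform_slp_perm_load, but uses std::vector<bool> in place of
auto_sbitmap, and the helper name count_vector_loads and the lane counts
in main are made up for illustration.

#include <cstdio>
#include <vector>

/* Count how many CONST_NUNITS-lane vector loads are needed to cover
   every lane marked in USED_IN_LANES.  A load is counted once if any
   lane in its CONST_NUNITS-wide window is used; windows with no used
   lane (redundant or dead loads) are skipped.  */
static unsigned int
count_vector_loads (const std::vector<bool> &used_in_lanes,
                    unsigned int const_nunits)
{
  unsigned int in_nlanes = used_in_lanes.size ();
  unsigned int n_loads = 0;
  bool load_seen = false;
  for (unsigned int i = 0; i < in_nlanes; ++i)
    {
      if (i % const_nunits == 0)
        {
          if (load_seen)
            n_loads += 1;
          load_seen = false;
        }
      if (used_in_lanes[i])
        load_seen = true;
    }
  if (load_seen)
    n_loads += 1;
  return n_loads;
}

int
main ()
{
  /* Made-up numbers: 16 x[] lanes across the vectorization factor,
     all of them used, with 4 lanes per vector register.  */
  std::vector<bool> used_in_lanes (16, true);
  printf ("%u\n", count_vector_loads (used_in_lanes, 4));
  return 0;
}

Compiled on its own, this prints 4, matching the four copies of the x[] load
that the cost model should have counted in the example above.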
gcc/
	* tree-vectorizer.h (vect_transform_slp_perm_load): Take an
	optional extra parameter.
	* tree-vect-slp.c (vect_transform_slp_perm_load): Calculate
	the number of loads as well as the number of permutes, taking
	the counting loop from...
	* tree-vect-stmts.c (vect_model_load_cost): ...here.  Use the
	value computed by vect_transform_slp_perm_load for ncopies.
Diffstat (limited to 'gcc')
 gcc/tree-vect-slp.c   | 39
 gcc/tree-vect-stmts.c | 32
 gcc/tree-vectorizer.h |  3
 3 files changed, 43 insertions, 31 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 7a08908..5d69a98 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4830,13 +4830,16 @@ vect_get_slp_defs (vec_info *,
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for the SLP node NODE.  */
+   permute statements for the SLP node NODE.  Store the number of vector
+   permute instructions in *N_PERMS and the number of vector load
+   instructions in *N_LOADS.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
                               slp_tree node, vec<tree> dr_chain,
                               gimple_stmt_iterator *gsi, poly_uint64 vf,
-                              bool analyze_only, unsigned *n_perms)
+                              bool analyze_only, unsigned *n_perms,
+                              unsigned int *n_loads)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
@@ -4888,6 +4891,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
   vec_perm_builder mask;
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
+  unsigned int in_nlanes;
   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                       && multiple_p (nunits, group_size));
   if (repeating_p)
@@ -4898,6 +4902,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (nunits, group_size, 3);
       nelts_to_build = mask.encoded_nelts ();
       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
     }
   else
     {
@@ -4909,7 +4914,10 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
+      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
     }
+  auto_sbitmap used_in_lanes (in_nlanes);
+  bitmap_clear (used_in_lanes);
 
   unsigned int count = mask.encoded_nelts ();
   mask.quick_grow (count);
@@ -4921,6 +4929,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       unsigned int stmt_num = j % group_size;
       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                         + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
         {
           first_vec_index = 0;
@@ -5034,6 +5043,32 @@ vect_transform_slp_perm_load (vec_info *vinfo,
         }
     }
 
+  if (n_loads)
+    {
+      if (repeating_p)
+        *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      else
+        {
+          /* Enforced above when !repeating_p.  */
+          unsigned int const_nunits = nunits.to_constant ();
+          *n_loads = 0;
+          bool load_seen = false;
+          for (unsigned i = 0; i < in_nlanes; ++i)
+            {
+              if (i % const_nunits == 0)
+                {
+                  if (load_seen)
+                    *n_loads += 1;
+                  load_seen = false;
+                }
+              if (bitmap_bit_p (used_in_lanes, i))
+                load_seen = true;
+            }
+          if (load_seen)
+            *n_loads += 1;
+        }
+    }
+
   return true;
 }
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7f0763f..1a0da0e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1098,39 +1098,15 @@ vect_model_load_cost (vec_info *vinfo,
          the first group element not by the first scalar stmt DR.  */
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
-      unsigned n_perms;
-      unsigned assumed_nunits
-        = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
+      unsigned n_perms, n_loads;
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                    vf, true, &n_perms);
+                                    vf, true, &n_perms, &n_loads);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
                                        first_stmt_info, 0, vect_body);
+
       /* And adjust the number of loads performed.  This handles
          redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
-      bitmap_clear (perm);
-      for (unsigned i = 0;
-           i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-        bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
-      ncopies = 0;
-      bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
-        {
-          if (i % assumed_nunits == 0)
-            {
-              if (load_seen)
-                ncopies++;
-              load_seen = false;
-            }
-          if (bitmap_bit_p (perm, i))
-            load_seen = true;
-        }
-      if (load_seen)
-        ncopies++;
-      gcc_assert (ncopies
-                  <= (DR_GROUP_SIZE (first_stmt_info)
-                      - DR_GROUP_GAP (first_stmt_info)
-                      + assumed_nunits - 1) / assumed_nunits);
+      ncopies = n_loads;
     }
 
   /* Grouped loads read all elements in the group at once,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 13a02cd..fbf5291 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1952,7 +1952,8 @@ extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
 extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>,
                                           gimple_stmt_iterator *, poly_uint64,
-                                          bool, unsigned *);
+                                          bool, unsigned *,
+                                          unsigned * = nullptr);
 extern bool vect_slp_analyze_operations (vec_info *);
 extern void vect_schedule_slp (vec_info *, vec<slp_instance>);
 extern opt_result vect_analyze_slp (vec_info *, unsigned);