about | summary | refs | log | tree | commit | diff
path: root/gcc
diff options
context:
space:
mode:
author: Richard Sandiford <richard.sandiford@arm.com> 2020-10-29 13:38:01 +0000
committer: Richard Sandiford <richard.sandiford@arm.com> 2020-10-29 13:38:01 +0000
commit: 6e23549157d671f4f2e61756a0d0924cc59718ab (patch)
tree: 5b8e002bbfc48786bb13610aeaabfcb5e53c0db4 /gcc
parent: 568de14d2e74cfdd600b8995ff6ac08c98ddef48 (diff)
download: gcc-6e23549157d671f4f2e61756a0d0924cc59718ab.zip
download: gcc-6e23549157d671f4f2e61756a0d0924cc59718ab.tar.gz
download: gcc-6e23549157d671f4f2e61756a0d0924cc59718ab.tar.bz2
vect: Fix load costs for SLP permutes
For the following test case (compiled with load/store lanes
disabled locally):

  void
  f (uint32_t *restrict x, uint8_t *restrict y, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        x[i * 2] = x[i * 2] + y[i * 2];
        x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
      }
  }

we have a redundant no-op permute on the x[] load node:

   node 0x4472350 (max_nunits=8, refcnt=2)
          stmt 0 _5 = *_4;
          stmt 1 _13 = *_12;
          load permutation { 0 1 }

Then, when costing it, we pick a cost of 1, even though we need 4 copies
of the x[] load to match a single y[] load:

   ==> examining statement: _5 = *_4;
   Vectorizing an unaligned access.
   vect_model_load_cost: unaligned supported by hardware.
   vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .

The problem is that the code only considers the permutation for
the first scalar iteration, rather than for all VF iterations.

This patch tries to fix that by making vect_transform_slp_perm_load
calculate the value instead.

gcc/
	* tree-vectorizer.h (vect_transform_slp_perm_load): Take an
	optional extra parameter.
	* tree-vect-slp.c (vect_transform_slp_perm_load): Calculate
	the number of loads as well as the number of permutes, taking
	the counting loop from...
	* tree-vect-stmts.c (vect_model_load_cost): ...here.  Use the
	value computed by vect_transform_slp_perm_load for ncopies.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/tree-vect-slp.c    39
-rw-r--r--  gcc/tree-vect-stmts.c  32
-rw-r--r--  gcc/tree-vectorizer.h   3
3 files changed, 43 insertions, 31 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 7a08908..5d69a98 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4830,13 +4830,16 @@ vect_get_slp_defs (vec_info *,
/* Generate vector permute statements from a list of loads in DR_CHAIN.
If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
- permute statements for the SLP node NODE. */
+ permute statements for the SLP node NODE. Store the number of vector
+ permute instructions in *N_PERMS and the number of vector load
+ instructions in *N_LOADS. */
bool
vect_transform_slp_perm_load (vec_info *vinfo,
slp_tree node, vec<tree> dr_chain,
gimple_stmt_iterator *gsi, poly_uint64 vf,
- bool analyze_only, unsigned *n_perms)
+ bool analyze_only, unsigned *n_perms,
+ unsigned int *n_loads)
{
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
int vec_index = 0;
@@ -4888,6 +4891,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
vec_perm_builder mask;
unsigned int nelts_to_build;
unsigned int nvectors_per_build;
+ unsigned int in_nlanes;
bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
&& multiple_p (nunits, group_size));
if (repeating_p)
@@ -4898,6 +4902,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
mask.new_vector (nunits, group_size, 3);
nelts_to_build = mask.encoded_nelts ();
nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+ in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
}
else
{
@@ -4909,7 +4914,10 @@ vect_transform_slp_perm_load (vec_info *vinfo,
mask.new_vector (const_nunits, const_nunits, 1);
nelts_to_build = const_vf * group_size;
nvectors_per_build = 1;
+ in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
}
+ auto_sbitmap used_in_lanes (in_nlanes);
+ bitmap_clear (used_in_lanes);
unsigned int count = mask.encoded_nelts ();
mask.quick_grow (count);
@@ -4921,6 +4929,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
unsigned int stmt_num = j % group_size;
unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+ bitmap_set_bit (used_in_lanes, i);
if (repeating_p)
{
first_vec_index = 0;
@@ -5034,6 +5043,32 @@ vect_transform_slp_perm_load (vec_info *vinfo,
}
}
+ if (n_loads)
+ {
+ if (repeating_p)
+ *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+ else
+ {
+ /* Enforced above when !repeating_p. */
+ unsigned int const_nunits = nunits.to_constant ();
+ *n_loads = 0;
+ bool load_seen = false;
+ for (unsigned i = 0; i < in_nlanes; ++i)
+ {
+ if (i % const_nunits == 0)
+ {
+ if (load_seen)
+ *n_loads += 1;
+ load_seen = false;
+ }
+ if (bitmap_bit_p (used_in_lanes, i))
+ load_seen = true;
+ }
+ if (load_seen)
+ *n_loads += 1;
+ }
+ }
+
return true;
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7f0763f..1a0da0e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1098,39 +1098,15 @@ vect_model_load_cost (vec_info *vinfo,
the first group element not by the first scalar stmt DR. */
stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
/* Record the cost for the permutation. */
- unsigned n_perms;
- unsigned assumed_nunits
- = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
+ unsigned n_perms, n_loads;
vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
- vf, true, &n_perms);
+ vf, true, &n_perms, &n_loads);
inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
first_stmt_info, 0, vect_body);
+
/* And adjust the number of loads performed. This handles
redundancies as well as loads that are later dead. */
- auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
- bitmap_clear (perm);
- for (unsigned i = 0;
- i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
- bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
- ncopies = 0;
- bool load_seen = false;
- for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
- {
- if (i % assumed_nunits == 0)
- {
- if (load_seen)
- ncopies++;
- load_seen = false;
- }
- if (bitmap_bit_p (perm, i))
- load_seen = true;
- }
- if (load_seen)
- ncopies++;
- gcc_assert (ncopies
- <= (DR_GROUP_SIZE (first_stmt_info)
- - DR_GROUP_GAP (first_stmt_info)
- + assumed_nunits - 1) / assumed_nunits);
+ ncopies = n_loads;
}
/* Grouped loads read all elements in the group at once,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 13a02cd..fbf5291 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1952,7 +1952,8 @@ extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
extern void vect_free_slp_instance (slp_instance);
extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>,
gimple_stmt_iterator *, poly_uint64,
- bool, unsigned *);
+ bool, unsigned *,
+ unsigned * = nullptr);
extern bool vect_slp_analyze_operations (vec_info *);
extern void vect_schedule_slp (vec_info *, vec<slp_instance>);
extern opt_result vect_analyze_slp (vec_info *, unsigned);