aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vect-slp.c
diff options
context:
space:
mode:
author: Richard Sandiford <richard.sandiford@arm.com>, 2018-08-24 13:05:36 +0000
committer: Richard Sandiford <rsandifo@gcc.gnu.org>, 2018-08-24 13:05:36 +0000
commit: ab7e60cec1a6f4185b0428f3a2b3e71df0bae533 (patch)
tree: c666f7a31ae111b41e904e26bb4df9009c6db203 /gcc/tree-vect-slp.c
parent: 1ade64c9d8cd37c8db0a07383189f1719c7164da (diff)
download: gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.zip
gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.gz
gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.bz2
Handle SLP permutations for variable-length vectors
The SLP code currently punts for all variable-length permutes.  This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N.  Every permute then uses the same mask, and that mask repeats (with a stride) every N elements.

The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time.

2018-08-24  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result.  Extend that case to handle variable-length vectors.
	* tree-vect-stmts.c (vectorizable_load): Update accordingly.

gcc/testsuite/
	* gcc.target/aarch64/sve/slp_perm_1.c: New test.
	* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.

From-SVN: r263832
Diffstat (limited to 'gcc/tree-vect-slp.c')
-rw-r--r-- gcc/tree-vect-slp.c | 150
1 files changed, 88 insertions, 62 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0a9ce24..0ab7bd8 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
{
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo;
- tree mask_element_type = NULL_TREE, mask_type;
int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+ unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element;
machine_mode mode;
- unsigned HOST_WIDE_INT nunits, const_vf;
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
return false;
@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
mode = TYPE_MODE (vectype);
-
- /* At the moment, all permutations are represented using per-element
- indices, so we can't cope with variable vector lengths or
- vectorization factors. */
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
- || !vf.is_constant (&const_vf))
- return false;
-
- /* The generic VEC_PERM_EXPR code always uses an integral type of the
- same size as the vector element being permuted. */
- mask_element_type = lang_hooks.types.type_for_mode
- (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
- mask_type = get_vectype_for_scalar_type (mask_element_type);
- vec_perm_builder mask (nunits, nunits, 1);
- mask.quick_grow (nunits);
- vec_perm_indices indices;
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Initialize the vect stmts of NODE to properly insert the generated
stmts later. */
@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
bool noop_p = true;
*n_perms = 0;
- for (unsigned int j = 0; j < const_vf; j++)
+ vec_perm_builder mask;
+ unsigned int nelts_to_build;
+ unsigned int nvectors_per_build;
+ bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+ && multiple_p (nunits, group_size));
+ if (repeating_p)
{
- for (int k = 0; k < group_size; k++)
+ /* A single vector contains a whole number of copies of the node, so:
+ (a) all permutes can use the same mask; and
+ (b) the permutes only need a single vector input. */
+ mask.new_vector (nunits, group_size, 3);
+ nelts_to_build = mask.encoded_nelts ();
+ nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+ }
+ else
+ {
+ /* We need to construct a separate mask for each vector statement. */
+ unsigned HOST_WIDE_INT const_nunits, const_vf;
+ if (!nunits.is_constant (&const_nunits)
+ || !vf.is_constant (&const_vf))
+ return false;
+ mask.new_vector (const_nunits, const_nunits, 1);
+ nelts_to_build = const_vf * group_size;
+ nvectors_per_build = 1;
+ }
+
+ unsigned int count = mask.encoded_nelts ();
+ mask.quick_grow (count);
+ vec_perm_indices indices;
+
+ for (unsigned int j = 0; j < nelts_to_build; j++)
+ {
+ unsigned int iter_num = j / group_size;
+ unsigned int stmt_num = j % group_size;
+ unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+ + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+ if (repeating_p)
{
- unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
- + j * DR_GROUP_SIZE (stmt_info));
- vec_index = i / nunits;
- mask_element = i % nunits;
+ first_vec_index = 0;
+ mask_element = i;
+ }
+ else
+ {
+ /* Enforced before the loop when !repeating_p. */
+ unsigned int const_nunits = nunits.to_constant ();
+ vec_index = i / const_nunits;
+ mask_element = i % const_nunits;
if (vec_index == first_vec_index
|| first_vec_index == -1)
{
@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
|| second_vec_index == -1)
{
second_vec_index = vec_index;
- mask_element += nunits;
+ mask_element += const_nunits;
}
else
{
@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
return false;
}
- gcc_assert (mask_element < 2 * nunits);
- if (mask_element != index)
- noop_p = false;
- mask[index++] = mask_element;
+ gcc_assert (mask_element < 2 * const_nunits);
+ }
+
+ if (mask_element != index)
+ noop_p = false;
+ mask[index++] = mask_element;
- if (index == nunits && !noop_p)
+ if (index == count && !noop_p)
+ {
+ indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
+ if (!can_vec_perm_const_p (mode, indices))
{
- indices.new_vector (mask, 2, nunits);
- if (!can_vec_perm_const_p (mode, indices))
+ if (dump_enabled_p ())
{
- if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+ vect_location,
+ "unsupported vect permute { ");
+ for (i = 0; i < count; ++i)
{
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "unsupported vect permute { ");
- for (i = 0; i < nunits; ++i)
- {
- dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
- dump_printf (MSG_MISSED_OPTIMIZATION, " ");
- }
- dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+ dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+ dump_printf (MSG_MISSED_OPTIMIZATION, " ");
}
- gcc_assert (analyze_only);
- return false;
+ dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
}
-
- ++*n_perms;
+ gcc_assert (analyze_only);
+ return false;
}
- if (index == nunits)
+ ++*n_perms;
+ }
+
+ if (index == count)
+ {
+ if (!analyze_only)
{
- if (!analyze_only)
- {
- tree mask_vec = NULL_TREE;
+ tree mask_vec = NULL_TREE;
- if (! noop_p)
- mask_vec = vec_perm_indices_to_tree (mask_type, indices);
+ if (! noop_p)
+ mask_vec = vect_gen_perm_mask_checked (vectype, indices);
- if (second_vec_index == -1)
- second_vec_index = first_vec_index;
+ if (second_vec_index == -1)
+ second_vec_index = first_vec_index;
+ for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+ {
/* Generate the permute statement if necessary. */
- tree first_vec = dr_chain[first_vec_index];
- tree second_vec = dr_chain[second_vec_index];
+ tree first_vec = dr_chain[first_vec_index + ri];
+ tree second_vec = dr_chain[second_vec_index + ri];
stmt_vec_info perm_stmt_info;
if (! noop_p)
{
@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
= perm_stmt_info;
}
-
- index = 0;
- first_vec_index = -1;
- second_vec_index = -1;
- noop_p = true;
}
+
+ index = 0;
+ first_vec_index = -1;
+ second_vec_index = -1;
+ noop_p = true;
}
}