diff options
author | Richard Sandiford <richard.sandiford@arm.com> | 2018-08-24 13:05:36 +0000 |
---|---|---|
committer | Richard Sandiford <rsandifo@gcc.gnu.org> | 2018-08-24 13:05:36 +0000 |
commit | ab7e60cec1a6f4185b0428f3a2b3e71df0bae533 (patch) | |
tree | c666f7a31ae111b41e904e26bb4df9009c6db203 /gcc/tree-vect-slp.c | |
parent | 1ade64c9d8cd37c8db0a07383189f1719c7164da (diff) | |
download | gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.zip gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.gz gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.bz2 |
Handle SLP permutations for variable-length vectors
The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N. Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.
The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
gcc/
* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
the case in which the permute needs only a single input vector and
repeats for every vector of the result. Extend that case to
handle variable-length vectors.
* tree-vect-stmts.c (vectorizable_load): Update accordingly.
gcc/testsuite/
* gcc.target/aarch64/sve/slp_perm_1.c: New test.
* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
From-SVN: r263832
Diffstat (limited to 'gcc/tree-vect-slp.c')
-rw-r--r-- | gcc/tree-vect-slp.c | 150 |
1 file changed, 88 insertions, 62 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 0a9ce24..0ab7bd8 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; vec_info *vinfo = stmt_info->vinfo; - tree mask_element_type = NULL_TREE, mask_type; int vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); - int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); + unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int mask_element; machine_mode mode; - unsigned HOST_WIDE_INT nunits, const_vf; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; @@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); mode = TYPE_MODE (vectype); - - /* At the moment, all permutations are represented using per-element - indices, so we can't cope with variable vector lengths or - vectorization factors. */ - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) - || !vf.is_constant (&const_vf)) - return false; - - /* The generic VEC_PERM_EXPR code always uses an integral type of the - same size as the vector element being permuted. */ - mask_element_type = lang_hooks.types.type_for_mode - (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); - mask_type = get_vectype_for_scalar_type (mask_element_type); - vec_perm_builder mask (nunits, nunits, 1); - mask.quick_grow (nunits); - vec_perm_indices indices; + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); /* Initialize the vect stmts of NODE to properly insert the generated stmts later. 
*/ @@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, bool noop_p = true; *n_perms = 0; - for (unsigned int j = 0; j < const_vf; j++) + vec_perm_builder mask; + unsigned int nelts_to_build; + unsigned int nvectors_per_build; + bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) + && multiple_p (nunits, group_size)); + if (repeating_p) { - for (int k = 0; k < group_size; k++) + /* A single vector contains a whole number of copies of the node, so: + (a) all permutes can use the same mask; and + (b) the permutes only need a single vector input. */ + mask.new_vector (nunits, group_size, 3); + nelts_to_build = mask.encoded_nelts (); + nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); + } + else + { + /* We need to construct a separate mask for each vector statement. */ + unsigned HOST_WIDE_INT const_nunits, const_vf; + if (!nunits.is_constant (&const_nunits) + || !vf.is_constant (&const_vf)) + return false; + mask.new_vector (const_nunits, const_nunits, 1); + nelts_to_build = const_vf * group_size; + nvectors_per_build = 1; + } + + unsigned int count = mask.encoded_nelts (); + mask.quick_grow (count); + vec_perm_indices indices; + + for (unsigned int j = 0; j < nelts_to_build; j++) + { + unsigned int iter_num = j / group_size; + unsigned int stmt_num = j % group_size; + unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]); + if (repeating_p) { - unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] - + j * DR_GROUP_SIZE (stmt_info)); - vec_index = i / nunits; - mask_element = i % nunits; + first_vec_index = 0; + mask_element = i; + } + else + { + /* Enforced before the loop when !repeating_p. 
*/ + unsigned int const_nunits = nunits.to_constant (); + vec_index = i / const_nunits; + mask_element = i % const_nunits; if (vec_index == first_vec_index || first_vec_index == -1) { @@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, || second_vec_index == -1) { second_vec_index = vec_index; - mask_element += nunits; + mask_element += const_nunits; } else { @@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, return false; } - gcc_assert (mask_element < 2 * nunits); - if (mask_element != index) - noop_p = false; - mask[index++] = mask_element; + gcc_assert (mask_element < 2 * const_nunits); + } + + if (mask_element != index) + noop_p = false; + mask[index++] = mask_element; - if (index == nunits && !noop_p) + if (index == count && !noop_p) + { + indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); + if (!can_vec_perm_const_p (mode, indices)) { - indices.new_vector (mask, 2, nunits); - if (!can_vec_perm_const_p (mode, indices)) + if (dump_enabled_p ()) { - if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "unsupported vect permute { "); + for (i = 0; i < count; ++i) { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "unsupported vect permute { "); - for (i = 0; i < nunits; ++i) - { - dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); - dump_printf (MSG_MISSED_OPTIMIZATION, " "); - } - dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); + dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); + dump_printf (MSG_MISSED_OPTIMIZATION, " "); } - gcc_assert (analyze_only); - return false; + dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); } - - ++*n_perms; + gcc_assert (analyze_only); + return false; } - if (index == nunits) + ++*n_perms; + } + + if (index == count) + { + if (!analyze_only) { - if (!analyze_only) - { - tree mask_vec = NULL_TREE; + tree mask_vec = NULL_TREE; - if (! noop_p) - mask_vec = vec_perm_indices_to_tree (mask_type, indices); + if (! 
noop_p) + mask_vec = vect_gen_perm_mask_checked (vectype, indices); - if (second_vec_index == -1) - second_vec_index = first_vec_index; + if (second_vec_index == -1) + second_vec_index = first_vec_index; + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) + { /* Generate the permute statement if necessary. */ - tree first_vec = dr_chain[first_vec_index]; - tree second_vec = dr_chain[second_vec_index]; + tree first_vec = dr_chain[first_vec_index + ri]; + tree second_vec = dr_chain[second_vec_index + ri]; stmt_vec_info perm_stmt_info; if (! noop_p) { @@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt_info; } - - index = 0; - first_vec_index = -1; - second_vec_index = -1; - noop_p = true; } + + index = 0; + first_vec_index = -1; + second_vec_index = -1; + noop_p = true; } } |