diff options
author | Richard Biener <rguenther@suse.de> | 2024-09-02 15:00:05 +0200 |
---|---|---|
committer | Richard Biener <rguenth@gcc.gnu.org> | 2024-09-04 10:13:30 +0200 |
commit | 7164d982663738c255a1a71a5d4f38dc51c2a3cb (patch) | |
tree | cca3a006f97e07ed7d8fd60efa002c7edeca4426 /gcc | |
parent | 4292297a0f938ffc953422fa246ff00fe345fe3d (diff) | |
download | gcc-7164d982663738c255a1a71a5d4f38dc51c2a3cb.zip gcc-7164d982663738c255a1a71a5d4f38dc51c2a3cb.tar.gz gcc-7164d982663738c255a1a71a5d4f38dc51c2a3cb.tar.bz2 |
Also lower SLP grouped loads with just one consumer
This makes sure to produce interleaving schemes or load-lanes
for single-element interleaving and other permutes that otherwise
would use more than three vectors.
It exposes the latent issue that single-element interleaving with
large gaps can be inefficient - the mitigation in get_group_load_store_type
doesn't trigger when we clear the load permutation.
It also exposes the fact that not all permutes can be lowered
optimally in a vector-length-agnostic way, so I've added an
exception to keep power-of-two sized, contiguous, aligned chunks
unlowered (unless we want load-lanes). The optimal handling
of load/store vectorization is going to continue to be a learning
process.
* tree-vect-slp.cc (vect_lower_load_permutations): Also
process single-use grouped loads.
Avoid lowering contiguous aligned power-of-two sized
chunks; those are better handled by the vector-size-specific
SLP code generation.
* tree-vect-stmts.cc (get_group_load_store_type): Drop
the unrelated requirement of a load permutation for the
single-element interleaving limit.
* gcc.dg/vect/slp-46.c: Remove XFAIL.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-46.c | 2 | ||||
-rw-r--r-- | gcc/tree-vect-slp.cc | 56 | ||||
-rw-r--r-- | gcc/tree-vect-stmts.cc | 1 |
3 files changed, 39 insertions, 20 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c index b44a673..016580e 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-46.c +++ b/gcc/testsuite/gcc.dg/vect/slp-46.c @@ -98,4 +98,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 2b05032..d35e060 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -4315,6 +4315,37 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo, && ld_lanes_lanes == 0) continue; + /* Build the permute to get the original load permutation order. */ + bool contiguous = true; + lane_permutation_t final_perm; + final_perm.create (SLP_TREE_LANES (load)); + for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i) + { + final_perm.quick_push + (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i])); + if (i != 0 + && (SLP_TREE_LOAD_PERMUTATION (load)[i] + != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1)) + contiguous = false; + } + + /* When the load permutation accesses a contiguous unpermuted, + power-of-two aligned and sized chunk leave the load alone. + We can likely (re-)load it more efficiently rather than + extracting it from the larger load. + ??? Long-term some of the lowering should move to where + the vector types involved are fixed. */ + if (ld_lanes_lanes == 0 + && contiguous + && (SLP_TREE_LANES (load) > 1 || loads.size () == 1) + && pow2p_hwi (SLP_TREE_LANES (load)) + && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0 + && group_lanes % SLP_TREE_LANES (load) == 0) + { + final_perm.release (); + continue; + } + /* First build (and possibly re-use) a load node for the unpermuted group. Gaps in the middle and on the end are represented with NULL stmts. 
*/ @@ -4338,13 +4369,6 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo, &max_nunits, matches, &limit, &tree_size, bst_map); - /* Build the permute to get the original load permutation order. */ - lane_permutation_t final_perm; - final_perm.create (SLP_TREE_LANES (load)); - for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i) - final_perm.quick_push - (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i])); - if (ld_lanes_lanes != 0) { /* ??? If this is not in sync with what get_load_store_type @@ -4503,20 +4527,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo, && STMT_VINFO_GROUPED_ACCESS (b0) && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0)) continue; - /* Just one SLP load of a possible group, leave those alone. */ - if (i == firsti + 1) - { - firsti = i; - continue; - } - /* Now we have multiple SLP loads of the same group from + /* Now we have one or multiple SLP loads of the same group from firsti to i - 1. */ - vect_lower_load_permutations (loop_vinfo, bst_map, - make_array_slice (&loads[firsti], - i - firsti)); + if (STMT_VINFO_GROUPED_ACCESS (a0)) + vect_lower_load_permutations (loop_vinfo, bst_map, + make_array_slice (&loads[firsti], + i - firsti)); firsti = i; } - if (firsti < loads.length () - 1) + if (firsti < loads.length () + && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0])) vect_lower_load_permutations (loop_vinfo, bst_map, make_array_slice (&loads[firsti], loads.length () - firsti)); diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 16f6889..25b120c 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2187,7 +2187,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, blow up memory, see PR65518). */ if (loop_vinfo && *memory_access_type == VMAT_CONTIGUOUS - && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () && single_element_p && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))) { |