aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Richard Biener <rguenther@suse.de> 2024-02-23 11:45:50 +0100
committer: Richard Biener <rguenther@suse.de> 2024-06-04 10:13:30 +0200
commit: d93353e6423ecaaae9fa47d0935caafd9abfe4de (patch)
tree: 8e85a2b6432e24b2a5cb84e4566ffa31ffd90bed
parent: 0592000aeed84d47040946a125154b3c46d7c84f (diff)
download: gcc-d93353e6423ecaaae9fa47d0935caafd9abfe4de.zip
gcc-d93353e6423ecaaae9fa47d0935caafd9abfe4de.tar.gz
gcc-d93353e6423ecaaae9fa47d0935caafd9abfe4de.tar.bz2
Do single-lane SLP discovery for reductions
The following performs single-lane SLP discovery for reductions. It requires a fixup for outer loop vectorization, where a check for multiple types needs adjustment, as otherwise bogus pointer IV increments happen when there are multiple copies of vector stmts in the inner loop.

For the reduction epilog handling this extends the optimized path to cover the trivial single-lane SLP reduction case.

The fix for PR65518 implemented in vect_grouped_load_supported for non-SLP needs a SLP counterpart that I put in get_group_load_store_type.

I've decided to adjust three testcases to account for the single-lane SLP instances that now appear, instead of not dumping "vectorizing stmts using SLP" for single-lane instances, as that also requires testsuite adjustments.

	* tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane discoveries are reduction chains and need special backedge treatment.
	(vect_analyze_slp): Fall back to single-lane SLP discovery for reductions. Make sure to try single-lane SLP reduction for all reductions as fallback.
	* tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow direct opcode and shift reduction also for SLP reductions with a single lane.
	* tree-vect-stmts.cc (get_group_load_store_type): For SLP also check for the PR65518 single-element interleaving case as done in vect_grouped_load_supported.
	(vectorizable_load): Avoid outer loop SLP vectorization with multi-copy vector stmts in the inner loop.
	(vectorizable_store): Likewise.

	* gcc.dg/vect/slp-24.c: Expect another SLP instance for the reduction.
	* gcc.dg/vect/slp-24-big-array.c: Likewise.
	* gcc.dg/vect/slp-reduc-6.c: Remove scan for zero SLP instances.
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-24-big-array.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-24.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-reduc-6.c | 1
-rw-r--r-- gcc/tree-vect-loop.cc | 4
-rw-r--r-- gcc/tree-vect-slp.cc | 71
-rw-r--r-- gcc/tree-vect-stmts.cc | 24
6 files changed, 80 insertions, 24 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
index 5eaea96..63f7443 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
@@ -92,4 +92,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c b/gcc/testsuite/gcc.dg/vect/slp-24.c
index 59178f2..7814d7c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-24.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-24.c
@@ -78,4 +78,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
index 1fd15aa..5566705 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
@@ -45,6 +45,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_int_add || { ! { vect_unpack || vect_strided2 } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a08357a..06292ed 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6504,7 +6504,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
/* 2.3 Create the reduction code, using one of the three schemes described
above. In SLP we simply need to extract all the elements from the
vector (without reducing them), so we use scalar shifts. */
- else if (reduc_fn != IFN_LAST && !slp_reduc)
+ else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
{
tree tmp;
tree vec_elem_type;
@@ -6674,7 +6674,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
reduc_inputs[0] = new_temp;
- if (reduce_with_shift && !slp_reduc)
+ if (reduce_with_shift && (!slp_reduc || group_size == 1))
{
int element_bitsize = tree_to_uhwi (bitsize);
/* Enforced by vectorizable_reduction, which disallows SLP reductions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 11ec820..ba1190c 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1911,7 +1911,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
/* Reduction chain backedge defs are filled manually.
??? Need a better way to identify a SLP reduction chain PHI.
Or a better overall way to SLP match those. */
- if (all_same && def_type == vect_reduction_def)
+ if (stmts.length () > 1
+ && all_same && def_type == vect_reduction_def)
skip_args[loop_latch_edge (loop)->dest_idx] = true;
}
else if (def_type != vect_internal_def)
@@ -3909,9 +3910,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
/* Find SLP sequences starting from groups of reductions. */
- if (loop_vinfo->reductions.length () > 1)
+ if (loop_vinfo->reductions.length () > 0)
{
- /* Collect reduction statements. */
+ /* Collect reduction statements we can combine into
+ a SLP reduction. */
vec<stmt_vec_info> scalar_stmts;
scalar_stmts.create (loop_vinfo->reductions.length ());
for (auto next_info : loop_vinfo->reductions)
@@ -3924,23 +3926,58 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
reduction path. In that case we'd have to reverse
engineer that conversion stmt following the chain using
reduc_idx and from the PHI using reduc_def. */
- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
- /* Do not discover SLP reductions for lane-reducing ops, that
- will fail later. */
- && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
- || !lane_reducing_op_p (gimple_assign_rhs_code (g))))
- scalar_stmts.quick_push (next_info);
+ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+ {
+ /* Do not discover SLP reductions combining lane-reducing
+ ops, that will fail later. */
+ if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+ || !lane_reducing_op_p (gimple_assign_rhs_code (g)))
+ scalar_stmts.quick_push (next_info);
+ else
+ {
+ /* Do SLP discovery for single-lane reductions. */
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (next_info);
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ }
}
- if (scalar_stmts.length () > 1)
+ /* Save for re-processing on failure. */
+ vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ if (scalar_stmts.length () <= 1
+ || !vect_build_slp_instance (loop_vinfo,
+ slp_inst_kind_reduc_group,
+ scalar_stmts, roots, remain,
+ max_tree_size, &limit, bst_map,
+ NULL))
{
- vec<stmt_vec_info> roots = vNULL;
- vec<tree> remain = vNULL;
- vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
- scalar_stmts, roots, remain,
- max_tree_size, &limit, bst_map, NULL);
+ if (scalar_stmts.length () <= 1)
+ scalar_stmts.release ();
+ /* Do SLP discovery for single-lane reductions. */
+ for (auto stmt_info : saved_stmts)
+ {
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ saved_stmts.release ();
}
- else
- scalar_stmts.release ();
}
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 935d80f..b26cc74 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2160,6 +2160,23 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
overrun_p = true;
}
+
+ /* If this is single-element interleaving with an element
+ distance that leaves unused vector loads around punt - we
+ at least create very sub-optimal code in that case (and
+ blow up memory, see PR65518). */
+ if (loop_vinfo
+ && *memory_access_type == VMAT_CONTIGUOUS
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && single_element_p
+ && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads\n");
+ return false;
+ }
}
}
else
@@ -8202,7 +8219,9 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+ if (loop
+ && nested_in_vect_loop_p (loop, stmt_info)
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9945,7 +9964,8 @@ vectorizable_load (vec_info *vinfo,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (nested_in_vect_loop && ncopies > 1)
+ if (nested_in_vect_loop
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,