aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Richard Sandiford <richard.sandiford@arm.com> 2025-03-12 09:40:10 +0000
committer: Richard Sandiford <richard.sandiford@arm.com> 2025-03-12 09:40:10 +0000
commit: 855b61b61e63b17cc9770cbe1c5387e4f59c1ffe (patch)
tree: d8f0a93f2608c0149ebc1f98183b7647c628eb9a
parent: 5cef719596400a712fc72dc54dd4ec8cdb694dd5 (diff)
download: gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.zip
          gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.tar.gz
          gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.tar.bz2
vect: Fix ncopies when costing SLP reductions [PR116901]

pr110625_[24].c started failing after r15-1329-gd66b820f392aa9a7,
which switched to single def-use cycles for single-lane SLP.
The problem is that we only costed one vector accumulator
operation for an N-vector cycle.

The problem seems to have been latent, and meant that we also
only costed one FADDA for reduc_strict_4.c and reduc_strict_5.c,
even though they need 4 and 6 FADDAs respectively.

I'm not sure why:

  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
      && ncopies > 1)

was previously only necessary for non-SLP, but the patch
preserves that for safety.

gcc/
	PR tree-optimization/116901
	* tree-vect-loop.cc (vectorizable_reduction): Set ncopies to
	SLP_TREE_NUMBER_OF_VEC_STMTS for SLP.

gcc/testsuite/
	PR tree-optimization/116901
	* gcc.target/aarch64/sve/reduc_strict_4.c: Turn off costing.
	* gcc.target/aarch64/sve/reduc_strict_5.c: Likewise.

-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c |  2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c |  2
-rw-r--r--  gcc/tree-vect-loop.cc                                 | 14
3 files changed, 9 insertions, 9 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
index 9a12eda..8dad5ee 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
double mat[100][8];
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
index 7c3068f..9e11781 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
double mat[100][12];
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5253362..9413dce 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8180,7 +8180,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
if (slp_node)
- ncopies = 1;
+ ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
else
ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
@@ -8288,7 +8288,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
|| reduction_type == CONST_COND_REDUCTION
|| reduction_type == EXTRACT_LAST_REDUCTION)
&& slp_node
- && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
+ && ncopies > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -8297,6 +8297,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
+ && !slp_node
&& ncopies > 1)
{
if (dump_enabled_p ())
@@ -8523,11 +8524,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
participating. When unrolling we want each unrolled iteration to have its
own reduction accumulator since one of the main goals of unrolling a
reduction is to reduce the aggregate loop-carried latency. */
- if ((ncopies > 1
- || (slp_node
- && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
- && SLP_TREE_LANES (slp_node) == 1
- && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
+ if (ncopies > 1
+ && (!slp_node
+ || (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+ && SLP_TREE_LANES (slp_node) == 1))
&& (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
&& reduc_chain_length == 1
&& loop_vinfo->suggested_unroll_factor == 1)