diff options
author | Richard Sandiford <richard.sandiford@arm.com> | 2025-03-12 09:40:10 +0000 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@arm.com> | 2025-03-12 09:40:10 +0000 |
commit | 855b61b61e63b17cc9770cbe1c5387e4f59c1ffe (patch) | |
tree | d8f0a93f2608c0149ebc1f98183b7647c628eb9a | |
parent | 5cef719596400a712fc72dc54dd4ec8cdb694dd5 (diff) | |
download | gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.zip gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.tar.gz gcc-855b61b61e63b17cc9770cbe1c5387e4f59c1ffe.tar.bz2 |
vect: Fix ncopies when costing SLP reductions [PR116901]
pr110625_[24].c started failing after r15-1329-gd66b820f392aa9a7,
which switched to single def-use cycles for single-lane SLP.
The problem is that we only costed one vector accumulator
operation for an N-vector cycle.
The problem seems to have been latent, and meant that we also
only costed one FADDA for reduc_strict_4.c and reduc_strict_5.c,
even though they need 4 and 6 FADDAs respectively.
I'm not sure why:

    if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
        && ncopies > 1)

was previously only necessary for non-SLP, but the patch preserves
that for safety.
gcc/
PR tree-optimization/116901
* tree-vect-loop.cc (vectorizable_reduction): Set ncopies to
SLP_TREE_NUMBER_OF_VEC_STMTS for SLP.
gcc/testsuite/
PR tree-optimization/116901
* gcc.target/aarch64/sve/reduc_strict_4.c: Turn off costing.
* gcc.target/aarch64/sve/reduc_strict_5.c: Likewise.
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c | 2 |
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c | 2 |
-rw-r--r-- | gcc/tree-vect-loop.cc | 14 |
3 files changed, 9 insertions, 9 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
index 9a12eda..8dad5ee 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
 
 double mat[100][8];
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
index 7c3068f..9e11781 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
 
 double mat[100][12];
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5253362..9413dce 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8180,7 +8180,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     return false;
 
   if (slp_node)
-    ncopies = 1;
+    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   else
     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
 
@@ -8288,7 +8288,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        || reduction_type == CONST_COND_REDUCTION
        || reduction_type == EXTRACT_LAST_REDUCTION)
       && slp_node
-      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
+      && ncopies > 1)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -8297,6 +8297,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
+      && !slp_node
       && ncopies > 1)
     {
       if (dump_enabled_p ())
@@ -8523,11 +8524,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      participating.  When unrolling we want each unrolled iteration to have its
      own reduction accumulator since one of the main goals of unrolling a
      reduction is to reduce the aggregate loop-carried latency.  */
-  if ((ncopies > 1
-       || (slp_node
-           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
-           && SLP_TREE_LANES (slp_node) == 1
-           && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
+  if (ncopies > 1
+      && (!slp_node
+          || (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+              && SLP_TREE_LANES (slp_node) == 1))
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
       && reduc_chain_length == 1
       && loop_vinfo->suggested_unroll_factor == 1)