aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vect-loop-manip.c
diff options
context:
space:
mode:
author: Richard Sandiford <richard.sandiford@arm.com> 2021-07-13 10:17:43 +0100
committer: Richard Sandiford <richard.sandiford@arm.com> 2021-07-13 10:17:43 +0100
commit: 1583b8bff0be7e41aa721dde79f90ca0763bd4e2 (patch)
tree: d2f95d2ac56f2508877f03cd54c3be109b1fed4d /gcc/tree-vect-loop-manip.c
parent: 7670b6633e51afbbc4b3c8a5775accf7f2d887af (diff)
download: gcc-1583b8bff0be7e41aa721dde79f90ca0763bd4e2.zip
gcc-1583b8bff0be7e41aa721dde79f90ca0763bd4e2.tar.gz
gcc-1583b8bff0be7e41aa721dde79f90ca0763bd4e2.tar.bz2
vect: Reuse reduction accumulators between loops
This patch adds support for reusing a main loop's reduction accumulator in an epilogue loop. This in turn lets the loops share a single piece of vector->scalar reduction code. The patch has the following restrictions: (1) The epilogue reduction can only operate on a single vector (e.g. ncopies must be 1 for non-SLP reductions, and the group size must be <= the element count for SLP reductions). (2) Both loops must use the same vector mode for their accumulators. This means that the patch is restricted to targets that support --param vect-partial-vector-usage=1. (3) The reduction must be a standard “tree code” reduction. However, these restrictions could be lifted in future. For example, if the main loop operates on 128-bit vectors and the epilogue loop operates on 64-bit vectors, we could in future reduce the 128-bit vector by one stage and use the 64-bit result as the starting point for the epilogue result. The patch tries to handle chained SLP reductions, unchained SLP reductions and non-SLP reductions. It also handles cases in which the epilogue loop is entered directly (rather than via the main loop) and cases in which the epilogue loop can be skipped. vect_get_main_loop_result is a bit more general than the current patch needs. gcc/ * tree-vectorizer.h (vect_reusable_accumulator): New structure. (_loop_vec_info::main_loop_edge): New field. (_loop_vec_info::skip_main_loop_edge): Likewise. (_loop_vec_info::skip_this_loop_edge): Likewise. (_loop_vec_info::reusable_accumulators): Likewise. (_stmt_vec_info::reduc_scalar_results): Likewise. (_stmt_vec_info::reused_accumulator): Likewise. (vect_get_main_loop_result): Declare. * tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize reduc_scalar_results. (vec_info::free_stmt_vec_info): Free reduc_scalar_results. * tree-vect-loop-manip.c (vect_get_main_loop_result): New function. (vect_do_peeling): Fill an epilogue loop's main_loop_edge, skip_main_loop_edge and skip_this_loop_edge fields. 
* tree-vect-loop.c (INCLUDE_ALGORITHM): Define. (vect_emit_reduction_init_stmts): New function. (get_initial_def_for_reduction): Use it. (get_initial_defs_for_reduction): Likewise. Change the vinfo parameter to a loop_vec_info. (vect_create_epilog_for_reduction): Store the scalar results in the reduc_info. If an epilogue loop is reusing an accumulator from the main loop, and if the epilogue loop can also be skipped, try to place the reduction code in the join block. Record accumulators that could potentially be reused by epilogue loops. (vect_transform_cycle_phi): When vectorizing epilogue loops, try to reuse accumulators from the main loop. Record the initial value in reduc_info for non-SLP reductions too. gcc/testsuite/ * gcc.target/aarch64/sve/reduc_9.c: New test. * gcc.target/aarch64/sve/reduc_9_run.c: Likewise. * gcc.target/aarch64/sve/reduc_10.c: Likewise. * gcc.target/aarch64/sve/reduc_10_run.c: Likewise. * gcc.target/aarch64/sve/reduc_11.c: Likewise. * gcc.target/aarch64/sve/reduc_11_run.c: Likewise. * gcc.target/aarch64/sve/reduc_12.c: Likewise. * gcc.target/aarch64/sve/reduc_12_run.c: Likewise. * gcc.target/aarch64/sve/reduc_13.c: Likewise. * gcc.target/aarch64/sve/reduc_13_run.c: Likewise. * gcc.target/aarch64/sve/reduc_14.c: Likewise. * gcc.target/aarch64/sve/reduc_14_run.c: Likewise. * gcc.target/aarch64/sve/reduc_15.c: Likewise. * gcc.target/aarch64/sve/reduc_15_run.c: Likewise.
Diffstat (limited to 'gcc/tree-vect-loop-manip.c')
-rw-r--r-- gcc/tree-vect-loop-manip.c 26
1 files changed, 26 insertions, 0 deletions
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a..c29ffb3 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,28 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
}
+/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
+ Return a value that equals:
+
+ - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
+ - SKIP_VALUE when the main loop is skipped.
+
+ The returned value is the result of a new PHI node that is created in
+ the destination block of LOOP_VINFO->main_loop_edge.  The PHI takes
+ MAIN_LOOP_VALUE on LOOP_VINFO->main_loop_edge and SKIP_VALUE on
+ LOOP_VINFO->skip_main_loop_edge; its result is a fresh SSA name with
+ the type of MAIN_LOOP_VALUE.  Both PHI arguments are added with
+ UNKNOWN_LOCATION.
+
+ NOTE(review): only main_loop_edge is asserted to be set.
+ skip_main_loop_edge is used unchecked; presumably it is always set
+ together with main_loop_edge and enters the same block -- confirm
+ against the code that fills in these fields.  */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+ tree skip_value)
+{
+ gcc_assert (loop_vinfo->main_loop_edge);
+
+ tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+ basic_block bb = loop_vinfo->main_loop_edge->dest;
+ gphi *new_phi = create_phi_node (phi_result, bb);
+ add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+ UNKNOWN_LOCATION);
+ add_phi_arg (new_phi, skip_value,
+ loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+ return phi_result;
+}
+
/* Function vect_do_peeling.
Input:
@@ -2986,6 +3008,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
skip_vector ? anchor : guard_bb,
prob_epilog.invert (),
irred_flag);
+ if (vect_epilogues)
+ epilogue_vinfo->skip_this_loop_edge = guard_e;
slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
single_exit (epilog));
/* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3081,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
UNKNOWN_LOCATION);
niters = PHI_RESULT (new_phi);
+ epilogue_vinfo->main_loop_edge = update_e;
+ epilogue_vinfo->skip_main_loop_edge = skip_e;
}
/* Set ADVANCE to the number of iterations performed by the previous