diff options
author | Richard Biener <rguenther@suse.de> | 2023-07-03 13:59:33 +0200 |
---|---|---|
committer | Richard Biener <rguenther@suse.de> | 2023-07-04 09:04:51 +0200 |
commit | 0682a32c026f1e246eb07bb8066abca4636f01d8 (patch) | |
tree | 7398ac782ac4989b785cb1b387c8f9feb6b77658 | |
parent | eed9eeaab30fd7b9e509ec3cf78f5f3c881b0abf (diff) | |
download | gcc-0682a32c026f1e246eb07bb8066abca4636f01d8.zip gcc-0682a32c026f1e246eb07bb8066abca4636f01d8.tar.gz gcc-0682a32c026f1e246eb07bb8066abca4636f01d8.tar.bz2 |
tree-optimization/110310 - move vector epilogue disabling to analysis phase
The following moves the late decision to elide vectorized epilogues to
the analysis phase and also avoids altering the epilogue's niter.
The costing part from vect_determine_partial_vectors_and_peeling is
moved to vect_analyze_loop_costing where we use the main loop
analysis to constrain the epilogue scalar iterations.
I have not tried to integrate this with vect_known_niters_smaller_than_vf.
It seems the for_epilogue_p parameter in
vect_determine_partial_vectors_and_peeling is largely useless and
we could compute that in the function itself.
PR tree-optimization/110310
* tree-vect-loop.cc (vect_determine_partial_vectors_and_peeling):
Move costing part ...
(vect_analyze_loop_costing): ... here. Integrate better
estimate for epilogues from ...
(vect_analyze_loop_2): Call vect_determine_partial_vectors_and_peeling
with actual epilogue status.
* tree-vect-loop-manip.cc (vect_do_peeling): ... here and
avoid cancelling epilogue vectorization.
(vect_update_epilogue_niters): Remove. No longer update
epilogue LOOP_VINFO_NITERS.
* gcc.target/i386/pr110310.c: New testcase.
* gcc.dg/vect/slp-perm-12.c: Disable epilogue vectorization.
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-perm-12.c | 1 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr110310.c | 13 | ||||
-rw-r--r-- | gcc/tree-vect-loop-manip.cc | 104 | ||||
-rw-r--r-- | gcc/tree-vect-loop.cc | 98 |
4 files changed, 102 insertions, 114 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c index 113223a..635fca5 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c @@ -1,5 +1,6 @@ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_pack_trunc } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */ #include "tree-vect.h" diff --git a/gcc/testsuite/gcc.target/i386/pr110310.c b/gcc/testsuite/gcc.target/i386/pr110310.c new file mode 100644 index 0000000..dce388a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110310.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=znver4 -fdump-tree-vect-optimized" } */ + +void foo (int * __restrict a, int *b) +{ + for (int i = 0; i < 20; ++i) + a[i] = b[i] + 42; +} + +/* We should vectorize the main loop with AVX512 and the epilog with SSE. */ + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 16 byte vectors" "vect" } } */ diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 20f570e..6c452e0 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -2882,34 +2882,6 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog) rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); } -/* EPILOGUE_VINFO is an epilogue loop that we now know would need to - iterate exactly CONST_NITERS times. Make a final decision about - whether the epilogue loop should be used, returning true if so. */ - -static bool -vect_update_epilogue_niters (loop_vec_info epilogue_vinfo, - unsigned HOST_WIDE_INT const_niters) -{ - /* Avoid wrap-around when computing const_niters - 1. 
Also reject - using an epilogue loop for a single scalar iteration, even if - we could in principle implement that using partial vectors. */ - unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); - if (const_niters <= gap_niters + 1) - return false; - - /* Install the number of iterations. */ - tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo)); - tree niters_tree = build_int_cst (niters_type, const_niters); - tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1); - - LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree; - LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree; - - /* Decide what to do if the number of epilogue iterations is not - a multiple of the epilogue loop's vectorization factor. */ - return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true); -} - /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped. Return a value that equals: @@ -3039,7 +3011,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int estimated_vf; int prolog_peeling = 0; bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; - bool vect_epilogues_updated_niters = false; /* We currently do not support prolog peeling if the target alignment is not known at compile time. 'vect_gen_prolog_loop_niters' depends on the target alignment being constant. */ @@ -3167,36 +3138,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); edge update_e = NULL, skip_e = NULL; unsigned int lowest_vf = constant_lower_bound (vf); - /* If we know the number of scalar iterations for the main loop we should - check whether after the main loop there are enough iterations left over - for the epilogue. 
*/ - if (vect_epilogues - && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && prolog_peeling >= 0 - && known_eq (vf, lowest_vf)) - { - unsigned HOST_WIDE_INT eiters - = (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); - - eiters -= prolog_peeling; - eiters - = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); - - while (!vect_update_epilogue_niters (epilogue_vinfo, eiters)) - { - delete epilogue_vinfo; - epilogue_vinfo = NULL; - if (loop_vinfo->epilogue_vinfos.length () == 0) - { - vect_epilogues = false; - break; - } - epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; - loop_vinfo->epilogue_vinfos.ordered_remove (0); - } - vect_epilogues_updated_niters = true; - } /* Prolog loop may be skipped. */ bool skip_prolog = (prolog_peeling != 0); /* Skip this loop to epilog when there are not enough iterations to enter this @@ -3473,9 +3414,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, skip_e edge. */ if (skip_vector) { - gcc_assert (update_e != NULL - && skip_e != NULL - && !vect_epilogues_updated_niters); + gcc_assert (update_e != NULL && skip_e != NULL); gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), update_e->dest); tree new_ssa = make_ssa_name (TREE_TYPE (niters)); @@ -3506,28 +3445,25 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, loop and its prologue. */ *advance = niters; - if (!vect_epilogues_updated_niters) - { - /* Subtract the number of iterations performed by the vectorized loop - from the number of total iterations. 
*/ - tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), - before_loop_niters, - niters); - - LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; - LOOP_VINFO_NITERSM1 (epilogue_vinfo) - = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), - epilogue_niters, - build_one_cst (TREE_TYPE (epilogue_niters))); - - /* Decide what to do if the number of epilogue iterations is not - a multiple of the epilogue loop's vectorization factor. - We should have rejected the loop during the analysis phase - if this fails. */ - if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo, - true)) - gcc_unreachable (); - } + /* Subtract the number of iterations performed by the vectorized loop + from the number of total iterations. */ + tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), + before_loop_niters, + niters); + + LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; + LOOP_VINFO_NITERSM1 (epilogue_vinfo) + = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), + epilogue_niters, + build_one_cst (TREE_TYPE (epilogue_niters))); + + /* Decide what to do if the number of epilogue iterations is not + a multiple of the epilogue loop's vectorization factor. + We should have rejected the loop during the analysis phase + if this fails. */ + bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo, + true); + gcc_assert (res); } adjust_vec.release (); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 0a03f56..f39a1ec 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2144,14 +2144,76 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo, /* Only loops that can handle partially-populated vectors can have iteration counts less than the vectorization factor. 
*/ - if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && vect_known_niters_smaller_than_vf (loop_vinfo)) { - if (vect_known_niters_smaller_than_vf (loop_vinfo)) + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: iteration count smaller than " + "vectorization factor.\n"); + return 0; + } + + /* If we know the number of iterations we can do better, for the + epilogue we can also decide whether the main loop leaves us + with enough iterations, prefering a smaller vector epilog then + also possibly used for the case we skip the vector loop. */ + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { + widest_int scalar_niters + = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1; + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + loop_vec_info orig_loop_vinfo + = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + unsigned lowest_vf + = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)); + int prolog_peeling = 0; + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) + prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo); + if (prolog_peeling >= 0 + && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo), + lowest_vf)) + { + unsigned gap + = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0; + scalar_niters = ((scalar_niters - gap - prolog_peeling) + % lowest_vf + gap); + if (scalar_niters == 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: loop never entered\n"); + return 0; + } + } + } + + /* Check that the loop processes at least one full vector. 
*/ + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (known_lt (scalar_niters, vf)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: iteration count smaller than " - "vectorization factor.\n"); + "loop does not have enough iterations " + "to support vectorization.\n"); + return 0; + } + + /* If we need to peel an extra epilogue iteration to handle data + accesses with gaps, check that there are enough scalar iterations + available. + + The check above is redundant with this one when peeling for gaps, + but the distinction is useful for diagnostics. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + && known_le (scalar_niters, vf)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "loop does not have enough iterations " + "to support peeling for gaps.\n"); return 0; } } @@ -2502,31 +2564,6 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))); } - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) - { - /* Check that the loop processes at least one full vector. */ - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo); - if (known_lt (wi::to_widest (scalar_niters), vf)) - return opt_result::failure_at (vect_location, - "loop does not have enough iterations" - " to support vectorization.\n"); - - /* If we need to peel an extra epilogue iteration to handle data - accesses with gaps, check that there are enough scalar iterations - available. - - The check above is redundant with this one when peeling for gaps, - but the distinction is useful for diagnostics. 
*/ - tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - && known_lt (wi::to_widest (scalar_nitersm1), vf)) - return opt_result::failure_at (vect_location, - "loop does not have enough iterations" - " to support peeling for gaps.\n"); - } - LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) && need_peeling_or_partial_vectors_p); @@ -3002,7 +3039,8 @@ start_over: assuming that the loop will be used as a main loop. We will redo this analysis later if we instead decide to use the loop as an epilogue loop. */ - ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false); + ok = vect_determine_partial_vectors_and_peeling + (loop_vinfo, LOOP_VINFO_EPILOGUE_P (loop_vinfo)); if (!ok) return ok; |