diff options
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-perm-12.c | 1 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr110310.c | 13 | ||||
-rw-r--r-- | gcc/tree-vect-loop-manip.cc | 104 | ||||
-rw-r--r-- | gcc/tree-vect-loop.cc | 98 |
4 files changed, 102 insertions, 114 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c index 113223a..635fca5 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c @@ -1,5 +1,6 @@ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_pack_trunc } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */ #include "tree-vect.h" diff --git a/gcc/testsuite/gcc.target/i386/pr110310.c b/gcc/testsuite/gcc.target/i386/pr110310.c new file mode 100644 index 0000000..dce388a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110310.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=znver4 -fdump-tree-vect-optimized" } */ + +void foo (int * __restrict a, int *b) +{ + for (int i = 0; i < 20; ++i) + a[i] = b[i] + 42; +} + +/* We should vectorize the main loop with AVX512 and the epilog with SSE. */ + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 16 byte vectors" "vect" } } */ diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 20f570e..6c452e0 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -2882,34 +2882,6 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog) rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); } -/* EPILOGUE_VINFO is an epilogue loop that we now know would need to - iterate exactly CONST_NITERS times. Make a final decision about - whether the epilogue loop should be used, returning true if so. */ - -static bool -vect_update_epilogue_niters (loop_vec_info epilogue_vinfo, - unsigned HOST_WIDE_INT const_niters) -{ - /* Avoid wrap-around when computing const_niters - 1. 
Also reject - using an epilogue loop for a single scalar iteration, even if - we could in principle implement that using partial vectors. */ - unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); - if (const_niters <= gap_niters + 1) - return false; - - /* Install the number of iterations. */ - tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo)); - tree niters_tree = build_int_cst (niters_type, const_niters); - tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1); - - LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree; - LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree; - - /* Decide what to do if the number of epilogue iterations is not - a multiple of the epilogue loop's vectorization factor. */ - return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true); -} - /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped. Return a value that equals: @@ -3039,7 +3011,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int estimated_vf; int prolog_peeling = 0; bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; - bool vect_epilogues_updated_niters = false; /* We currently do not support prolog peeling if the target alignment is not known at compile time. 'vect_gen_prolog_loop_niters' depends on the target alignment being constant. */ @@ -3167,36 +3138,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); edge update_e = NULL, skip_e = NULL; unsigned int lowest_vf = constant_lower_bound (vf); - /* If we know the number of scalar iterations for the main loop we should - check whether after the main loop there are enough iterations left over - for the epilogue. 
*/ - if (vect_epilogues - && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && prolog_peeling >= 0 - && known_eq (vf, lowest_vf)) - { - unsigned HOST_WIDE_INT eiters - = (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); - - eiters -= prolog_peeling; - eiters - = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); - - while (!vect_update_epilogue_niters (epilogue_vinfo, eiters)) - { - delete epilogue_vinfo; - epilogue_vinfo = NULL; - if (loop_vinfo->epilogue_vinfos.length () == 0) - { - vect_epilogues = false; - break; - } - epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; - loop_vinfo->epilogue_vinfos.ordered_remove (0); - } - vect_epilogues_updated_niters = true; - } /* Prolog loop may be skipped. */ bool skip_prolog = (prolog_peeling != 0); /* Skip this loop to epilog when there are not enough iterations to enter this @@ -3473,9 +3414,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, skip_e edge. */ if (skip_vector) { - gcc_assert (update_e != NULL - && skip_e != NULL - && !vect_epilogues_updated_niters); + gcc_assert (update_e != NULL && skip_e != NULL); gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), update_e->dest); tree new_ssa = make_ssa_name (TREE_TYPE (niters)); @@ -3506,28 +3445,25 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, loop and its prologue. */ *advance = niters; - if (!vect_epilogues_updated_niters) - { - /* Subtract the number of iterations performed by the vectorized loop - from the number of total iterations. 
*/ - tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), - before_loop_niters, - niters); - - LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; - LOOP_VINFO_NITERSM1 (epilogue_vinfo) - = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), - epilogue_niters, - build_one_cst (TREE_TYPE (epilogue_niters))); - - /* Decide what to do if the number of epilogue iterations is not - a multiple of the epilogue loop's vectorization factor. - We should have rejected the loop during the analysis phase - if this fails. */ - if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo, - true)) - gcc_unreachable (); - } + /* Subtract the number of iterations performed by the vectorized loop + from the number of total iterations. */ + tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), + before_loop_niters, + niters); + + LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; + LOOP_VINFO_NITERSM1 (epilogue_vinfo) + = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), + epilogue_niters, + build_one_cst (TREE_TYPE (epilogue_niters))); + + /* Decide what to do if the number of epilogue iterations is not + a multiple of the epilogue loop's vectorization factor. + We should have rejected the loop during the analysis phase + if this fails. */ + bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo, + true); + gcc_assert (res); } adjust_vec.release (); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 0a03f56..f39a1ec 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2144,14 +2144,76 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo, /* Only loops that can handle partially-populated vectors can have iteration counts less than the vectorization factor. 
- if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && vect_known_niters_smaller_than_vf (loop_vinfo)) { - if (vect_known_niters_smaller_than_vf (loop_vinfo)) + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: iteration count smaller than " + "vectorization factor.\n"); + return 0; + } + + /* If we know the number of iterations we can do better, for the + epilogue we can also decide whether the main loop leaves us + with enough iterations, preferring a smaller vector epilog then + also possibly used for the case we skip the vector loop. */ + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { + widest_int scalar_niters + = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1; + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + loop_vec_info orig_loop_vinfo + = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + unsigned lowest_vf + = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)); + int prolog_peeling = 0; + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) + prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo); + if (prolog_peeling >= 0 + && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo), + lowest_vf)) + { + unsigned gap + = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0; + scalar_niters = ((scalar_niters - gap - prolog_peeling) + % lowest_vf + gap); + if (scalar_niters == 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: loop never entered\n"); + return 0; + } + } + } + + /* Check that the loop processes at least one full vector. 
*/ + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (known_lt (scalar_niters, vf)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: iteration count smaller than " - "vectorization factor.\n"); + "loop does not have enough iterations " + "to support vectorization.\n"); + return 0; + } + + /* If we need to peel an extra epilogue iteration to handle data + accesses with gaps, check that there are enough scalar iterations + available. + + The check above is redundant with this one when peeling for gaps, + but the distinction is useful for diagnostics. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + && known_le (scalar_niters, vf)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "loop does not have enough iterations " + "to support peeling for gaps.\n"); return 0; } } @@ -2502,31 +2564,6 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo, LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))); } - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) - { - /* Check that the loop processes at least one full vector. */ - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo); - if (known_lt (wi::to_widest (scalar_niters), vf)) - return opt_result::failure_at (vect_location, - "loop does not have enough iterations" - " to support vectorization.\n"); - - /* If we need to peel an extra epilogue iteration to handle data - accesses with gaps, check that there are enough scalar iterations - available. - - The check above is redundant with this one when peeling for gaps, - but the distinction is useful for diagnostics. 
*/ - tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - && known_lt (wi::to_widest (scalar_nitersm1), vf)) - return opt_result::failure_at (vect_location, - "loop does not have enough iterations" - " to support peeling for gaps.\n"); - } - LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) && need_peeling_or_partial_vectors_p); @@ -3002,7 +3039,8 @@ start_over: assuming that the loop will be used as a main loop. We will redo this analysis later if we instead decide to use the loop as an epilogue loop. */ - ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false); + ok = vect_determine_partial_vectors_and_peeling + (loop_vinfo, LOOP_VINFO_EPILOGUE_P (loop_vinfo)); if (!ok) return ok; |