From 62acc72a957b561462a436fcb2d6caac5b363190 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 21 Jul 2021 07:50:20 +0100 Subject: unroll: Avoid unnecessary tail loops for constant niters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unroll and jam can decide to unroll the outer loop of a nest like: for (int j = 0; j < n; ++j) for (int i = 0; i < n; ++i) x[i] += __builtin_expf (y[j][i]); It then uses a tail loop to handle any left-over iterations. However, the code is structured so that this tail loop is always used. If n is a multiple of the unroll factor UF, the final UF iterations will use the tail loop rather than the unrolled loop. “Fixing” that for variable loop counts would mean introducing another runtime test: a branch around the tail loop if there are no more iterations. There's at least an argument that the overhead of doing that test might not pay for itself. But we use this structure even if the iteration count is provably a multiple of UF at compile time. E.g. with s/n/100/ and an unroll factor of 2, the first 98 iterations use the unrolled loop and the final 2 iterations use the original loop. This patch makes the unroller avoid a tail loop in that case. The end result seemed easier to follow if variables were declared at the point of initialisation, so that it's more obvious which ones are meaningful even when there's no tail loop. gcc/ * tree-ssa-loop-manip.c (determine_exit_conditions): Return a null exit condition if no tail loop is needed, and if the original exit condition should therefore be kept as-is. (tree_transform_and_unroll_loop): Handle that case here too. gcc/testsuite/ * gcc.dg/unroll-9.c: New test/ --- gcc/tree-ssa-loop-manip.c | 306 +++++++++++++++++++++++++--------------------- 1 file changed, 164 insertions(+), 142 deletions(-) (limited to 'gcc/tree-ssa-loop-manip.c') diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c index 28ae131..41f9872 100644 --- a/gcc/tree-ssa-loop-manip.c +++ b/gcc/tree-ssa-loop-manip.c @@ -997,8 +997,10 @@ can_unroll_loop_p (class loop *loop, unsigned factor, /* Determines the conditions that control execution of LOOP unrolled FACTOR times. DESC is number of iterations of LOOP. ENTER_COND is set to condition that must be true if the main loop can be entered. + If the loop does not always iterate an exact multiple of FACTOR times, EXIT_BASE, EXIT_STEP, EXIT_CMP and EXIT_BOUND are set to values describing - how the exit from the unrolled loop should be controlled. */ + how the exit from the unrolled loop should be controlled. Otherwise, + the trees are set to null and EXIT_CMP is set to ERROR_MARK. */ static void determine_exit_conditions (class loop *loop, class tree_niter_desc *desc, @@ -1079,6 +1081,16 @@ determine_exit_conditions (class loop *loop, class tree_niter_desc *desc, assum = fold_build2 (cmp, boolean_type_node, base, bound); cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, assum, cond); + if (integer_nonzerop (cond) + && integer_zerop (desc->may_be_zero)) + { + /* Convert the latch count to an iteration count. */ + tree niter = fold_build2 (PLUS_EXPR, type, desc->niter, + build_one_cst (type)); + if (multiple_of_p (type, niter, bigstep)) + return; + } + cond = force_gimple_operand (unshare_expr (cond), &stmts, false, NULL_TREE); if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); @@ -1234,137 +1246,138 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, transform_callback transform, void *data) { - gcond *exit_if; - tree ctr_before, ctr_after; - tree enter_main_cond, exit_base, exit_step, exit_bound; - enum tree_code exit_cmp; - gphi *phi_old_loop, *phi_new_loop, *phi_rest; - gphi_iterator psi_old_loop, psi_new_loop; - tree init, next, new_init; - class loop *new_loop; - basic_block rest, exit_bb; - edge old_entry, new_entry, old_latch, precond_edge, new_exit; - edge new_nonexit, e; - gimple_stmt_iterator bsi; - use_operand_p op; - bool ok; - unsigned i; - profile_probability prob, prob_entry, scale_unrolled; - profile_count freq_e, freq_h; gcov_type new_est_niter = niter_for_unrolled_loop (loop, factor); unsigned irr = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; - auto_vec to_remove; + enum tree_code exit_cmp; + tree enter_main_cond, exit_base, exit_step, exit_bound; determine_exit_conditions (loop, desc, factor, &enter_main_cond, &exit_base, &exit_step, &exit_cmp, &exit_bound); + bool single_loop_p = !exit_base; /* Let us assume that the unrolled loop is quite likely to be entered. */ + profile_probability prob_entry; if (integer_nonzerop (enter_main_cond)) prob_entry = profile_probability::always (); else prob_entry = profile_probability::guessed_always () .apply_scale (PROB_UNROLLED_LOOP_ENTERED, 100); - /* The values for scales should keep profile consistent, and somewhat close - to correct. - - TODO: The current value of SCALE_REST makes it appear that the loop that - is created by splitting the remaining iterations of the unrolled loop is - executed the same number of times as the original loop, and with the same - frequencies, which is obviously wrong. This does not appear to cause - problems, so we do not bother with fixing it for now. To make the profile - correct, we would need to change the probability of the exit edge of the - loop, and recompute the distribution of frequencies in its body because - of this change (scale the frequencies of blocks before and after the exit - by appropriate factors). */ - scale_unrolled = prob_entry; - - new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry, - prob_entry.invert (), scale_unrolled, - profile_probability::guessed_always (), - true); - gcc_assert (new_loop != NULL); - update_ssa (TODO_update_ssa); - - /* Prepare the cfg and update the phi nodes. Move the loop exit to the - loop latch (and make its condition dummy, for the moment). */ - rest = loop_preheader_edge (new_loop)->src; - precond_edge = single_pred_edge (rest); - split_edge (loop_latch_edge (loop)); - exit_bb = single_pred (loop->latch); - - /* Since the exit edge will be removed, the frequency of all the blocks - in the loop that are dominated by it must be scaled by - 1 / (1 - exit->probability). */ - if (exit->probability.initialized_p ()) - scale_dominated_blocks_in_loop (loop, exit->src, - /* We are scaling up here so probability - does not fit. */ - loop->header->count, - loop->header->count - - loop->header->count.apply_probability - (exit->probability)); - - bsi = gsi_last_bb (exit_bb); - exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node, - integer_zero_node, - NULL_TREE, NULL_TREE); - - gsi_insert_after (&bsi, exit_if, GSI_NEW_STMT); - new_exit = make_edge (exit_bb, rest, EDGE_FALSE_VALUE | irr); - rescan_loop_exit (new_exit, true, false); - - /* Set the probability of new exit to the same of the old one. Fix - the frequency of the latch block, by scaling it back by - 1 - exit->probability. */ - new_exit->probability = exit->probability; - new_nonexit = single_pred_edge (loop->latch); - new_nonexit->probability = exit->probability.invert (); - new_nonexit->flags = EDGE_TRUE_VALUE; - if (new_nonexit->probability.initialized_p ()) - scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability); - - old_entry = loop_preheader_edge (loop); - new_entry = loop_preheader_edge (new_loop); - old_latch = loop_latch_edge (loop); - for (psi_old_loop = gsi_start_phis (loop->header), - psi_new_loop = gsi_start_phis (new_loop->header); - !gsi_end_p (psi_old_loop); - gsi_next (&psi_old_loop), gsi_next (&psi_new_loop)) + gcond *exit_if = nullptr; + class loop *new_loop = nullptr; + basic_block rest; + edge new_exit; + if (!single_loop_p) { - phi_old_loop = psi_old_loop.phi (); - phi_new_loop = psi_new_loop.phi (); - - init = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_entry); - op = PHI_ARG_DEF_PTR_FROM_EDGE (phi_new_loop, new_entry); - gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op))); - next = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_latch); - - /* Prefer using original variable as a base for the new ssa name. - This is necessary for virtual ops, and useful in order to avoid - losing debug info for real ops. */ - if (TREE_CODE (next) == SSA_NAME - && useless_type_conversion_p (TREE_TYPE (next), - TREE_TYPE (init))) - new_init = copy_ssa_name (next); - else if (TREE_CODE (init) == SSA_NAME - && useless_type_conversion_p (TREE_TYPE (init), - TREE_TYPE (next))) - new_init = copy_ssa_name (init); - else if (useless_type_conversion_p (TREE_TYPE (next), TREE_TYPE (init))) - new_init = make_temp_ssa_name (TREE_TYPE (next), NULL, "unrinittmp"); - else - new_init = make_temp_ssa_name (TREE_TYPE (init), NULL, "unrinittmp"); + /* The values for scales should keep profile consistent, and somewhat + close to correct. + + TODO: The current value of SCALE_REST makes it appear that the loop + that is created by splitting the remaining iterations of the unrolled + loop is executed the same number of times as the original loop, and + with the same frequencies, which is obviously wrong. This does not + appear to cause problems, so we do not bother with fixing it for now. + To make the profile correct, we would need to change the probability + of the exit edge of the loop, and recompute the distribution of + frequencies in its body because of this change (scale the frequencies + of blocks before and after the exit by appropriate factors). */ + profile_probability scale_unrolled = prob_entry; + new_loop = loop_version (loop, enter_main_cond, NULL, prob_entry, + prob_entry.invert (), scale_unrolled, + profile_probability::guessed_always (), + true); + gcc_assert (new_loop != NULL); + update_ssa (TODO_update_ssa); + + /* Prepare the cfg and update the phi nodes. Move the loop exit to the + loop latch (and make its condition dummy, for the moment). */ + rest = loop_preheader_edge (new_loop)->src; + edge precond_edge = single_pred_edge (rest); + split_edge (loop_latch_edge (loop)); + basic_block exit_bb = single_pred (loop->latch); + + /* Since the exit edge will be removed, the frequency of all the blocks + in the loop that are dominated by it must be scaled by + 1 / (1 - exit->probability). */ + if (exit->probability.initialized_p ()) + scale_dominated_blocks_in_loop (loop, exit->src, + /* We are scaling up here so + probability does not fit. */ + loop->header->count, + loop->header->count + - loop->header->count.apply_probability + (exit->probability)); + + gimple_stmt_iterator bsi = gsi_last_bb (exit_bb); + exit_if = gimple_build_cond (EQ_EXPR, integer_zero_node, + integer_zero_node, + NULL_TREE, NULL_TREE); + + gsi_insert_after (&bsi, exit_if, GSI_NEW_STMT); + new_exit = make_edge (exit_bb, rest, EDGE_FALSE_VALUE | irr); + rescan_loop_exit (new_exit, true, false); + + /* Set the probability of new exit to the same of the old one. Fix + the frequency of the latch block, by scaling it back by + 1 - exit->probability. */ + new_exit->probability = exit->probability; + edge new_nonexit = single_pred_edge (loop->latch); + new_nonexit->probability = exit->probability.invert (); + new_nonexit->flags = EDGE_TRUE_VALUE; + if (new_nonexit->probability.initialized_p ()) + scale_bbs_frequencies (&loop->latch, 1, new_nonexit->probability); + + edge old_entry = loop_preheader_edge (loop); + edge new_entry = loop_preheader_edge (new_loop); + edge old_latch = loop_latch_edge (loop); + for (gphi_iterator psi_old_loop = gsi_start_phis (loop->header), + psi_new_loop = gsi_start_phis (new_loop->header); + !gsi_end_p (psi_old_loop); + gsi_next (&psi_old_loop), gsi_next (&psi_new_loop)) + { + gphi *phi_old_loop = psi_old_loop.phi (); + gphi *phi_new_loop = psi_new_loop.phi (); + + tree init = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_entry); + use_operand_p op + = PHI_ARG_DEF_PTR_FROM_EDGE (phi_new_loop, new_entry); + gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op))); + tree next = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_latch); + + /* Prefer using original variable as a base for the new ssa name. + This is necessary for virtual ops, and useful in order to avoid + losing debug info for real ops. */ + tree new_init; + if (TREE_CODE (next) == SSA_NAME + && useless_type_conversion_p (TREE_TYPE (next), + TREE_TYPE (init))) + new_init = copy_ssa_name (next); + else if (TREE_CODE (init) == SSA_NAME + && useless_type_conversion_p (TREE_TYPE (init), + TREE_TYPE (next))) + new_init = copy_ssa_name (init); + else if (useless_type_conversion_p (TREE_TYPE (next), + TREE_TYPE (init))) + new_init = make_temp_ssa_name (TREE_TYPE (next), NULL, + "unrinittmp"); + else + new_init = make_temp_ssa_name (TREE_TYPE (init), NULL, + "unrinittmp"); - phi_rest = create_phi_node (new_init, rest); + gphi *phi_rest = create_phi_node (new_init, rest); + add_phi_arg (phi_rest, init, precond_edge, UNKNOWN_LOCATION); + add_phi_arg (phi_rest, next, new_exit, UNKNOWN_LOCATION); + SET_USE (op, new_init); + } - add_phi_arg (phi_rest, init, precond_edge, UNKNOWN_LOCATION); - add_phi_arg (phi_rest, next, new_exit, UNKNOWN_LOCATION); - SET_USE (op, new_init); + remove_path (exit); + } + else + { + new_exit = exit; + rest = exit->dest; } - - remove_path (exit); /* Transform the loop. */ if (transform) @@ -1376,57 +1389,66 @@ tree_transform_and_unroll_loop (class loop *loop, unsigned factor, bitmap_ones (wont_exit); bitmap_clear_bit (wont_exit, factor - 1); - ok = gimple_duplicate_loop_to_header_edge + auto_vec to_remove; + bool ok = gimple_duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), factor - 1, wont_exit, new_exit, &to_remove, DLTHE_FLAG_UPDATE_FREQ); gcc_assert (ok); - FOR_EACH_VEC_ELT (to_remove, i, e) + for (edge e : to_remove) { ok = remove_path (e); gcc_assert (ok); } update_ssa (TODO_update_ssa); - /* Ensure that the frequencies in the loop match the new estimated - number of iterations, and change the probability of the new - exit edge. */ - - freq_h = loop->header->count; - freq_e = (loop_preheader_edge (loop))->count (); - if (freq_h.nonzero_p ()) + if (!single_loop_p) { - /* Avoid dropping loop body profile counter to 0 because of zero count - in loop's preheader. */ - if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ())) - freq_e = freq_e.force_nonzero (); - scale_loop_frequencies (loop, freq_e.probability_in (freq_h)); + /* Ensure that the frequencies in the loop match the new estimated + number of iterations, and change the probability of the new + exit edge. */ + + profile_count freq_h = loop->header->count; + profile_count freq_e = (loop_preheader_edge (loop))->count (); + if (freq_h.nonzero_p ()) + { + /* Avoid dropping loop body profile counter to 0 because of zero + count in loop's preheader. */ + if (freq_h.nonzero_p () && !(freq_e == profile_count::zero ())) + freq_e = freq_e.force_nonzero (); + scale_loop_frequencies (loop, freq_e.probability_in (freq_h)); + } } - exit_bb = single_pred (loop->latch); + basic_block exit_bb = single_pred (loop->latch); new_exit = find_edge (exit_bb, rest); new_exit->probability = profile_probability::always () .apply_scale (1, new_est_niter + 1); - rest->count += new_exit->count (); + if (!single_loop_p) + rest->count += new_exit->count (); - new_nonexit = single_pred_edge (loop->latch); - prob = new_nonexit->probability; + edge new_nonexit = single_pred_edge (loop->latch); + profile_probability prob = new_nonexit->probability; new_nonexit->probability = new_exit->probability.invert (); prob = new_nonexit->probability / prob; if (prob.initialized_p ()) scale_bbs_frequencies (&loop->latch, 1, prob); - /* Finally create the new counter for number of iterations and add the new - exit instruction. */ - bsi = gsi_last_nondebug_bb (exit_bb); - exit_if = as_a (gsi_stmt (bsi)); - create_iv (exit_base, exit_step, NULL_TREE, loop, - &bsi, false, &ctr_before, &ctr_after); - gimple_cond_set_code (exit_if, exit_cmp); - gimple_cond_set_lhs (exit_if, ctr_after); - gimple_cond_set_rhs (exit_if, exit_bound); - update_stmt (exit_if); + if (!single_loop_p) + { + /* Finally create the new counter for number of iterations and add + the new exit instruction. */ + tree ctr_before, ctr_after; + gimple_stmt_iterator bsi = gsi_last_nondebug_bb (exit_bb); + exit_if = as_a (gsi_stmt (bsi)); + create_iv (exit_base, exit_step, NULL_TREE, loop, + &bsi, false, &ctr_before, &ctr_after); + gimple_cond_set_code (exit_if, exit_cmp); + gimple_cond_set_lhs (exit_if, ctr_after); + gimple_cond_set_rhs (exit_if, exit_bound); + update_stmt (exit_if); + } checking_verify_flow_info (); checking_verify_loop_structure (); -- cgit v1.1 From e41ba804ba5f5ca433e09238d561b1b4c8b10985 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 29 Jul 2021 22:26:25 -0500 Subject: Use range-based for loops for traversing loops This patch follows Martin's suggestion here[1], to support range based loop for iterating loops, analogously to the patch for vec[2]. For example, use below range-based for loop for (auto loop : loops_list (cfun, 0)) to replace the previous macro FOR_EACH_LOOP FOR_EACH_LOOP (loop, 0) [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/573424.html [2] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572315.html gcc/ChangeLog: * cfgloop.h (as_const): New function. (class loop_iterator): Rename to ... (class loops_list): ... this. (loop_iterator::next): Rename to ... (loops_list::Iter::fill_curr_loop): ... this and adjust. (loop_iterator::loop_iterator): Rename to ... (loops_list::loops_list): ... this and adjust. (loops_list::Iter): New class. (loops_list::iterator): New type. (loops_list::const_iterator): New type. (loops_list::begin): New function. (loops_list::end): Likewise. (loops_list::begin const): Likewise. (loops_list::end const): Likewise. (FOR_EACH_LOOP): Remove. (FOR_EACH_LOOP_FN): Remove. * cfgloop.c (flow_loops_dump): Adjust FOR_EACH_LOOP* with range-based for loop with loops_list instance. (sort_sibling_loops): Likewise. (disambiguate_loops_with_multiple_latches): Likewise. (verify_loop_structure): Likewise. * cfgloopmanip.c (create_preheaders): Likewise. (force_single_succ_latches): Likewise. * config/aarch64/falkor-tag-collision-avoidance.c (execute_tag_collision_avoidance): Likewise. * config/mn10300/mn10300.c (mn10300_scan_for_setlb_lcc): Likewise. * config/s390/s390.c (s390_adjust_loops): Likewise. * doc/loop.texi: Likewise. * gimple-loop-interchange.cc (pass_linterchange::execute): Likewise. * gimple-loop-jam.c (tree_loop_unroll_and_jam): Likewise. * gimple-loop-versioning.cc (loop_versioning::analyze_blocks): Likewise. (loop_versioning::make_versioning_decisions): Likewise. * gimple-ssa-split-paths.c (split_paths): Likewise. * graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl): Likewise. * graphite.c (canonicalize_loop_form): Likewise. (graphite_transform_loops): Likewise. * ipa-fnsummary.c (analyze_function_body): Likewise. * ipa-pure-const.c (analyze_function): Likewise. * loop-doloop.c (doloop_optimize_loops): Likewise. * loop-init.c (loop_optimizer_finalize): Likewise. (fix_loop_structure): Likewise. * loop-invariant.c (calculate_loop_reg_pressure): Likewise. (move_loop_invariants): Likewise. * loop-unroll.c (decide_unrolling): Likewise. (unroll_loops): Likewise. * modulo-sched.c (sms_schedule): Likewise. * predict.c (predict_loops): Likewise. (pass_profile::execute): Likewise. * profile.c (branch_prob): Likewise. * sel-sched-ir.c (sel_finish_pipelining): Likewise. (sel_find_rgns): Likewise. * tree-cfg.c (replace_loop_annotate): Likewise. (replace_uses_by): Likewise. (move_sese_region_to_fn): Likewise. * tree-if-conv.c (pass_if_conversion::execute): Likewise. * tree-loop-distribution.c (loop_distribution::execute): Likewise. * tree-parloops.c (parallelize_loops): Likewise. * tree-predcom.c (tree_predictive_commoning): Likewise. * tree-scalar-evolution.c (scev_initialize): Likewise. (scev_reset): Likewise. * tree-ssa-dce.c (find_obviously_necessary_stmts): Likewise. * tree-ssa-live.c (remove_unused_locals): Likewise. * tree-ssa-loop-ch.c (ch_base::copy_headers): Likewise. * tree-ssa-loop-im.c (analyze_memory_references): Likewise. (tree_ssa_lim_initialize): Likewise. * tree-ssa-loop-ivcanon.c (canonicalize_induction_variables): Likewise. * tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize): Likewise. * tree-ssa-loop-manip.c (get_loops_exits): Likewise. * tree-ssa-loop-niter.c (estimate_numbers_of_iterations): Likewise. (free_numbers_of_iterations_estimates): Likewise. * tree-ssa-loop-prefetch.c (tree_ssa_prefetch_arrays): Likewise. * tree-ssa-loop-split.c (tree_ssa_split_loops): Likewise. * tree-ssa-loop-unswitch.c (tree_ssa_unswitch_loops): Likewise. * tree-ssa-loop.c (gate_oacc_kernels): Likewise. (pass_scev_cprop::execute): Likewise. * tree-ssa-propagate.c (clean_up_loop_closed_phi): Likewise. * tree-ssa-sccvn.c (do_rpo_vn): Likewise. * tree-ssa-threadupdate.c (jump_thread_path_registry::thread_through_all_blocks): Likewise. * tree-vectorizer.c (vectorize_loops): Likewise. * tree-vrp.c (vrp_asserts::find_assert_locations): Likewise. --- gcc/tree-ssa-loop-manip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'gcc/tree-ssa-loop-manip.c') diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c index 41f9872..c7a2f67 100644 --- a/gcc/tree-ssa-loop-manip.c +++ b/gcc/tree-ssa-loop-manip.c @@ -362,11 +362,10 @@ add_exit_phis (bitmap names_to_rename, bitmap *use_blocks, bitmap *loop_exits) static void get_loops_exits (bitmap *loop_exits) { - class loop *loop; unsigned j; edge e; - FOR_EACH_LOOP (loop, 0) + for (auto loop : loops_list (cfun, 0)) { auto_vec exit_edges = get_loop_exit_edges (loop); loop_exits[loop->num] = BITMAP_ALLOC (&loop_renamer_obstack); -- cgit v1.1