Diffstat (limited to 'gcc/ipa-cp.cc')
-rw-r--r-- | gcc/ipa-cp.cc | 255
1 file changed, 110 insertions, 145 deletions
diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 806c2bd..b4b9699 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -147,10 +147,6 @@ object_allocator<ipcp_value_source<tree> > ipcp_sources_pool
 object_allocator<ipcp_agg_lattice> ipcp_agg_lattice_pool
   ("IPA_CP aggregate lattices");
 
-/* Base count to use in heuristics when using profile feedback.  */
-
-static profile_count base_count;
-
 /* Original overall size of the program.  */
 
 static long overall_size, orig_overall_size;
@@ -505,14 +501,16 @@ struct caller_statistics
   profile_count count_sum;
   /* Sum of all frequencies for all calls.  */
   sreal freq_sum;
-  /* Number of calls and hot calls respectively.  */
-  int n_calls, n_hot_calls;
+  /* Number of calls and calls considered interesting respectively.  */
+  int n_calls, n_interesting_calls;
   /* If itself is set up, also count the number of non-self-recursive
      calls.  */
   int n_nonrec_calls;
   /* If non-NULL, this is the node itself and calls from it should have their
      counts included in rec_count_sum and not count_sum.  */
   cgraph_node *itself;
+  /* True if there is a caller that has no IPA profile.  */
+  bool called_without_ipa_profile;
 };
 
 /* Initialize fields of STAT to zeroes and optionally set it up so that edges
@@ -524,10 +522,39 @@ init_caller_stats (caller_statistics *stats, cgraph_node *itself = NULL)
   stats->rec_count_sum = profile_count::zero ();
   stats->count_sum = profile_count::zero ();
   stats->n_calls = 0;
-  stats->n_hot_calls = 0;
+  stats->n_interesting_calls = 0;
   stats->n_nonrec_calls = 0;
   stats->freq_sum = 0;
   stats->itself = itself;
+  stats->called_without_ipa_profile = false;
+}
+
+/* We want to propagate across edges that may be executed; however, we do
+   not want to check maybe_hot, since the call itself may be cold while the
+   callee contains some heavy loop which makes propagation still relevant.
+
+   In particular, even an edge called once may lead to significant
+   improvement.  */
+
+static bool
+cs_interesting_for_ipcp_p (cgraph_edge *e)
+{
+  /* If the profile says the edge is executed, we want to optimize.  */
+  if (e->count.ipa ().nonzero_p ())
+    return true;
+  /* If the local (possibly guessed or adjusted 0) profile claims the edge
+     is not executed, do not propagate.  */
+  if (!e->count.nonzero_p ())
+    return false;
+  /* Here the IPA profile claims zero executions.  With partial training a
+     measured (initialized) zero is trustworthy, so do not propagate; in
+     all other cases the nonzero local profile may still be meaningful, so
+     still consider the edge for cloning.  */
+  if (e->count.ipa ().initialized_p ()
+      && opt_for_fn (e->callee->decl, flag_profile_partial_training)
+      && e->count.nonzero_p ())
+    return false;
+  return true;
 }
 
 /* Worker callback of cgraph_for_node_and_aliases accumulating statistics of
@@ -553,13 +580,18 @@ gather_caller_stats (struct cgraph_node *node, void *data)
 	  else
 	    stats->count_sum += cs->count.ipa ();
 	}
+      else
+	stats->called_without_ipa_profile = true;
      stats->freq_sum += cs->sreal_frequency ();
      stats->n_calls++;
      if (stats->itself && stats->itself != cs->caller)
 	stats->n_nonrec_calls++;
 
-      if (cs->maybe_hot_p ())
-	stats->n_hot_calls ++;
+      /* If the profile is known to be zero, we do not want to clone for
+	 performance.  However, if the call is cold, the called function may
+	 still contain important hot loops.  */
+      if (cs_interesting_for_ipcp_p (cs))
+	stats->n_interesting_calls++;
    }
  return false;
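The new predicate separates three situations using the edge's inter-procedural
(IPA) and local execution counts.  The standalone C++ sketch below mirrors
that decision order; edge_profile, interesting_p and the plain booleans are
illustrative stand-ins for the profile_count queries, not GCC API:

    #include <cstdio>

    /* Stand-ins for the profile_count queries made by
       cs_interesting_for_ipcp_p; all names here are illustrative.  */
    struct edge_profile
    {
      bool ipa_nonzero;      /* e->count.ipa ().nonzero_p ()  */
      bool local_nonzero;    /* e->count.nonzero_p ()  */
      bool ipa_initialized;  /* e->count.ipa ().initialized_p ()  */
      bool partial_training; /* flag_profile_partial_training on the callee  */
    };

    /* Mirrors the decision order of the new predicate.  */
    static bool
    interesting_p (const edge_profile &e)
    {
      if (e.ipa_nonzero)
        return true;   /* Measured as executed: propagate.  */
      if (!e.local_nonzero)
        return false;  /* Even the local profile says dead: skip.  */
      if (e.ipa_initialized && e.partial_training)
        return false;  /* A real measured zero under partial training.  */
      return true;     /* No usable IPA data: trust the local profile.  */
    }

    int
    main ()
    {
      /* No IPA profile but a nonzero local count -> interesting (1);
         a measured zero with partial training enabled -> not (0).  */
      printf ("%d %d\n", interesting_p ({false, true, false, false}),
              interesting_p ({false, true, true, true}));
      return 0;
    }

The design point, per the comments above, is that a merely cold call site
stays interesting: the callee may still contain hot loops that the propagated
constants speed up.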
@@ -602,26 +634,11 @@ ipcp_cloning_candidate_p (struct cgraph_node *node)
 	       node->dump_name ());
       return true;
     }
-
-  /* When profile is available and function is hot, propagate into it even if
-     calls seems cold; constant propagation can improve function's speed
-     significantly.  */
-  if (stats.count_sum > profile_count::zero ()
-      && node->count.ipa ().initialized_p ())
-    {
-      if (stats.count_sum > node->count.ipa ().apply_scale (90, 100))
-	{
-	  if (dump_file)
-	    fprintf (dump_file, "Considering %s for cloning; "
-		     "usually called directly.\n",
-		     node->dump_name ());
-	  return true;
-	}
-    }
-  if (!stats.n_hot_calls)
+  if (!stats.n_interesting_calls)
     {
       if (dump_file)
-	fprintf (dump_file, "Not considering %s for cloning; no hot calls.\n",
+	fprintf (dump_file, "Not considering %s for cloning; "
+		 "no calls considered interesting by profile.\n",
 		 node->dump_name ());
       return false;
     }
@@ -3369,24 +3386,29 @@ incorporate_penalties (cgraph_node *node, ipa_node_params *info,
 static bool
 good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit,
 			    sreal freq_sum, profile_count count_sum,
-			    int size_cost)
+			    int size_cost, bool called_without_ipa_profile)
 {
+  gcc_assert (count_sum.ipa () == count_sum);
   if (time_benefit == 0
      || !opt_for_fn (node->decl, flag_ipa_cp_clone)
-      || node->optimize_for_size_p ())
+      || node->optimize_for_size_p ()
+      /* If no call was executed in the train run and no caller is missing
+	 an IPA profile, we do not want to clone.  */
+      || (!called_without_ipa_profile && !count_sum.nonzero_p ()))
    return false;
 
  gcc_assert (size_cost > 0);
 
  ipa_node_params *info = ipa_node_params_sum->get (node);
  int eval_threshold = opt_for_fn (node->decl, param_ipa_cp_eval_threshold);
+  /* If we know the IPA execution counts, we can estimate the overall
+     speedup of the program.  */
  if (count_sum.nonzero_p ())
    {
-      gcc_assert (base_count.nonzero_p ());
-      sreal factor = count_sum.probability_in (base_count).to_sreal ();
-      sreal evaluation = (time_benefit * factor) / size_cost;
+      profile_count saved_time = count_sum * time_benefit;
+      sreal evaluation = saved_time.to_sreal_scale (profile_count::one ())
+			 / size_cost;
      evaluation = incorporate_penalties (node, info, evaluation);
-      evaluation *= 1000;
 
      if (dump_file && (dump_flags & TDF_DETAILS))
	{
@@ -3394,33 +3416,46 @@ good_cloning_opportunity_p (struct cgraph_node *node, sreal time_benefit,
 		   "size: %i, count_sum: ", time_benefit.to_double (),
 		   size_cost);
 	  count_sum.dump (dump_file);
+	  fprintf (dump_file, ", overall time saved: ");
+	  saved_time.dump (dump_file);
 	  fprintf (dump_file, "%s%s) -> evaluation: %.2f, threshold: %i\n",
 		   info->node_within_scc
 		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		   info->node_calling_single_call ? ", single_call" : "",
 		   evaluation.to_double (), eval_threshold);
 	}
-
-      return evaluation.to_int () >= eval_threshold;
+      gcc_checking_assert (saved_time == saved_time.ipa ());
+      if (!maybe_hot_count_p (NULL, saved_time))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "  not cloning: time saved is not hot\n");
+	}
+      /* Evaluation approximately corresponds to time saved per instruction
+	 introduced.  This is likely almost always going to be true, since we
+	 already checked that the time saved is large enough to be considered
+	 hot.  */
+      else if (evaluation.to_int () >= eval_threshold)
+	return true;
+      /* If all call sites have known profiles, we know we do not want to
+	 clone.  If there are calls with unknown profiles, try the local
+	 heuristics.  */
+      if (!called_without_ipa_profile)
+	return false;
    }
-  else
-    {
-      sreal evaluation = (time_benefit * freq_sum) / size_cost;
-      evaluation = incorporate_penalties (node, info, evaluation);
-      evaluation *= 1000;
+  sreal evaluation = (time_benefit * freq_sum) / size_cost;
+  evaluation = incorporate_penalties (node, info, evaluation);
+  evaluation *= 1000;
 
-      if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file, "  good_cloning_opportunity_p (time: %g, "
-		 "size: %i, freq_sum: %g%s%s) -> evaluation: %.2f, "
-		 "threshold: %i\n",
-		 time_benefit.to_double (), size_cost, freq_sum.to_double (),
-		 info->node_within_scc
-		 ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
-		 info->node_calling_single_call ? ", single_call" : "",
-		 evaluation.to_double (), eval_threshold);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "  good_cloning_opportunity_p (time: %g, "
+	     "size: %i, freq_sum: %g%s%s) -> evaluation: %.2f, "
+	     "threshold: %i\n",
+	     time_benefit.to_double (), size_cost, freq_sum.to_double (),
+	     info->node_within_scc
+	     ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
+	     info->node_calling_single_call ? ", single_call" : "",
+	     evaluation.to_double (), eval_threshold);
 
-      return evaluation.to_int () >= eval_threshold;
-    }
+  return evaluation.to_int () >= eval_threshold;
 }
 
 /* Grow vectors in AVALS and fill them with information about values of
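With base_count gone, the profiled evaluation is absolute rather than relative
to a reference count: the summed IPA execution count times the per-invocation
benefit, divided by the size cost, with maybe_hot_count_p gating the total
saved time.  A plain-double sketch of the two paths with made-up numbers
(incorporate_penalties and the profile_count/sreal types are omitted;
eval_threshold stands in for param_ipa_cp_eval_threshold):

    #include <cstdio>

    int
    main ()
    {
      const double time_benefit = 2.0;   /* estimated time saved per call  */
      const int size_cost = 40;          /* code growth caused by the clone  */
      const int eval_threshold = 500;    /* param_ipa_cp_eval_threshold  */

      /* Profiled path: total time saved across all measured executions,
         per unit of code growth.  */
      const double count_sum = 100000.0; /* summed IPA execution counts  */
      const double saved_time = count_sum * time_benefit;
      const double evaluation = saved_time / size_cost;
      printf ("profiled: %.0f vs threshold %d -> %s\n", evaluation,
              eval_threshold, evaluation >= eval_threshold ? "clone" : "skip");

      /* Fallback path used when some caller lacks an IPA profile:
         frequency-scaled benefit, multiplied by 1000 as in the code above.  */
      const double freq_sum = 3.5;       /* summed call frequencies  */
      const double local_eval = time_benefit * freq_sum / size_cost * 1000;
      printf ("local: %.0f vs threshold %d -> %s\n", local_eval,
              eval_threshold, local_eval >= eval_threshold ? "clone" : "skip");
      return 0;
    }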
@@ -3613,7 +3648,8 @@ estimate_local_effects (struct cgraph_node *node)
 		 "known contexts, code not going to grow.\n");
     }
   else if (good_cloning_opportunity_p (node, time, stats.freq_sum,
-				       stats.count_sum, size))
+				       stats.count_sum, size,
+				       stats.called_without_ipa_profile))
     {
       if (size + overall_size <= get_max_overall_size (node))
 	{
@@ -3979,7 +4015,7 @@ value_topo_info<valtype>::propagate_effects ()
 	    processed_srcvals.empty ();
 	    for (src = val->sources; src; src = src->next)
 	      if (src->val
-		  && src->cs->maybe_hot_p ())
+		  && cs_interesting_for_ipcp_p (src->cs))
 		{
 		  if (!processed_srcvals.add (src->val))
 		    {
@@ -4024,21 +4060,6 @@ value_topo_info<valtype>::propagate_effects ()
     }
 }
 
-/* Callback for qsort to sort counts of all edges.  */
-
-static int
-compare_edge_profile_counts (const void *a, const void *b)
-{
-  const profile_count *cnt1 = (const profile_count *) a;
-  const profile_count *cnt2 = (const profile_count *) b;
-
-  if (*cnt1 < *cnt2)
-    return 1;
-  if (*cnt1 > *cnt2)
-    return -1;
-  return 0;
-}
-
 /* Propagate constants, polymorphic contexts and their effects from the
    summaries interprocedurally.  */
@@ -4051,10 +4072,6 @@ ipcp_propagate_stage (class ipa_topo_info *topo)
   if (dump_file)
     fprintf (dump_file, "\n Propagating constants:\n\n");
 
-  base_count = profile_count::uninitialized ();
-
-  bool compute_count_base = false;
-  unsigned base_count_pos_percent = 0;
   FOR_EACH_DEFINED_FUNCTION (node)
   {
     if (node->has_gimple_body_p ()
@@ -4071,57 +4088,8 @@ ipcp_propagate_stage (class ipa_topo_info *topo)
     ipa_size_summary *s = ipa_size_summaries->get (node);
     if (node->definition && !node->alias && s != NULL)
       overall_size += s->self_size;
-    if (node->count.ipa ().initialized_p ())
-      {
-	compute_count_base = true;
-	unsigned pos_percent = opt_for_fn (node->decl,
-					   param_ipa_cp_profile_count_base);
-	base_count_pos_percent = MAX (base_count_pos_percent, pos_percent);
-      }
   }
 
-  if (compute_count_base)
-    {
-      auto_vec<profile_count> all_edge_counts;
-      all_edge_counts.reserve_exact (symtab->edges_count);
-      FOR_EACH_DEFINED_FUNCTION (node)
-	for (cgraph_edge *cs = node->callees; cs; cs = cs->next_callee)
-	  {
-	    profile_count count = cs->count.ipa ();
-	    if (!count.nonzero_p ())
-	      continue;
-
-	    enum availability avail;
-	    cgraph_node *tgt
-	      = cs->callee->function_or_virtual_thunk_symbol (&avail);
-	    ipa_node_params *info = ipa_node_params_sum->get (tgt);
-	    if (info && info->versionable)
-	      all_edge_counts.quick_push (count);
-	  }
-
-      if (!all_edge_counts.is_empty ())
-	{
-	  gcc_assert (base_count_pos_percent <= 100);
-	  all_edge_counts.qsort (compare_edge_profile_counts);
-
-	  unsigned base_count_pos
-	    = ((all_edge_counts.length () * (base_count_pos_percent)) / 100);
-	  base_count = all_edge_counts[base_count_pos];
-
-	  if (dump_file)
-	    {
-	      fprintf (dump_file, "\nSelected base_count from %u edges at "
-		       "position %u, arriving at: ", all_edge_counts.length (),
-		       base_count_pos);
-	      base_count.dump (dump_file);
-	      fprintf (dump_file, "\n");
-	    }
-	}
-      else if (dump_file)
-	fprintf (dump_file, "\nNo candidates with non-zero call count found, "
-		 "continuing as if without profile feedback.\n");
-    }
-
   orig_overall_size = overall_size;
 
   if (dump_file)
@@ -4383,15 +4351,17 @@ static bool
 get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 				sreal *freq_sum, int *caller_count,
 				profile_count *rec_count_sum,
-				profile_count *nonrec_count_sum)
+				profile_count *nonrec_count_sum,
+				bool *called_without_ipa_profile)
 {
   ipcp_value_source<valtype> *src;
   sreal freq = 0;
   int count = 0;
   profile_count rec_cnt = profile_count::zero ();
   profile_count nonrec_cnt = profile_count::zero ();
-  bool hot = false;
+  bool interesting = false;
   bool non_self_recursive = false;
+  *called_without_ipa_profile = false;
 
   for (src = val->sources; src; src = src->next)
     {
@@ -4402,15 +4372,19 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	{
 	  count++;
 	  freq += cs->sreal_frequency ();
-	  hot |= cs->maybe_hot_p ();
+	  interesting |= cs_interesting_for_ipcp_p (cs);
 	  if (cs->caller != dest)
 	    {
 	      non_self_recursive = true;
 	      if (cs->count.ipa ().initialized_p ())
		rec_cnt += cs->count.ipa ();
+	      else
+		*called_without_ipa_profile = true;
 	    }
 	  else if (cs->count.ipa ().initialized_p ())
 	    nonrec_cnt += cs->count.ipa ();
+	  else
+	    *called_without_ipa_profile = true;
 	}
       cs = get_next_cgraph_edge_clone (cs);
     }
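The bookkeeping above sums IPA counts only for edges whose profile is
initialized; any edge without one sets the new out-flag so the caller can
fall back to the frequency-based heuristics.  A self-contained sketch of that
pattern, with fake_edge and edge_sums as simplified stand-ins for the GCC
types:

    #include <vector>

    struct fake_edge
    {
      bool self_recursive;  /* stands in for the cs->caller == dest test  */
      bool has_ipa_count;   /* cs->count.ipa ().initialized_p ()  */
      long ipa_count;       /* the IPA count when initialized  */
    };

    struct edge_sums
    {
      long self_rec_sum = 0;
      long other_sum = 0;
      bool called_without_ipa_profile = false;
    };

    static edge_sums
    summarize (const std::vector<fake_edge> &edges)
    {
      edge_sums s;
      for (const fake_edge &e : edges)
        {
          /* Counts are summed only for edges that carry an IPA profile;
             a missing profile sets the flag so callers try the local,
             frequency-based heuristics instead of giving up.  */
          if (!e.has_ipa_count)
            s.called_without_ipa_profile = true;
          else if (e.self_recursive)
            s.self_rec_sum += e.ipa_count;
          else
            s.other_sum += e.ipa_count;
        }
      return s;
    }

    int
    main ()
    {
      edge_sums s = summarize ({{false, true, 100}, {true, true, 40},
                                {false, false, 0}});
      return s.called_without_ipa_profile ? 0 : 1;
    }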
@@ -4426,19 +4400,7 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
   *rec_count_sum = rec_cnt;
   *nonrec_count_sum = nonrec_cnt;
 
-  if (!hot && ipa_node_params_sum->get (dest)->node_within_scc)
-    {
-      struct cgraph_edge *cs;
-
-      /* Cold non-SCC source edge could trigger hot recursive execution of
-	 function.  Consider the case as hot and rely on following cost model
-	 computation to further select right one.  */
-      for (cs = dest->callers; cs; cs = cs->next_caller)
-	if (cs->caller == dest && cs->maybe_hot_p ())
-	  return true;
-    }
-
-  return hot;
+  return interesting;
 }
 
 /* Given a NODE, and a set of its CALLERS, try to adjust order of the callers
@@ -4677,7 +4639,7 @@ update_counts_for_self_gen_clones (cgraph_node *orig_node,
 				   const vec<cgraph_node *> &self_gen_clones)
 {
   profile_count redist_sum = orig_node->count.ipa ();
-  if (!(redist_sum > profile_count::zero ()))
+  if (!redist_sum.nonzero_p ())
     return;
 
   if (dump_file)
@@ -4748,7 +4710,7 @@ update_counts_for_self_gen_clones (cgraph_node *orig_node,
     it.  */
   for (cgraph_node *n : self_gen_clones)
     {
-      if (!(n->count.ipa () > profile_count::zero ()))
+      if (!n->count.ipa ().nonzero_p ())
 	continue;
 
       desc_incoming_count_struct desc;
@@ -4794,7 +4756,7 @@ update_profiling_info (struct cgraph_node *orig_node,
   profile_count new_sum;
   profile_count remainder, orig_node_count = orig_node->count.ipa ();
 
-  if (!(orig_node_count > profile_count::zero ()))
+  if (!orig_node_count.nonzero_p ())
     return;
 
   if (dump_file)
@@ -4958,7 +4920,7 @@ update_specialized_profile (struct cgraph_node *new_node,
       orig_node_count.dump (dump_file);
       fprintf (dump_file, "\n");
     }
-  if (!(orig_node_count > profile_count::zero ()))
+  if (!orig_node_count.nonzero_p ())
     return;
 
   new_node_count = new_node->count;
@@ -5928,6 +5890,7 @@ decide_about_value (struct cgraph_node *node, int index, HOST_WIDE_INT offset,
   sreal freq_sum;
   profile_count count_sum, rec_count_sum;
   vec<cgraph_edge *> callers;
+  bool called_without_ipa_profile;
 
   if (val->spec_node)
     {
@@ -5943,7 +5906,8 @@ decide_about_value (struct cgraph_node *node, int index, HOST_WIDE_INT offset,
       return false;
     }
   else if (!get_info_about_necessary_edges (val, node, &freq_sum, &caller_count,
-					    &rec_count_sum, &count_sum))
+					    &rec_count_sum, &count_sum,
+					    &called_without_ipa_profile))
     return false;
 
   if (!dbg_cnt (ipa_cp_values))
@@ -5980,9 +5944,11 @@ decide_about_value (struct cgraph_node *node, int index, HOST_WIDE_INT offset,
 
   if (!good_cloning_opportunity_p (node, val->local_time_benefit,
 				   freq_sum, count_sum,
-				   val->local_size_cost)
+				   val->local_size_cost,
+				   called_without_ipa_profile)
       && !good_cloning_opportunity_p (node, val->prop_time_benefit,
-				      freq_sum, count_sum, val->prop_size_cost))
+				      freq_sum, count_sum, val->prop_size_cost,
+				      called_without_ipa_profile))
     return false;
 
   if (dump_file)
@@ -6564,7 +6530,6 @@ make_pass_ipa_cp (gcc::context *ctxt)
 void
 ipa_cp_cc_finalize (void)
 {
-  base_count = profile_count::uninitialized ();
   overall_size = 0;
   orig_overall_size = 0;
   ipcp_free_transformation_sum ();
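The repeated guard rewrite from !(count > profile_count::zero ()) to
!count.nonzero_p () in the update_* functions relies on the simpler predicate
"initialized and not zero".  A minimal stand-in illustrating just that
predicate and the early-exit pattern (toy_count is illustrative; the real
profile_count also tracks profile quality and IPA scope):

    #include <cstdint>

    class toy_count
    {
      uint64_t m_val;
      bool m_initialized;

    public:
      toy_count (uint64_t val, bool initialized)
        : m_val (val), m_initialized (initialized) {}

      /* The predicate the rewritten guards rely on: the count is known
         and not zero.  */
      bool nonzero_p () const { return m_initialized && m_val != 0; }
    };

    /* Early-exit pattern of the updated update_* functions: bail out
       unless there is a known, nonzero count to redistribute.  */
    static void
    maybe_update (const toy_count &orig_node_count)
    {
      if (!orig_node_count.nonzero_p ())
        return;
      /* ... redistribute counts among clones ...  */
    }

    int
    main ()
    {
      maybe_update (toy_count (0, true));  /* known zero: nothing to do  */
      maybe_update (toy_count (5, true));  /* known nonzero: would update  */
      return 0;
    }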