Diffstat (limited to 'gcc/ipa-locality-cloning.cc')
-rw-r--r-- | gcc/ipa-locality-cloning.cc | 1137 |
1 files changed, 1137 insertions, 0 deletions
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc new file mode 100644 index 0000000..2684046 --- /dev/null +++ b/gcc/ipa-locality-cloning.cc @@ -0,0 +1,1137 @@ +/* Code locality based function cloning. + Copyright The GNU Toolchain Authors + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* This file implements cloning required to improve partitioning of the + callgraph for locality considerations. + + Partitioning for improving code locality. + This pass aims to place frequently executed callchains closer together in + memory to improve performance through improved locality. If any frequent + callchains cannot be placed together because they are already placed + elsewhere, local function clones are created and all callers placed near + a clone are redirected to use it. + + Locality code placement is done in 2 parts. + 1. An IPA pass executed after ipa-inline and before ipa-pure-const. Its + execute stage prepares the plan that places all nodes into partitions. + 2. The WPA partitioning stage then implements the plan. + + Brief overview of the IPA pass: + 1. Create and sort callchains. If PGO is available, use real profile + counts. Otherwise, use a set of heuristics to sort the callchains. + 2. Create a partition plan for the callchains, processing them in the sorted + order. + 1. If a function is unpartitioned, place it in the current partition. + 2. If a function is already placed in a partition other than the current + one as part of another callchain, create a local clone in the current + partition if the cloning criteria are satisfied. + 3. Redirect any new caller to a local clone if one exists. + Partition size is controlled by a param so that behavior can be fine-tuned + per program. */ + +#include "config.h" +#define INCLUDE_ALGORITHM +#include "system.h" +#include "coretypes.h" +#include "target.h" +#include "function.h" +#include "tree.h" +#include "alloc-pool.h" +#include "tree-pass.h" +#include "cgraph.h" +#include "symbol-summary.h" +#include "tree-vrp.h" +#include "symtab-thunks.h" +#include "sreal.h" +#include "ipa-cp.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "ipa-modref-tree.h" +#include "ipa-modref.h" +#include "symtab-clones.h" +#include "ipa-locality-cloning.h" + +/* Locality partitions to which nodes are assigned. These are used later in + WPA partitioning. */ +vec<locality_partition> locality_partitions; + +/* Map from original node to its latest clone. Gets overwritten whenever a new + clone is created from the same node. */ +hash_map<cgraph_node *, cgraph_node *> node_to_clone; +/* Map from clone to its original node. */ +hash_map<cgraph_node *, cgraph_node *> clone_to_node; + +/* Data structure to hold static heuristics and orders for cgraph_nodes. */ +struct locality_order +{ + cgraph_node *node; + sreal order; + locality_order (cgraph_node *node, sreal order) : node (node), order (order) + {} +}; + +/* Return true if NODE is already in some partition. 
*/ +static inline bool +node_partitioned_p (cgraph_node *node) +{ + return node->aux; +} + +/* Add symbol NODE to partition PART. */ +static void +add_node_to_partition (locality_partition part, cgraph_node *node) +{ + struct cgraph_edge *e; + if (node_partitioned_p (node)) + return; + + part->nodes.safe_push (node); + node->aux = (void *) (uintptr_t) (part->part_id); + + if (!node->alias && node->get_partitioning_class () == SYMBOL_PARTITION) + part->insns += ipa_size_summaries->get (node)->size; + + /* Add all inline clones and callees that are duplicated. */ + for (e = node->callees; e; e = e->next_callee) + if (!e->inline_failed) + add_node_to_partition (part, e->callee); + /* omp declare_variant_alt or transparent_alias with definition or linker + discardable (non-local comdat but not forced and not + used by non-LTO). */ + else if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE) + add_node_to_partition (part, e->callee); + + /* Add all thunks associated with the function. */ + for (e = node->callers; e; e = e->next_caller) + if (e->caller->thunk && !e->caller->inlined_to) + add_node_to_partition (part, e->caller); + + /* Add all aliases associated with the symbol. */ + struct ipa_ref *ref; + FOR_EACH_ALIAS (node, ref) + if (!ref->referring->transparent_alias) + { + cgraph_node *referring = dyn_cast<cgraph_node *> (ref->referring); + /* Only add function aliases. + Varpool refs are added later in LTO partitioning pass. */ + if (referring) + add_node_to_partition (part, referring); + } + else + { + struct ipa_ref *ref2; + /* We do not need to add transparent aliases if they are not used. + However we must add aliases of transparent aliases if they exist. */ + FOR_EACH_ALIAS (ref->referring, ref2) + { + /* Nested transparent aliases are not permitted. */ + gcc_checking_assert (!ref2->referring->transparent_alias); + cgraph_node *referring = dyn_cast<cgraph_node *> (ref2->referring); + if (referring) + add_node_to_partition (part, referring); + } + } +} + +/* Return TRUE if NODE is in PARTITION. */ +static bool +node_in_partition_p (locality_partition partition, cgraph_node *node) +{ + return ((uintptr_t) (partition->part_id) == (uintptr_t) (node->aux)); +} + +/* Helper function for qsort; to break ties. */ +static int +compare_node_uids (cgraph_node *n1, cgraph_node *n2) +{ + int res = n1->get_uid () - n2->get_uid (); + gcc_assert (res != 0); + return res > 0 ? 1 : -1; +} + +/* Helper function for qsort; sort nodes by order. */ +static int +static_profile_cmp (const void *pa, const void *pb) +{ + const locality_order *a = *static_cast<const locality_order *const *> (pa); + const locality_order *b = *static_cast<const locality_order *const *> (pb); + /* Ascending order. */ + if (b->order < a->order) + return 1; + if (b->order > a->order) + return -1; + return compare_node_uids (a->node, b->node); +} + +/* Helper function for qsort; sort nodes by profile count. */ +static int +compare_edge_profile_counts (const void *pa, const void *pb) +{ + const locality_order *a = *static_cast<const locality_order *const *> (pa); + const locality_order *b = *static_cast<const locality_order *const *> (pb); + + profile_count cnt1 = a->node->count.ipa (); + profile_count cnt2 = b->node->count.ipa (); + if (!cnt1.compatible_p (cnt2)) + return static_profile_cmp (pa, pb); + + if (cnt1 < cnt2) + return 1; + if (cnt1 > cnt2) + return -1; + return static_profile_cmp (pa, pb); +} + +/* Create and return a new partition and increment NPARTITIONS. 
*/ + +static locality_partition +create_partition (int &npartitions) +{ + locality_partition part = XCNEW (struct locality_partition_def); + npartitions++; + part->part_id = npartitions; + part->nodes.create (1); + part->insns = 0; + locality_partitions.safe_push (part); + return part; +} + +/* Structure for holding profile count information of callers of a node. */ +struct profile_stats +{ + /* Sum of non-recursive call counts. */ + profile_count nonrec_count; + + /* Sum of recursive call counts. */ + profile_count rec_count; + + /* If non-NULL, this node is the target of an alias or thunk and calls from + it should be counted in rec_count. */ + cgraph_node *target; +}; + +/* Initialize fields of STATS. */ +static inline void +init_profile_stats (profile_stats *stats, cgraph_node *target = NULL) +{ + stats->nonrec_count = profile_count::zero (); + stats->rec_count = profile_count::zero (); + stats->target = target; +} + +/* Helper function to accumulate call counts. */ +static bool +accumulate_profile_counts_after_cloning (cgraph_node *node, void *data) +{ + struct profile_stats *stats = (struct profile_stats *) data; + for (cgraph_edge *e = node->callers; e; e = e->next_caller) + { + if (!e->count.initialized_p ()) + continue; + + if (e->caller == stats->target) + stats->rec_count += e->count.ipa (); + else + stats->nonrec_count += e->count.ipa (); + } + return false; +} + +/* NEW_NODE is a previously created clone of ORIG_NODE already present in + current partition. EDGES contains newly redirected edges to NEW_NODE. + Adjust profile information for both nodes and the edges. */ + +static void +adjust_profile_info_for_non_self_rec_edges (auto_vec<cgraph_edge *> &edges, + cgraph_node *new_node, + cgraph_node *orig_node) +{ + profile_count orig_node_count = orig_node->count.ipa (); + profile_count edge_count = profile_count::zero (); + profile_count final_new_count = profile_count::zero (); + profile_count final_orig_count = profile_count::zero (); + + for (unsigned i = 0; i < edges.length (); ++i) + if (edges[i]->count.initialized_p ()) + edge_count += edges[i]->count.ipa (); + + final_orig_count = orig_node_count - edge_count; + + /* NEW_NODE->count was adjusted for other callers when the clone was + first created. Just add the new edge count. */ + final_new_count = new_node->count + edge_count; + + final_new_count = orig_node_count.combine_with_ipa_count (final_new_count); + orig_node->count = final_orig_count; + new_node->count = final_new_count; + + if (dump_file) + { + fprintf (dump_file, "Adjusting profile information for %s\n", + new_node->dump_asm_name ()); + fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ()); + fprintf (dump_file, "\tOriginal count: "); + orig_node_count.dump (dump_file); + fprintf (dump_file, "\n\tAdjusted original count to: "); + final_orig_count.dump (dump_file); + fprintf (dump_file, "\n\tAdjusted clone count to: "); + final_new_count.dump (dump_file); + fprintf (dump_file, "\n"); + } + + /* Scale all callee edges according to adjusted counts. 
*/ + profile_count orig_node_count_copy = orig_node_count; + profile_count::adjust_for_ipa_scaling (&final_new_count, + &orig_node_count_copy); + for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); + for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); + + profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count); + for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); + for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); +} + +/* Adjust profile counts of NEW_NODE and ORIG_NODE, where NEW_NODE is a clone + of ORIG_NODE. + Assumes that all eligible edges from current partition so far are redirected + to NEW_NODE and recursive edges are adjusted. */ + +static void +adjust_profile_info (cgraph_node *new_node, cgraph_node *orig_node) +{ + /* If all calls to NEW_NODE are non-recursive, subtract the corresponding + count from ORIG_NODE and assign it to NEW_NODE; any unexpected remainder + stays with ORIG_NODE. + Recursive calls, if present, likely contribute to the majority of the + count; scale according to the redirected callers' count. */ + + profile_count orig_node_count = orig_node->count.ipa (); + profile_stats new_stats, orig_stats; + + init_profile_stats (&new_stats); + init_profile_stats (&orig_stats); + + new_node->call_for_symbol_thunks_and_aliases + (accumulate_profile_counts_after_cloning, &new_stats, false); + orig_node->call_for_symbol_thunks_and_aliases + (accumulate_profile_counts_after_cloning, &orig_stats, false); + + profile_count orig_nonrec_count = orig_stats.nonrec_count; + profile_count orig_rec_count = orig_stats.rec_count; + profile_count new_nonrec_count = new_stats.nonrec_count; + profile_count new_rec_count = new_stats.rec_count; + + profile_count final_new_count = new_nonrec_count; + profile_count final_orig_count = profile_count::zero (); + + /* All calls to NEW_NODE are non-recursive or recursive calls have + zero count. */ + if (!new_rec_count.nonzero_p ()) + final_orig_count = orig_node_count - new_nonrec_count; + else + { + /* If ORIG_NODE is externally visible, indirect calls or calls from + another part of the code may contribute to the count. + update_profiling_info () from ipa-cp.cc pretends to have an extra + caller to represent the extra counts. */ + if (!orig_node->local) + { + profile_count pretend_count = (orig_node_count - new_nonrec_count - + orig_nonrec_count - orig_rec_count); + orig_nonrec_count += pretend_count; + } + + /* Remaining rec_count is assigned in proportion to clone's non-recursive + count. 
*/ + profile_count rec_count = orig_node_count - new_nonrec_count + - orig_nonrec_count; + profile_count new_rec_scaled + = rec_count.apply_scale (new_nonrec_count, + new_nonrec_count + orig_nonrec_count); + final_new_count += new_rec_scaled; + final_orig_count = orig_node_count - final_new_count; + } + + final_new_count = orig_node_count.combine_with_ipa_count (final_new_count); + new_node->count = final_new_count; + orig_node->count = final_orig_count; + + if (dump_file) + { + fprintf (dump_file, "Adjusting profile information for %s\n", + new_node->dump_asm_name ()); + fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ()); + fprintf (dump_file, "\tOriginal count: "); + orig_node_count.dump (dump_file); + fprintf (dump_file, "\n\tAdjusted original count to: "); + final_orig_count.dump (dump_file); + fprintf (dump_file, "\n\tAdjusted clone count to: "); + final_new_count.dump (dump_file); + fprintf (dump_file, "\n"); + } + + /* Scale all callee edges according to adjusted counts. */ + profile_count orig_node_count_copy = orig_node_count; + profile_count::adjust_for_ipa_scaling (&final_new_count, + &orig_node_count_copy); + for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); + for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy); + + profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count); + for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); + for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee) + cs->count = cs->count.apply_scale (final_orig_count, orig_node_count); +} + +/* Return true if EDGE can be safely redirected to another callee. */ +static inline bool +edge_redirectable_p (cgraph_edge *edge, lto_locality_cloning_model cm) +{ + if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING) + { + /* Interposability may change on a per-edge basis. */ + enum availability avail; + avail = edge->callee->get_availability (edge->caller); + if (avail <= AVAIL_INTERPOSABLE) + return false; + } + return true; +} + +/* Create a locality clone of CNODE and redirect all callers present in + PARTITION. + Create a clone depending on whether CNODE itself is a clone or not. */ + +static cgraph_node * +create_locality_clone (cgraph_node *cnode, + locality_partition partition, int &cl_num, + lto_locality_cloning_model cm) +{ + cgraph_node *cl_node = NULL; + vec<cgraph_edge *> redirect_callers = vNULL; + /* All callers of cnode in current partition are redirected. */ + struct cgraph_edge *edge; + for (edge = cnode->callers; edge; edge = edge->next_caller) + { + struct cgraph_node *caller = edge->caller; + if (node_in_partition_p (partition, caller) && caller->definition + && caller != cnode && edge_redirectable_p (edge, cm)) + redirect_callers.safe_push (edge); + } + + const char *suffix = "locality_clone"; + + tree old_decl = cnode->decl; + tree new_decl = copy_node (old_decl); + + /* Generate a new name for the new version. 
*/ + const char *name = IDENTIFIER_POINTER (DECL_NAME (old_decl)); + DECL_NAME (new_decl) = clone_function_name (name, suffix, cl_num); + SET_DECL_ASSEMBLER_NAME (new_decl, + clone_function_name (old_decl, suffix, cl_num)); + cl_num++; + if (dump_file) + fprintf (dump_file, "\tNew name %s\n", + IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (new_decl))); + + cl_node = cnode->create_clone (new_decl, cnode->count /*profile_count*/, + false /*update_original*/, redirect_callers, + false /*call_duplication_hook*/, + NULL /*new_inlined_to*/, + NULL /*param_adjustments*/, suffix); + + set_new_clone_decl_and_node_flags (cl_node); + + if (cnode->ipa_transforms_to_apply.exists ()) + cl_node->ipa_transforms_to_apply + = cnode->ipa_transforms_to_apply.copy (); + + if (dump_file) + { + fprintf (dump_file, "Cloned Node: %s %s\n", cnode->dump_asm_name (), + cl_node->dump_asm_name ()); + + for (edge = cl_node->callers; edge; edge = edge->next_caller) + fprintf (dump_file, "Redirected callers: %s\n", + edge->caller->dump_asm_name ()); + + for (edge = cl_node->callees; edge; edge = edge->next_callee) + fprintf (dump_file, "Callees of clone: %s %d\n", + edge->callee->dump_asm_name (), edge->frequency ()); + } + return cl_node; +} + +/* Redirect recursive edges of CLONE so that they correctly point to CLONE. + As part of the cloning process, all callee edges of a node are just + duplicated but not redirected. Therefore, these edges still call the + original of CLONE. + + For non-inlined CLONEs, NEW_CALLEE == CLONE and ORIG_CALLEE is CLONE's + original node. + + For an inlined node, self-recursion to CLONE's original is handled the same + as in the non-inlined case; additionally, calls to CLONE->inlined_to are + also recursive: + NEW_CALLEE == CLONE->inlined_to and + ORIG_CALLEE == original node of CLONE->inlined_to. */ + +static void +adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee, + cgraph_node *orig_callee) +{ + cgraph_node *alias = NULL; + for (cgraph_edge *e = clone->callees; e; e = e->next_callee) + { + if (!e->inline_failed) + continue; + + /* Only self-cycles or local aliases are handled. */ + cgraph_node *callee = e->callee; + if (callee == orig_callee) + { + cgraph_node **cl = node_to_clone.get (orig_callee); + gcc_assert (cl && *cl == new_callee); + e->redirect_callee_duplicating_thunks (new_callee); + if (dump_file) + fprintf (dump_file, "recursive call from %s to %s orig %s\n", + e->caller->dump_asm_name (), e->callee->dump_asm_name (), + callee->dump_asm_name ()); + } + else if (callee->alias + && e->callee->ultimate_alias_target () == orig_callee) + { + if (!alias) + { + alias = dyn_cast<cgraph_node *> ( + new_callee->noninterposable_alias ()); + } + e->redirect_callee_duplicating_thunks (alias); + if (dump_file) + fprintf (dump_file, "recursive call from %s to %s orig %s\n", + e->caller->dump_asm_name (), e->callee->dump_asm_name (), + callee->dump_asm_name ()); + } + } + new_callee->expand_all_artificial_thunks (); + if (alias) + alias->expand_all_artificial_thunks (); +} + +/* Create clones for CALLER's inlined callees; ORIG_INLINED_TO is the original + node from clone_node_as_needed () such that new_inlined_to is a clone of it. 
*/ + +static void +inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to) +{ + struct cgraph_edge *edge; + for (edge = caller->callees; edge; edge = edge->next_callee) + { + struct cgraph_node *callee = edge->callee; + if (edge->inline_failed) + continue; + + if (callee->inlined_to != orig_inlined_to) + continue; + + struct cgraph_node *new_inlined_to, *cl; + if (caller->inlined_to) + new_inlined_to = caller->inlined_to; + else + new_inlined_to = caller; + + cl = callee->create_clone (callee->decl, + edge->count /*profile_count*/, + true /*update_original*/, + vNULL /*redirect_callers*/, + false /*call_duplication_hook*/, + new_inlined_to /*new_inlined_to*/, + NULL /*param_adjustments*/, + "locality_clone" /*suffix*/); + edge->redirect_callee (cl); + + node_to_clone.put (callee, cl); + clone_to_node.put (cl, callee); + + if (callee->thunk) + { + thunk_info *info = thunk_info::get (callee); + *thunk_info::get_create (cl) = *info; + } + + adjust_recursive_callees (cl, new_inlined_to, orig_inlined_to); + adjust_recursive_callees (cl, cl, callee); + if (dump_file) + { + fprintf (dump_file, "Inline cloned\n"); + cl->dump (dump_file); + } + + /* Recursively inline till end of this callchain. */ + inline_clones (cl, orig_inlined_to); + } +} + +/* Clone EDGE->CALLEE if it or a clone of it is not already in PARTITION. + Redirect all callers of EDGE->CALLEE that are in PARTITION, not just the + EDGE. If a clone is already present in PARTITION, redirect all edges from + EDGE->CALLER to EDGE->CALLEE. This is because we only visit one edge per + caller to callee and redirect for all others from there. + + If cloning, also recursively clone inlined functions till the end of the + callchain because inlined clones have 1-1 exclusive copy and edge from + caller to inlined node. + + There are 2 flows possible: + 1. Only redirect + 1.1. cnode is already in current partition - cnode mustn't be a + locality_clone -> nothing to do + 1.2. A clone of cnode is in current partition - find out if it's the + correct clone for edge - must be a locality_clone but the exact same + kind as callee i.e. orig or cp/sra clone, if yes, redirect, else go to #2 + 1.3. Cnode/a clone of cnode is in current partition but caller is inlined + 2. Clone and redirect + 2.1. cnode is original node + 2.2. cnode itself is a clone + Clone inlines + Flavors of edges: + 1. Normal -> orig nodes, locality clones or cp/sra clones + 2. Recursive -> direct recursion + 3. Alias -> recursion via aliasing or as a result of IPA code duplication + 4. Inline -> shouldn't be included in callchain. */ + +static cgraph_node * +clone_node_as_needed (cgraph_edge *edge, locality_partition partition, + int &cl_num, lto_locality_cloning_model cm) +{ + /* suitable_for_locality_cloning_p () currently prohibits cloning aliases due + to potential versioning and materialization issues. Could be enabled in + the future. suitable_for_locality_cloning_p () also checks for + interposability for CNODE but not for edge redirection. */ + struct cgraph_node *cnode = edge->callee; + struct cgraph_node *caller = edge->caller; + + /* If clone of cnode is already in the partition + Get latest clone of cnode. If current partition has cloned cnode, that + clone should be returned. 
Otherwise, the clone from a previous partition is + returned. + Original node and its clone shouldn't co-exist in the current partition. + + This is required if the callee is partitioned via another edge before the + caller was, and we are now visiting the caller->callee edge + + 1) a -> b ==> a -> bc1; b was cloned say via d -> bc1, a is orig + 2) ac1 -> b ==> ac1 -> bc1; b was cloned and a was just cloned + 3) a -> bc1 and bc2 present, mustn't happen, b was cloned and a was + redirected without being partitioned first. + Why will we do this again - multiple edges and something's wrong in + partition_callchain () + 4) ac1 -> bc1 ==> ac1 -> bc2; a was cloned and we already got (1) in some + other partition + 5) ac1 -> bc1 but no clone present in this PARTITION. Create from b, not + from bc1? + 6) a -> b; a -> bc0; create new clone, no clone present + 7) ac0 -> b; ac0 -> bc0 same as (6) + 8) a -> bc0 and no clone present, mustn't happen, same as (3) + + Redirect when bc1 is present and: + a -> b or ac -> b or ac -> bc0 */ + + cgraph_node *orig_cnode = cnode; + cgraph_node **o_cnode = clone_to_node.get (cnode); + if (o_cnode) + orig_cnode = *o_cnode; + + cgraph_node **cnode_cl = node_to_clone.get (orig_cnode); + + if (cnode_cl && node_in_partition_p (partition, *cnode_cl)) + { + if (node_in_partition_p (partition, caller)) + { + bool clone_p = false; + auto_vec<cgraph_edge *> redirected_edges; + for (cgraph_edge *ec = caller->callees; ec; ec = ec->next_callee) + if (ec->callee == cnode && edge_redirectable_p (ec, cm)) + { + ec->redirect_callee_duplicating_thunks (*cnode_cl); + clone_p = true; + redirected_edges.safe_push (ec); + if (dump_file) + { + fprintf (dump_file, "clone present %s %s redirecting %s\n", + cnode->dump_asm_name (), + (*cnode_cl)->dump_asm_name (), + caller->dump_asm_name ()); + } + } + if (clone_p) + { + (*cnode_cl)->expand_all_artificial_thunks (); + adjust_profile_info_for_non_self_rec_edges (redirected_edges, + *cnode_cl, cnode); + return NULL; + } + } + } + + /* Create a new clone for a -> b, ac -> b. + For ac -> bc, should be done on bc or b? + bc could be from b_cp/b_sra or b. */ + + if (orig_cnode != cnode) + { + if (dump_file) + fprintf (dump_file, "Clone of clone %s %s\n", cnode->dump_asm_name (), + orig_cnode->dump_asm_name ()); + return NULL; + } + + struct cgraph_node *cloned_node + = create_locality_clone (cnode, partition, cl_num, cm); + + gcc_assert (cloned_node); + if (!cloned_node) + return NULL; + + node_to_clone.put (cnode, cloned_node); + clone_to_node.put (cloned_node, cnode); + + adjust_recursive_callees (cloned_node, cloned_node, cnode); + symtab->call_cgraph_duplication_hooks (cnode, cloned_node); + + adjust_profile_info (cloned_node, cnode); + /* Inline clones are created iff their inlined_to == CNODE. */ + inline_clones (cloned_node, cnode); + + return cloned_node; +} + +/* Accumulate frequency of all edges from EDGE->caller to EDGE->callee. */ + +static sreal +accumulate_incoming_edge_frequency (cgraph_edge *edge) +{ + sreal count = 0; + struct cgraph_edge *e; + for (e = edge->callee->callers; e; e = e->next_caller) + { + /* Make a local decision about all edges for EDGE->caller but not the + other nodes already in the partition. Their edges will be visited + later or may have been visited before and not fit the + cut-off criteria. */ + if (e->caller == edge->caller) + count += e->sreal_frequency (); + } + return count; +} + +/* Determine if EDGE->CALLEE is suitable for cloning. It is assumed that the + callee is not an inlined node. 
*/ + +static bool +suitable_for_locality_cloning_p (cgraph_edge *edge, + lto_locality_cloning_model cm) +{ + cgraph_node *node = edge->callee; + if (!node->versionable) + return false; + + /* Out-of-line locality clones of ipcp or sra clones will be created in this + pass after IPA inline is run. A locality clone has the same function + body and the same updated signature as the ipcp/sra clone. + This fails or asserts based on how the clone is created: + 1. If param_adjustments and tree_map are not recorded for locality clone: + clone materialization (tree_function_versioning ()) fails when + updating signature and remapping calls because clone_of (ipcp/sra + clone) and locality clone differ in param information. + 2. If param_adjustments and tree_map are provided: asserts are triggered + in fnsummary duplication because IPA inline resets some summaries. + + One inelegant solution is to provide param_adjustments and tree_map, and + then set clone_of to ipcp/sra clone's clone_of. However, this sometimes + results in a segmentation fault when the compiled program is run. + Cloning of clones is therefore disabled altogether for now, with an aim to + resolve this in the future. */ + if (node->clone_of) + return false; + + if (node->alias) + return false; + + if (edge->recursive_p ()) + return false; + + if (!node->definition) + return false; + + /* Don't clone NODE if IPA count of NODE or EDGE is zero. */ + if (!node->count.ipa ().nonzero_p () || !edge->count.ipa ().nonzero_p ()) + return false; + + if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING) + { + /* Interposability may change on a per-edge basis. */ + enum availability avail; + edge->callee->ultimate_alias_target (&avail, edge->caller); + if (avail <= AVAIL_INTERPOSABLE) + return false; + } + + return true; +} + +/* Map from caller to all callees already visited for partitioning. */ +hash_map<cgraph_node *, auto_vec<cgraph_node *> > caller_to_callees; + +/* Partition EDGE->CALLEE into PARTITION, or clone it if it is already + partitioned and satisfies the cloning criteria such as CLONING_MODEL, the + REAL_FREQ and SIZE cut-offs, and CLONE_FURTHER_P set by the previous + caller. */ + +/* The callgraph can have multiple caller to callee edges for multiple + callsites. For the first such edge, we make decisions about cut-offs and + cloning because we redirect ALL callsites to the cloned callee, not just one + of them. */ + +static void +partition_callchain (cgraph_edge *edge, locality_partition partition, + bool clone_further_p, + lto_locality_cloning_model cloning_model, + double freq_cutoff, int size, int &cl_num) +{ + /* Aliases are added in the same partition as their targets. + Aliases are not cloned and their callees are not processed separately. */ + cgraph_node *node = edge->callee->ultimate_alias_target (); + cgraph_node *caller = edge->caller; + cgraph_node *caller_node = node, *cl_node = NULL; + + /* Already visited the caller to callee edges. 
*/ + auto_vec<cgraph_node *> &callees = caller_to_callees.get_or_insert (caller); + if (std::find (callees.begin (), callees.end (), node) != callees.end ()) + return; + + callees.safe_push (node); + + if (node->get_partitioning_class () == SYMBOL_PARTITION) + { + if (!node_partitioned_p (node)) + { + add_node_to_partition (partition, node); + if (dump_file) + fprintf (dump_file, "Partitioned node: %s\n", + node->dump_asm_name ()); + } + else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING + && !node_in_partition_p (partition, node)) + { + /* Non-inlined node, or alias, already partitioned + If cut-off, don't clone callees but partition unpartitioned + callees. + size is node + inlined nodes. */ + if (clone_further_p) + { + if (!node->alias) + if (ipa_size_summaries->get (node)->size >= size) + clone_further_p = false; + + if (freq_cutoff != 0.0) + { + sreal acc_freq = accumulate_incoming_edge_frequency (edge); + if (acc_freq.to_double () < freq_cutoff) + clone_further_p = false; + } + } + + if (!suitable_for_locality_cloning_p (edge, cloning_model)) + clone_further_p = false; + + if (clone_further_p) + { + /* Try to clone NODE and its inline chain. */ + if (dump_file) + fprintf (dump_file, "Cloning node: %s\n", + node->dump_asm_name ()); + cl_node = clone_node_as_needed (edge, partition, cl_num, + cloning_model); + if (cl_node) + { + add_node_to_partition (partition, cl_node); + caller_node = cl_node; + } + else + caller_node = NULL; + } + } + } + else if (!node->inlined_to) + return; + + if (caller_node) + for (cgraph_edge *e = caller_node->callees; e; e = e->next_callee) + partition_callchain (e, partition, clone_further_p, cloning_model, + freq_cutoff, size, cl_num); +} + +/* Determine whether NODE is an entrypoint to a callchain. */ + +static bool +is_entry_node_p (cgraph_node *node) +{ + /* node->inlined_to is returned as SYMBOL_DUPLICATE. */ + if (node->get_partitioning_class () != SYMBOL_PARTITION) + return false; + + if (!node->callers) + return true; + + for (cgraph_edge *e = node->callers; e; e = e->next_caller) + { + if (! e->recursive_p ()) + return false; + } + if (node->alias + && !is_entry_node_p (node->ultimate_alias_target ())) + return false; + return true; +} + +/* Determine order of all external nodes if PGO profile is available. + Store the order in ORDER. */ + +static bool +locality_determine_ipa_order (auto_vec<locality_order *> *order) +{ + struct cgraph_node *node; + auto_vec<locality_order *> non_comparable_nodes; + FOR_EACH_DEFINED_FUNCTION (node) + if (node->get_partitioning_class () == SYMBOL_PARTITION) + { + if (node->no_reorder) + { + if (dump_file) + fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ()); + return false; + } + else if (is_entry_node_p (node)) + { + profile_count pcnt = node->count.ipa (); + if (!pcnt.initialized_p () || !pcnt.ipa_p ()) + { + sreal cnt = 0; + locality_order *lo = new locality_order (node, cnt); + non_comparable_nodes.safe_push (lo); + continue; + } + sreal count = 0; + struct cgraph_edge *edge; + for (edge = node->callees; edge; edge = edge->next_callee) + { + /* For PGO, frequency is not used in + compare_edge_profile_counts (), it's used only as part of + static profile order. 
*/ + sreal freq = edge->sreal_frequency (); + count += freq; + } + locality_order *cl = new locality_order (node, count); + order->safe_push (cl); + } + } + order->qsort (compare_edge_profile_counts); + for (auto el : non_comparable_nodes) + order->safe_push (el); + return true; +} + +/* Determine order of all external nodes if only a static profile is + available. Store the order in ORDER. */ + +static bool +locality_determine_static_order (auto_vec<locality_order *> *order) +{ + struct cgraph_node *node; + FOR_EACH_DEFINED_FUNCTION (node) + if (node->get_partitioning_class () == SYMBOL_PARTITION) + { + if (node->no_reorder) + { + if (dump_file) + fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ()); + return false; + } + else if (is_entry_node_p (node)) + { + sreal count = 0; + struct cgraph_edge *edge; + for (edge = node->callees; edge; edge = edge->next_callee) + { + sreal freq = edge->sreal_frequency (); + count += freq; + } + locality_order *cl = new locality_order (node, count); + order->safe_push (cl); + } + } + order->qsort (static_profile_cmp); + return true; +} + +/* Partitioning for code locality. + 1. Create and sort callchains. If PGO is available, use real profile + counts. Otherwise, use a set of heuristics to sort the callchains. + 2. Partition the external nodes and their callchains in the determined + order. + 2.1. If a node is not yet partitioned, partition it; otherwise try to + clone it if it satisfies the cloning criteria. + 3. Partition all other unpartitioned nodes. */ + +static void +locality_partition_and_clone (int max_locality_partition_size, + lto_locality_cloning_model cloning_model, + int freq_denominator, int size) +{ + locality_partition partition; + int npartitions = 0; + + auto_vec<locality_order *> order; + auto_vec<varpool_node *> varpool_order; + struct cgraph_node *node; + bool order_p; + + int cl_num = 0; + + double real_freq = 0.0; + if (freq_denominator > 0) + real_freq = 1.0 / (double) freq_denominator; + + cgraph_node *n = symtab->first_defined_function (); + if (n && n->count.ipa_p ()) + order_p = locality_determine_ipa_order (&order); + else + order_p = locality_determine_static_order (&order); + if (!order_p) + { + if (dump_file) + { + fprintf (dump_file, "Locality partition: falling back to balanced " + "model\n"); + } + + return; + } + + int64_t partition_size + = max_locality_partition_size + ? max_locality_partition_size : param_max_partition_size; + partition = create_partition (npartitions); + + for (unsigned i = 0; i < order.length (); i++) + { + node = order[i]->node; + if (node_partitioned_p (node)) + continue; + + if (partition->insns > partition_size) + partition = create_partition (npartitions); + if (dump_file) + fprintf (dump_file, "Partition id: %d\n", partition->part_id); + + add_node_to_partition (partition, node); + if (dump_file) + fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ()); + + for (cgraph_edge *edge = node->callees; edge; edge = edge->next_callee) + { + /* Recursively partition the callchain of edge->callee. */ + partition_callchain (edge, partition, true, cloning_model, real_freq, + size, cl_num); + } + } + + for (unsigned i = 0; i < order.length (); i++) + delete order[i]; + order = vNULL; +} + +/* Entry point to locality-clone pass. 
*/ +static int +lc_execute (void) +{ + symtab_node *node; + FOR_EACH_SYMBOL (node) + node->aux = NULL; + + locality_partition_and_clone (param_max_locality_partition_size, + flag_lto_locality_cloning, + param_lto_locality_frequency, + param_lto_locality_size); + + FOR_EACH_SYMBOL (node) + node->aux = NULL; + return 0; +} + +namespace { + +const pass_data pass_data_ipa_locality_clone = { + IPA_PASS, /* type */ + "locality-clone", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_IPA_LC, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + (TODO_dump_symtab | TODO_remove_functions), /* todo_flags_finish */ +}; + +class pass_ipa_locality_cloning : public ipa_opt_pass_d +{ +public: + pass_ipa_locality_cloning (gcc::context *ctxt) + : ipa_opt_pass_d (pass_data_ipa_locality_clone, ctxt, + NULL, /* generate_summary */ + NULL, /* write_summary */ + NULL, /* read_summary */ + NULL, /* write_optimization_summary */ + NULL, /* read_optimization_summary */ + NULL, /* stmt_fixup */ + 0, /* function_transform_todo_flags_start */ + NULL, /* function_transform */ + NULL) /* variable_transform */ + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (flag_wpa && flag_ipa_reorder_for_locality); + } + + virtual unsigned int execute (function *) { return lc_execute (); } + +}; // class pass_ipa_locality_cloning + +} // namespace + +ipa_opt_pass_d * +make_pass_ipa_locality_cloning (gcc::context *ctxt) +{ + return new pass_ipa_locality_cloning (ctxt); +} |
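A minimal, self-contained C++ sketch of the placement idea described in the file's header comment follows. It is not GCC code: the Node struct, the sizes and counts, the ".locality_clone" naming and the partition limit of 64 are invented for illustration, and the real pass additionally redistributes profile counts, handles aliases, thunks and inline clones, and redirects actual callgraph edges. The sketch shows only the greedy core: sort entry points by count, pack each callchain into the current size-limited partition, and create a per-partition clone of a callee that is already placed elsewhere so its hot caller stays local.

/* Toy model of locality partitioning with cloning (illustration only).  */
#include <algorithm>
#include <cstdio>
#include <deque>
#include <map>
#include <string>
#include <vector>

struct Node
{
  std::string name;
  int size;                       /* stand-in for ipa_size_summaries size */
  double count;                   /* stand-in for the IPA profile count */
  std::vector<Node *> callees;
  int partition;                  /* 0 = not placed yet (like node->aux) */
};

static std::deque<Node> symtab;            /* owns all nodes, incl. clones */
static std::map<Node *, Node *> clone_of;  /* original -> latest clone */

static Node *
make_node (const char *name, int size, double count)
{
  symtab.push_back (Node {name, size, count, {}, 0});
  return &symtab.back ();
}

/* Place N in partition PART, creating a local clone if N already lives in a
   different partition (a toy analogue of clone_node_as_needed ()), then
   recurse into callees while the partition has room.  */
static void
place_chain (Node *n, int part, int limit, int &insns)
{
  if (n->partition != 0 && n->partition != part)
    {
      /* Hot callee already placed elsewhere: reuse or create a clone.  */
      Node *&cl = clone_of[n];
      if (!cl || cl->partition != part)
        {
          symtab.push_back (Node {n->name + ".locality_clone", n->size,
                                  n->count, n->callees, 0});
          cl = &symtab.back ();
        }
      n = cl;
    }
  if (n->partition == part)
    return;                       /* already local to this partition */
  n->partition = part;
  insns += n->size;
  for (Node *callee : n->callees)
    if (insns <= limit)           /* crude stand-in for the size cut-off */
      place_chain (callee, part, limit, insns);
}

int
main ()
{
  Node *c = make_node ("c", 40, 900);
  Node *a = make_node ("a", 30, 1000);
  Node *b = make_node ("b", 30, 500);
  a->callees = {c};
  b->callees = {c};

  /* Entry nodes sorted by descending count, as the IPA order does.  */
  std::vector<Node *> order = {a, b};
  std::sort (order.begin (), order.end (),
             [] (Node *x, Node *y) { return x->count > y->count; });

  int part = 1, insns = 0;
  const int limit = 64;
  for (Node *entry : order)
    {
      if (insns > limit)          /* current partition is full; open a new one */
        {
          part++;
          insns = 0;
        }
      place_chain (entry, part, limit, insns);
    }

  for (const Node &n : symtab)
    std::printf ("%-20s -> partition %d\n", n.name.c_str (), n.partition);
  return 0;
}

Built with any C++11 compiler, the sketch places c together with its hottest caller a in partition 1, while b receives a local c.locality_clone in partition 2 instead of calling across partitions, which mirrors the behavior the pass aims for.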