Diffstat (limited to 'gcc/ipa-locality-cloning.cc')
-rw-r--r--  gcc/ipa-locality-cloning.cc | 1137
1 file changed, 1137 insertions, 0 deletions
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc
new file mode 100644
index 0000000..2684046
--- /dev/null
+++ b/gcc/ipa-locality-cloning.cc
@@ -0,0 +1,1137 @@
+/* Code locality based function cloning.
+ Copyright The GNU Toolchain Authors
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+/* This file implements cloning required to improve partitioning of the
+ callgraph for locality considerations.
+
+ Partitioning for improving code locality.
+ This pass aims to place frequently executed callchains closer together in
+ memory to improve performance through improved locality. If any frequent
+ callchains cannot be placed together because their functions are already
+ placed elsewhere, local function clones are created and all callers placed
+ near a clone are redirected to use that copy.
+
+ Locality code placement is done in 2 parts.
+ 1. An IPA pass executed after ipa-inline and before ipa-pure-const.
+ Its execute stage prepares the plan to place all nodes into partitions.
+ 2. The WPA partitioning stage actually implements the plan.
+
+ Brief overview of the IPA pass:
+ 1. Create and sort callchains. If PGO is available, use real profile
+ counts. Otherwise, use a set of heuristics to sort the callchains.
+ 2. Create a partition plan for the callchains, processing them in the sorted
+ order.
+ 1. If a function is unpartitioned, place it in the current partition.
+ 2. If a function is already placed in a partition away from the current
+ partition as part of another callchain:
+ Create a local clone in the current partition, if the cloning criteria
+ are satisfied.
+ 3. Redirect any new caller to a local clone if one exists.
+ Partition size is controlled by a param to fine-tune per-program
+ behavior. */
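+
+/* For example, with two hot chains main1 -> a -> b and main2 -> a -> b, the
+ first chain places main1, a and b into partition 1. When main2 is later
+ placed into partition 2, a and b are already partitioned elsewhere, so
+ local clones of a and b may be created in partition 2 (provided the cloning
+ criteria are satisfied) and main2 is redirected to them. */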
+
+#include "config.h"
+#define INCLUDE_ALGORITHM
+#include "system.h"
+#include "coretypes.h"
+#include "target.h"
+#include "function.h"
+#include "tree.h"
+#include "alloc-pool.h"
+#include "tree-pass.h"
+#include "cgraph.h"
+#include "symbol-summary.h"
+#include "tree-vrp.h"
+#include "symtab-thunks.h"
+#include "sreal.h"
+#include "ipa-cp.h"
+#include "ipa-prop.h"
+#include "ipa-fnsummary.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
+#include "symtab-clones.h"
+#include "ipa-locality-cloning.h"
+
+/* Locality partitions; each holds the nodes assigned to it. These are used
+ later in WPA partitioning. */
+vec<locality_partition> locality_partitions;
+
+/* Map from original node to its latest clone. Gets overwritten whenever a new
+ clone is created from the same node. */
+hash_map<cgraph_node *, cgraph_node *> node_to_clone;
+/* Map from clone to its original node. */
+hash_map<cgraph_node *, cgraph_node *> clone_to_node;
+
+/* Data structure to hold static heuristics and orders for cgraph_nodes. */
+struct locality_order
+{
+ cgraph_node *node;
+ sreal order;
+ locality_order (cgraph_node *node, sreal order) : node (node), order (order)
+ {}
+};
+
+/* Return true if NODE is already in some partition. */
+static inline bool
+node_partitioned_p (cgraph_node *node)
+{
+ return node->aux;
+}
+
+/* Add symbol NODE to partition PART. */
+static void
+add_node_to_partition (locality_partition part, cgraph_node *node)
+{
+ struct cgraph_edge *e;
+ if (node_partitioned_p (node))
+ return;
+
+ part->nodes.safe_push (node);
+ node->aux = (void *) (uintptr_t) (part->part_id);
+
+ if (!node->alias && node->get_partitioning_class () == SYMBOL_PARTITION)
+ part->insns += ipa_size_summaries->get (node)->size;
+
+ /* Add all inline clones and callees that are duplicated. */
+ for (e = node->callees; e; e = e->next_callee)
+ if (!e->inline_failed)
+ add_node_to_partition (part, e->callee);
+ /* omp declare_variant_alt or transparent_alias with definition or linker
+ discardable (non-local comdat but not forced and not
+ used by non-LTO). */
+ else if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE)
+ add_node_to_partition (part, e->callee);
+
+ /* Add all thunks associated with the function. */
+ for (e = node->callers; e; e = e->next_caller)
+ if (e->caller->thunk && !e->caller->inlined_to)
+ add_node_to_partition (part, e->caller);
+
+ /* Add all aliases associated with the symbol. */
+ struct ipa_ref *ref;
+ FOR_EACH_ALIAS (node, ref)
+ if (!ref->referring->transparent_alias)
+ {
+ cgraph_node *referring = dyn_cast<cgraph_node *> (ref->referring);
+ /* Only add function aliases.
+ Varpool refs are added later in LTO partitioning pass. */
+ if (referring)
+ add_node_to_partition (part, referring);
+ }
+ else
+ {
+ struct ipa_ref *ref2;
+ /* We do not need to add transparent aliases if they are not used.
+ However we must add aliases of transparent aliases if they exist. */
+ FOR_EACH_ALIAS (ref->referring, ref2)
+ {
+ /* Nested transparent aliases are not permitted. */
+ gcc_checking_assert (!ref2->referring->transparent_alias);
+ cgraph_node *referring = dyn_cast<cgraph_node *> (ref2->referring);
+ if (referring)
+ add_node_to_partition (part, referring);
+ }
+ }
+}
+
+/* Return TRUE if NODE is in PARTITION. */
+static bool
+node_in_partition_p (locality_partition partition, cgraph_node *node)
+{
+ return ((uintptr_t) (partition->part_id) == (uintptr_t) (node->aux));
+}
+
+/* Helper function for qsort; break ties by comparing node uids. */
+static int
+compare_node_uids (cgraph_node *n1, cgraph_node *n2)
+{
+ int res = n1->get_uid () - n2->get_uid ();
+ gcc_assert (res != 0);
+ return res > 0 ? 1 : -1;
+}
+
+/* Helper function for qsort; sort nodes by order. */
+static int
+static_profile_cmp (const void *pa, const void *pb)
+{
+ const locality_order *a = *static_cast<const locality_order *const *> (pa);
+ const locality_order *b = *static_cast<const locality_order *const *> (pb);
+ /* Ascending order. */
+ if (b->order < a->order)
+ return 1;
+ if (b->order > a->order)
+ return -1;
+ return compare_node_uids (a->node, b->node);
+}
+
+/* Helper function for qsort; sort nodes by profile count. */
+static int
+compare_edge_profile_counts (const void *pa, const void *pb)
+{
+ const locality_order *a = *static_cast<const locality_order *const *> (pa);
+ const locality_order *b = *static_cast<const locality_order *const *> (pb);
+
+ profile_count cnt1 = a->node->count.ipa ();
+ profile_count cnt2 = b->node->count.ipa ();
+ if (!cnt1.compatible_p (cnt2))
+ return static_profile_cmp (pa, pb);
+
+ if (cnt1 < cnt2)
+ return 1;
+ if (cnt1 > cnt2)
+ return -1;
+ return static_profile_cmp (pa, pb);
+}
+
+/* Create and return a new partition and increment NPARTITIONS. */
+
+static locality_partition
+create_partition (int &npartitions)
+{
+ locality_partition part = XCNEW (struct locality_partition_def);
+ npartitions++;
+ part->part_id = npartitions;
+ part->nodes.create (1);
+ part->insns = 0;
+ locality_partitions.safe_push (part);
+ return part;
+}
+
+/* Structure for holding profile count information of callers of a node. */
+struct profile_stats
+{
+ /* Sum of non-recursive call counts. */
+ profile_count nonrec_count;
+
+ /* Sum of recursive call counts. */
+ profile_count rec_count;
+
+ /* If non-NULL, this node is the target of an alias or thunk, and calls from
+ it should be counted in rec_count. */
+ cgraph_node *target;
+};
+
+/* Initialize fields of STATS. */
+static inline void
+init_profile_stats (profile_stats *stats, cgraph_node *target = NULL)
+{
+ stats->nonrec_count = profile_count::zero ();
+ stats->rec_count = profile_count::zero ();
+ stats->target = target;
+}
+
+/* Helper function to accumulate call counts. */
+static bool
+accumulate_profile_counts_after_cloning (cgraph_node *node, void *data)
+{
+ struct profile_stats *stats = (struct profile_stats *) data;
+ for (cgraph_edge *e = node->callers; e; e = e->next_caller)
+ {
+ if (!e->count.initialized_p ())
+ continue;
+
+ if (e->caller == stats->target)
+ stats->rec_count += e->count.ipa ();
+ else
+ stats->nonrec_count += e->count.ipa ();
+ }
+ return false;
+}
+
+/* NEW_NODE is a previously created clone of ORIG_NODE already present in the
+ current partition. EDGES contains the newly redirected edges to NEW_NODE.
+ Adjust profile information for both nodes and scale their callee edges. */
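+/* E.g. if ORIG_NODE's IPA count is 800, NEW_NODE's current count is 300 and
+ the newly redirected EDGES sum to 100, then ORIG_NODE is left with 700 and
+ NEW_NODE ends up with 400; the callee edges of both nodes are then scaled
+ accordingly. */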
+
+static void
+adjust_profile_info_for_non_self_rec_edges (auto_vec<cgraph_edge *> &edges,
+ cgraph_node *new_node,
+ cgraph_node *orig_node)
+{
+ profile_count orig_node_count = orig_node->count.ipa ();
+ profile_count edge_count = profile_count::zero ();
+ profile_count final_new_count = profile_count::zero ();
+ profile_count final_orig_count = profile_count::zero ();
+
+ for (unsigned i = 0; i < edges.length (); ++i)
+ if (edges[i]->count.initialized_p ())
+ edge_count += edges[i]->count.ipa ();
+
+ final_orig_count = orig_node_count - edge_count;
+
+ /* NEW_NODE->count was adjusted for other callers when the clone was
+ first created. Just add the new edge count. */
+ final_new_count = new_node->count + edge_count;
+
+ final_new_count = orig_node_count.combine_with_ipa_count (final_new_count);
+ orig_node->count = final_orig_count;
+ new_node->count = final_new_count;
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adjusting profile information for %s\n",
+ new_node->dump_asm_name ());
+ fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ());
+ fprintf (dump_file, "\tOriginal count: ");
+ orig_node_count.dump (dump_file);
+ fprintf (dump_file, "\n\tAdjusted original count to: ");
+ final_orig_count.dump (dump_file);
+ fprintf (dump_file, "\n\tAdjusted clone count to: ");
+ final_new_count.dump (dump_file);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Scale all callee edges according to adjusted counts. */
+ profile_count orig_node_count_copy = orig_node_count;
+ profile_count::adjust_for_ipa_scaling (&final_new_count,
+ &orig_node_count_copy);
+ for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
+ for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
+
+ profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count);
+ for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
+ for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
+}
+
+/* Adjust profile counts of NEW_NODE and ORIG_NODE, where NEW_NODE is a clone
+ of ORIG_NODE.
+ Assumes that all eligible edges from the current partition so far are
+ redirected to NEW_NODE and that recursive edges have been adjusted. */
+
+static void
+adjust_profile_info (cgraph_node *new_node, cgraph_node *orig_node)
+{
+ /* If all calls to NEW_NODE are non-recursive, subtract the corresponding
+ count from ORIG_NODE and assign it to NEW_NODE; any unexpected remainder
+ stays with ORIG_NODE.
+ Recursive calls, if present, likely contribute the majority of the count;
+ scale according to the redirected callers' counts. */
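+
+ /* For instance, assuming ORIG_NODE is local (so no pretend count is
+ needed): with an original IPA count of 1000, non-recursive calls of 200
+ into NEW_NODE and 300 into ORIG_NODE, the recursive remainder
+ 1000 - 200 - 300 = 500 is split 200:300, so NEW_NODE ends up with
+ 200 + 200 = 400 and ORIG_NODE with 1000 - 400 = 600. */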
+
+ profile_count orig_node_count = orig_node->count.ipa ();
+ profile_stats new_stats, orig_stats;
+
+ init_profile_stats (&new_stats);
+ init_profile_stats (&orig_stats);
+
+ new_node->call_for_symbol_thunks_and_aliases
+ (accumulate_profile_counts_after_cloning, &new_stats, false);
+ orig_node->call_for_symbol_thunks_and_aliases
+ (accumulate_profile_counts_after_cloning, &orig_stats, false);
+
+ profile_count orig_nonrec_count = orig_stats.nonrec_count;
+ profile_count orig_rec_count = orig_stats.rec_count;
+ profile_count new_nonrec_count = new_stats.nonrec_count;
+ profile_count new_rec_count = new_stats.rec_count;
+
+ profile_count final_new_count = new_nonrec_count;
+ profile_count final_orig_count = profile_count::zero ();
+
+ /* All calls to NEW_NODE are non-recursive or recursive calls have
+ zero count. */
+ if (!new_rec_count.nonzero_p ())
+ final_orig_count = orig_node_count - new_nonrec_count;
+ else
+ {
+ /* If ORIG_NODE is externally visible, indirect calls or calls from
+ another part of the code may contribute to the count.
+ update_profiling_info () from ipa-cp.cc pretends to have an extra
+ caller to represent the extra counts. */
+ if (!orig_node->local)
+ {
+ profile_count pretend_count = (orig_node_count - new_nonrec_count -
+ orig_nonrec_count - orig_rec_count);
+ orig_nonrec_count += pretend_count;
+ }
+
+ /* Remaining rec_count is assigned in proportion to clone's non-recursive
+ count. */
+ profile_count rec_count = orig_node_count - new_nonrec_count
+ - orig_nonrec_count;
+ profile_count new_rec_scaled
+ = rec_count.apply_scale (new_nonrec_count,
+ new_nonrec_count + orig_nonrec_count);
+ final_new_count += new_rec_scaled;
+ final_orig_count = orig_node_count - final_new_count;
+ }
+
+ final_new_count = orig_node_count.combine_with_ipa_count (final_new_count);
+ new_node->count = final_new_count;
+ orig_node->count = final_orig_count;
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Adjusting profile information for %s\n",
+ new_node->dump_asm_name ());
+ fprintf (dump_file, "\tOriginal node %s\n", orig_node->dump_asm_name ());
+ fprintf (dump_file, "\tOriginal count: ");
+ orig_node_count.dump (dump_file);
+ fprintf (dump_file, "\n\tAdjusted original count to: ");
+ final_orig_count.dump (dump_file);
+ fprintf (dump_file, "\n\tAdjusted clone count to: ");
+ final_new_count.dump (dump_file);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Scale all callee edges according to adjusted counts. */
+ profile_count orig_node_count_copy = orig_node_count;
+ profile_count::adjust_for_ipa_scaling (&final_new_count,
+ &orig_node_count_copy);
+ for (cgraph_edge *cs = new_node->callees; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
+ for (cgraph_edge *cs = new_node->indirect_calls; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_new_count, orig_node_count_copy);
+
+ profile_count::adjust_for_ipa_scaling (&final_orig_count, &orig_node_count);
+ for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
+ for (cgraph_edge *cs = orig_node->indirect_calls; cs; cs = cs->next_callee)
+ cs->count = cs->count.apply_scale (final_orig_count, orig_node_count);
+}
+
+/* Return true if EDGE can be safely redirected to another callee. */
+static inline bool
+edge_redirectable_p (cgraph_edge *edge, lto_locality_cloning_model cm)
+{
+ if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING)
+ {
+ /* Interposability may change on edge basis. */
+ enum availability avail;
+ avail = edge->callee->get_availability (edge->caller);
+ if (avail <= AVAIL_INTERPOSABLE)
+ return false;
+ }
+ return true;
+}
+
+/* Create a locality clone of CNODE and redirect all callers present in
+ PARTITION.
+ The clone is created depending on whether CNODE itself is a clone or not. */
+
+static cgraph_node *
+create_locality_clone (cgraph_node *cnode,
+ locality_partition partition, int &cl_num,
+ lto_locality_cloning_model cm)
+{
+ cgraph_node *cl_node = NULL;
+ vec<cgraph_edge *> redirect_callers = vNULL;
+ /* All callers of cnode in current partition are redirected. */
+ struct cgraph_edge *edge;
+ for (edge = cnode->callers; edge; edge = edge->next_caller)
+ {
+ struct cgraph_node *caller = edge->caller;
+ if (node_in_partition_p (partition, caller) && caller->definition
+ && caller != cnode && edge_redirectable_p (edge, cm))
+ redirect_callers.safe_push (edge);
+ }
+
+ const char *suffix = "locality_clone";
+
+ tree old_decl = cnode->decl;
+ tree new_decl = copy_node (old_decl);
+
+ /* Generate a new name for the new version. */
+ const char *name = IDENTIFIER_POINTER (DECL_NAME (old_decl));
+ DECL_NAME (new_decl) = clone_function_name (name, suffix, cl_num);
+ SET_DECL_ASSEMBLER_NAME (new_decl,
+ clone_function_name (old_decl, suffix, cl_num));
+ cl_num++;
+ if (dump_file)
+ fprintf (dump_file, "\tNew name %s\n",
+ IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (new_decl)));
+
+ cl_node = cnode->create_clone (new_decl, cnode->count /*profile_count*/,
+ false /*update_original*/, redirect_callers,
+ false /*call_duplication_hook*/,
+ NULL /*new_inlined_to*/,
+ NULL /*param_adjustments*/, suffix);
+
+ set_new_clone_decl_and_node_flags (cl_node);
+
+ if (cnode->ipa_transforms_to_apply.exists ())
+ cl_node->ipa_transforms_to_apply
+ = cnode->ipa_transforms_to_apply.copy ();
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Cloned Node: %s %s\n", cnode->dump_asm_name (),
+ cl_node->dump_asm_name ());
+
+ for (edge = cl_node->callers; edge; edge = edge->next_caller)
+ fprintf (dump_file, "Redirected callers: %s\n",
+ edge->caller->dump_asm_name ());
+
+ for (edge = cl_node->callees; edge; edge = edge->next_callee)
+ fprintf (dump_file, "Callees of clone: %s %d\n",
+ edge->callee->dump_asm_name (), edge->frequency ());
+ }
+ return cl_node;
+}
+
+/* Redirect recursive edges of CLONE to correctly point to CLONE. As part of
+ the cloning process, all callee edges of a node are just duplicated but not
+ redirected. Therefore, these edges still call the original of CLONE.
+
+ For non-inlined CLONEs, NEW_CALLEE == CLONE and ORIG_CALLEE is CLONE's
+ original node.
+
+ For an inlined node, self-recursion to CLONE's original is handled the same
+ as for non-inlined nodes; additionally, calls to CLONE->inlined_to are also
+ recursive:
+ NEW_CALLEE == CLONE->inlined_to and
+ ORIG_CALLEE == original node of CLONE->inlined_to. */
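+
+/* E.g. if function f calls itself and CLONE is f's locality clone, the
+ duplicated self-call edge in CLONE still points to f and is redirected to
+ CLONE itself. For an inline clone inlined into f's locality clone, calls
+ back to the original f from within that inline clone are likewise
+ redirected to f's locality clone. */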
+
+static void
+adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee,
+ cgraph_node *orig_callee)
+{
+ cgraph_node *alias = NULL;
+ for (cgraph_edge *e = clone->callees; e; e = e->next_callee)
+ {
+ if (!e->inline_failed)
+ continue;
+
+ /* Only self-cycles or local aliases are handled. */
+ cgraph_node *callee = e->callee;
+ if (callee == orig_callee)
+ {
+ cgraph_node **cl = node_to_clone.get (orig_callee);
+ gcc_assert (cl && *cl == new_callee);
+ e->redirect_callee_duplicating_thunks (new_callee);
+ if (dump_file)
+ fprintf (dump_file, "recursive call from %s to %s orig %s\n",
+ e->caller->dump_asm_name (), e->callee->dump_asm_name (),
+ callee->dump_asm_name ());
+ }
+ else if (callee->alias
+ && e->callee->ultimate_alias_target () == orig_callee)
+ {
+ if (!alias)
+ {
+ alias = dyn_cast<cgraph_node *> (
+ new_callee->noninterposable_alias ());
+ }
+ e->redirect_callee_duplicating_thunks (alias);
+ if (dump_file)
+ fprintf (dump_file, "recursive call from %s to %s orig %s\n",
+ e->caller->dump_asm_name (), e->callee->dump_asm_name (),
+ callee->dump_asm_name ());
+ }
+ }
+ new_callee->expand_all_artificial_thunks ();
+ if (alias)
+ alias->expand_all_artificial_thunks ();
+}
+
+/* Create clones for CALLER's inlined callees; ORIG_INLINED_TO is the original
+ node from clone_node_as_needed () such that NEW_INLINED_TO is a clone of it. */
+
+static void
+inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to)
+{
+ struct cgraph_edge *edge;
+ for (edge = caller->callees; edge; edge = edge->next_callee)
+ {
+ struct cgraph_node *callee = edge->callee;
+ if (edge->inline_failed)
+ continue;
+
+ if (callee->inlined_to != orig_inlined_to)
+ continue;
+
+ struct cgraph_node *new_inlined_to, *cl;
+ if (caller->inlined_to)
+ new_inlined_to = caller->inlined_to;
+ else
+ new_inlined_to = caller;
+
+ cl = callee->create_clone (callee->decl,
+ edge->count /*profile_count*/,
+ true /*update_original*/,
+ vNULL /*redirect_callers*/,
+ false /*call_duplication_hook*/,
+ new_inlined_to /*new_inlined_to*/,
+ NULL /*param_adjustments*/,
+ "locality_clone" /*suffix*/);
+ edge->redirect_callee (cl);
+
+ node_to_clone.put (callee, cl);
+ clone_to_node.put (cl, callee);
+
+ if (callee->thunk)
+ {
+ thunk_info *info = thunk_info::get (callee);
+ *thunk_info::get_create (cl) = *info;
+ }
+
+ adjust_recursive_callees (cl, new_inlined_to, orig_inlined_to);
+ adjust_recursive_callees (cl, cl, callee);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Inline cloned\n");
+ cl->dump (dump_file);
+ }
+
+ /* Recursively inline till end of this callchain. */
+ inline_clones (cl, orig_inlined_to);
+ }
+}
+
+/* Clone EDGE->CALLEE if it or a clone of it is not already in PARTITION.
+ Redirect all callers of EDGE->CALLEE that are in PARTITION, not just the
+ EDGE. If a clone is already present in PARTITION, redirect all edges from
+ EDGE->CALLER to EDGE->CALLEE. This is because we only visit one edge per
+ caller to callee and redirect for all others from there.
+
+ If cloning, also recursively clone inlined functions till the end of the
+ callchain because inlined clones have 1-1 exclusive copy and edge from
+ caller to inlined node.
+
+ There are 2 flows possible:
+ 1. Only redirect
+ 1.1. cnode is already in the current partition - cnode mustn't be a
+ locality_clone -> nothing to do
+ 1.2. A clone of cnode is in the current partition - find out if it's the
+ correct clone for the edge - it must be a locality_clone of the exact same
+ kind as the callee, i.e. orig or cp/sra clone; if yes, redirect, else go
+ to #2
+ 1.3. cnode or a clone of cnode is in the current partition but the caller
+ is inlined
+ 2. Clone and redirect
+ 2.1. cnode is original node
+ 2.2. cnode itself is a clone
+ Clone inlines
+ Flavors of edges:
+ 1. Normal -> orig nodes, locality clones or cp/sra clones
+ 2. Recursive -> direct recursion
+ 3. Alias -> recursion via aliasing or as a result of IPA code duplication
+ 4. Inline -> shouldn't be included in callchain. */
+
+static cgraph_node *
+clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
+ int &cl_num, lto_locality_cloning_model cm)
+{
+ /* suitable_for_locality_cloning_p () currently prohibits cloning aliases due
+ to potential versioning and materialization issues. Could be enabled in
+ the future. suitable_for_locality_cloning_p () also checks for
+ interposability for CNODE but not for edge redirection. */
+ struct cgraph_node *cnode = edge->callee;
+ struct cgraph_node *caller = edge->caller;
+
+ /* If a clone of cnode is already in the partition:
+ Get the latest clone of cnode. If the current partition has cloned cnode,
+ that clone should be returned. Otherwise, the clone from a previous
+ partition is returned.
+ The original node and its clone shouldn't co-exist in the current
+ partition.
+
+ This is required if the callee was partitioned via another edge before the
+ caller was, and we are now visiting the caller->callee edge.
+
+ 1) a -> b ==> a -> bc1; b was cloned say via d -> bc1, a is orig
+ 2) ac1 -> b ==> ac1 -> bc1; b was cloned and a was just cloned
+ 3) a -> bc1 and bc2 present, mustn't happen, b was cloned and a was
+ redirected without being partitioned first.
+ Why will we do this again - multiple edges and something's wrong in
+ partition_callchain ()
+ 4) ac1 -> bc1 ==> ac1 -> bc2; a was cloned and we already got (1) in some
+ other partition
+ 5) ac1 -> bc1 but no clone present in this PARTITION. Create from b, not
+ from bc1?
+ 6) a -> b; a -> bc0; create new clone, no clone present
+ 7) ac0 -> b; ac0 -> bc0 same as (6)
+ 8) a -> bc0 and no clone present, mustn't happen, same as (3)
+
+ Redirect when bc1 is present and:
+ a -> b or ac -> b or ac -> bc0 */
+
+ cgraph_node *orig_cnode = cnode;
+ cgraph_node **o_cnode = clone_to_node.get (cnode);
+ if (o_cnode)
+ orig_cnode = *o_cnode;
+
+ cgraph_node **cnode_cl = node_to_clone.get (orig_cnode);
+
+ if (cnode_cl && node_in_partition_p (partition, *cnode_cl))
+ {
+ if (node_in_partition_p (partition, caller))
+ {
+ bool clone_p = false;
+ auto_vec<cgraph_edge *> redirected_edges;
+ for (cgraph_edge *ec = caller->callees; ec; ec = ec->next_callee)
+ if (ec->callee == cnode && edge_redirectable_p (ec, cm))
+ {
+ ec->redirect_callee_duplicating_thunks (*cnode_cl);
+ clone_p = true;
+ redirected_edges.safe_push (ec);
+ if (dump_file)
+ {
+ fprintf (dump_file, "clone present %s %s redirecting %s\n",
+ cnode->dump_asm_name (),
+ (*cnode_cl)->dump_asm_name (),
+ caller->dump_asm_name ());
+ }
+ }
+ if (clone_p)
+ {
+ (*cnode_cl)->expand_all_artificial_thunks ();
+ adjust_profile_info_for_non_self_rec_edges (redirected_edges,
+ *cnode_cl, cnode);
+ return NULL;
+ }
+ }
+ }
+
+ /* Create a new clone for a -> b, ac -> b.
+ For ac -> bc, should be done on bc or b?
+ bc could be from b_cp/b_sra or b. */
+
+ if (orig_cnode != cnode)
+ {
+ if (dump_file)
+ fprintf (dump_file, "Clone of clone %s %s\n", cnode->dump_asm_name (),
+ orig_cnode->dump_asm_name ());
+ return NULL;
+ }
+
+ struct cgraph_node *cloned_node
+ = create_locality_clone (cnode, partition, cl_num, cm);
+
+ gcc_assert (cloned_node);
+ if (!cloned_node)
+ return NULL;
+
+ node_to_clone.put (cnode, cloned_node);
+ clone_to_node.put (cloned_node, cnode);
+
+ adjust_recursive_callees (cloned_node, cloned_node, cnode);
+ symtab->call_cgraph_duplication_hooks (cnode, cloned_node);
+
+ adjust_profile_info (cloned_node, cnode);
+ /* Inline clones are created iff their inlined_to == CNODE. */
+ inline_clones (cloned_node, cnode);
+
+ return cloned_node;
+}
+
+/* Accumulate frequency of all edges from EDGE->caller to EDGE->callee. */
+
+static sreal
+accumulate_incoming_edge_frequency (cgraph_edge *edge)
+{
+ sreal count = 0;
+ struct cgraph_edge *e;
+ for (e = edge->callee->callers; e; e = e->next_caller)
+ {
+ /* Make a local decision about all edges for EDGE->caller but not the
+ other nodes already in the partition. Their edges will be visited
+ later or may have been visited before and not fit the
+ cut-off criteria. */
+ if (e->caller == edge->caller)
+ count += e->sreal_frequency ();
+ }
+ return count;
+}
+
+/* Determine if EDGE->CALLEE is suitable for cloning. It is assumed that the
+ callee is not an inlined node. */
+
+static bool
+suitable_for_locality_cloning_p (cgraph_edge *edge,
+ lto_locality_cloning_model cm)
+{
+ cgraph_node *node = edge->callee;
+ if (!node->versionable)
+ return false;
+
+ /* Out-of-line locality clones of ipcp or sra clones will be created in this
+ pass after IPA inline is run. A locality clone has the same function
+ body and the same updated signature as the ipcp/sra clone.
+ This fails or asserts based on how the clone is created:
+ 1. If param_adjustments and tree_map are not recorded for locality clone:
+ clone materialization (tree_function_versioning ()) fails when
+ updating signature and remapping calls because clone_of (ipcp/sra
+ clone) and locality clone differ in param information.
+ 2. If param_adjustments and tree_map are provided: asserts are triggered
+ in fnsummary duplication because IPA inline resets some summaries.
+
+ One inelegant solution is to provide param_adjustments and tree_map, and
+ then set clone_of to ipcp/sra clone's clone_of. However, this sometimes
+ results in segmentation fault when the compiled program is run.
+ Disabling clones of clones altogether for now, with an aim to resolve this
+ in the future. */
+ if (node->clone_of)
+ return false;
+
+ if (node->alias)
+ return false;
+
+ if (edge->recursive_p ())
+ return false;
+
+ if (!node->definition)
+ return false;
+
+ /* Don't clone NODE if IPA count of NODE or EDGE is zero. */
+ if (!node->count.ipa ().nonzero_p () || !edge->count.ipa ().nonzero_p ())
+ return false;
+
+ if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING)
+ {
+ /* Interposability may change on edge basis. */
+ enum availability avail;
+ edge->callee->ultimate_alias_target (&avail, edge->caller);
+ if (avail <= AVAIL_INTERPOSABLE)
+ return false;
+ }
+
+ return true;
+}
+
+/* Map from caller to all callees already visited for partitioning. */
+hash_map<cgraph_node *, auto_vec<cgraph_node *> > caller_to_callees;
+
+/* Partition EDGE->CALLEE into PARTITION, or clone it if it is already
+ partitioned and satisfies the cloning criteria: CLONING_MODEL, the
+ FREQ_CUTOFF and SIZE cut-offs, and CLONE_FURTHER_P set by the previous
+ caller.
+
+ The callgraph can have multiple caller-to-callee edges for multiple
+ callsites. For the first such edge, we make the decisions about cut-offs
+ and cloning, because we redirect ALL callsites to the cloned callee, not
+ just one of them. */
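+
+/* For instance, with a frequency denominator N the cut-off is 1.0 / N: if
+ the accumulated incoming edge frequency from this caller is below that
+ cut-off, or the callee's size summary is at least SIZE, no further clones
+ are created down this callchain, although unpartitioned callees are still
+ partitioned. */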
+
+static void
+partition_callchain (cgraph_edge *edge, locality_partition partition,
+ bool clone_further_p,
+ lto_locality_cloning_model cloning_model,
+ double freq_cutoff, int size, int &cl_num)
+{
+ /* Aliases are added in the same partition as their targets.
+ Aliases are not cloned and their callees are not processed separately. */
+ cgraph_node *node = edge->callee->ultimate_alias_target ();
+ cgraph_node *caller = edge->caller;
+ cgraph_node *caller_node = node, *cl_node = NULL;
+
+ /* Already visited the caller to callee edges. */
+ auto_vec<cgraph_node *> &callees = caller_to_callees.get_or_insert (caller);
+ if (std::find (callees.begin (), callees.end (), node) != callees.end ())
+ return;
+
+ callees.safe_push (node);
+
+ if (node->get_partitioning_class () == SYMBOL_PARTITION)
+ {
+ if (!node_partitioned_p (node))
+ {
+ add_node_to_partition (partition, node);
+ if (dump_file)
+ fprintf (dump_file, "Partitioned node: %s\n",
+ node->dump_asm_name ());
+ }
+ else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
+ && !node_in_partition_p (partition, node))
+ {
+ /* Non-inlined node or alias that is already partitioned.
+ If the cut-offs are hit, don't clone callees, but still partition
+ unpartitioned callees.
+ SIZE is the size of the node plus its inlined nodes. */
+ if (clone_further_p)
+ {
+ if (!node->alias)
+ if (ipa_size_summaries->get (node)->size >= size)
+ clone_further_p = false;
+
+ if (freq_cutoff != 0.0)
+ {
+ sreal acc_freq = accumulate_incoming_edge_frequency (edge);
+ if (acc_freq.to_double () < freq_cutoff)
+ clone_further_p = false;
+ }
+ }
+
+ if (!suitable_for_locality_cloning_p (edge, cloning_model))
+ clone_further_p = false;
+
+ if (clone_further_p)
+ {
+ /* Try to clone NODE and its inline chain. */
+ if (dump_file)
+ fprintf (dump_file, "Cloning node: %s\n",
+ node->dump_asm_name ());
+ cl_node = clone_node_as_needed (edge, partition, cl_num,
+ cloning_model);
+ if (cl_node)
+ {
+ add_node_to_partition (partition, cl_node);
+ caller_node = cl_node;
+ }
+ else
+ caller_node = NULL;
+ }
+ }
+ }
+ else if (!node->inlined_to)
+ return;
+
+ if (caller_node)
+ for (cgraph_edge *e = caller_node->callees; e; e = e->next_callee)
+ partition_callchain (e, partition, clone_further_p, cloning_model,
+ freq_cutoff, size, cl_num);
+}
+
+/* Determine whether NODE is an entrypoint to a callchain. */
+
+static bool
+is_entry_node_p (cgraph_node *node)
+{
+ /* A node with node->inlined_to set is classified as SYMBOL_DUPLICATE. */
+ if (node->get_partitioning_class () != SYMBOL_PARTITION)
+ return false;
+
+ if (!node->callers)
+ return true;
+
+ for (cgraph_edge *e = node->callers; e; e = e->next_caller)
+ {
+ if (! e->recursive_p ())
+ return false;
+ }
+ if (node->alias
+ && !is_entry_node_p (node->ultimate_alias_target ()))
+ return false;
+ return true;
+}
+
+/* Determine order of all external nodes if PGO profile is available.
+ Store the order in ORDER. */
+
+static bool
+locality_determine_ipa_order (auto_vec<locality_order *> *order)
+{
+ struct cgraph_node *node;
+ auto_vec<locality_order *> non_comparable_nodes;
+ FOR_EACH_DEFINED_FUNCTION (node)
+ if (node->get_partitioning_class () == SYMBOL_PARTITION)
+ {
+ if (node->no_reorder)
+ {
+ if (dump_file)
+ fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
+ return false;
+ }
+ else if (is_entry_node_p (node))
+ {
+ profile_count pcnt = node->count.ipa ();
+ if (!pcnt.initialized_p () || !pcnt.ipa_p ())
+ {
+ sreal cnt = 0;
+ locality_order *lo = new locality_order (node, cnt);
+ non_comparable_nodes.safe_push (lo);
+ continue;
+ }
+ sreal count = 0;
+ struct cgraph_edge *edge;
+ for (edge = node->callees; edge; edge = edge->next_callee)
+ {
+ /* For PGO, frequency is not used in
+ compare_edge_profile_counts (); it's used only as part of the
+ static profile order to break ties. */
+ sreal freq = edge->sreal_frequency ();
+ count += freq;
+ }
+ locality_order *cl = new locality_order (node, count);
+ order->safe_push (cl);
+ }
+ }
+ order->qsort (compare_edge_profile_counts);
+ for (auto el : non_comparable_nodes)
+ order->safe_push (el);
+ return true;
+}
+
+/* Determine order of all external nodes if only static profile is available.
+ Store the order in ORDER. */
+
+static bool
+locality_determine_static_order (auto_vec<locality_order *> *order)
+{
+ struct cgraph_node *node;
+ FOR_EACH_DEFINED_FUNCTION (node)
+ if (node->get_partitioning_class () == SYMBOL_PARTITION)
+ {
+ if (node->no_reorder)
+ {
+ if (dump_file)
+ fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
+ return false;
+ }
+ else if (is_entry_node_p (node))
+ {
+ sreal count = 0;
+ struct cgraph_edge *edge;
+ for (edge = node->callees; edge; edge = edge->next_callee)
+ {
+ sreal freq = edge->sreal_frequency ();
+ count += freq;
+ }
+ locality_order *cl = new locality_order (node, count);
+ order->safe_push (cl);
+ }
+ }
+ order->qsort (static_profile_cmp);
+ return true;
+}
+
+/* Partitioning for code locality.
+ 1. Create and sort callchains. If PGO is available, use real profile
+ counts. Otherwise, use a set of heuristics to sort the callchains.
+ 2. Partition the external nodes and their callchains in the determined
+ order.
+ 2.1. If a node is not yet partitioned, partition it; otherwise try to clone
+ it if it satisfies the cloning criteria.
+ 3. Partition all other unpartitioned nodes. */
+
+static void
+locality_partition_and_clone (int max_locality_partition_size,
+ lto_locality_cloning_model cloning_model,
+ int freq_denominator, int size)
+{
+ locality_partition partition;
+ int npartitions = 0;
+
+ auto_vec<locality_order *> order;
+ auto_vec<varpool_node *> varpool_order;
+ struct cgraph_node *node;
+ bool order_p;
+
+ int cl_num = 0;
+
+ double real_freq = 0.0;
+ if (freq_denominator > 0)
+ real_freq = 1.0 / (double) freq_denominator;
+
+ cgraph_node *n = symtab->first_defined_function ();
+ if (n && n->count.ipa_p ())
+ order_p = locality_determine_ipa_order (&order);
+ else
+ order_p = locality_determine_static_order (&order);
+ if (!order_p)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Locality partition: falling back to balanced "
+ "model\n");
+ }
+
+ return;
+ }
+
+ int64_t partition_size
+ = max_locality_partition_size
+ ? max_locality_partition_size : param_max_partition_size;
+ partition = create_partition (npartitions);
+
+ for (unsigned i = 0; i < order.length (); i++)
+ {
+ node = order[i]->node;
+ if (node_partitioned_p (node))
+ continue;
+
+ if (partition->insns > partition_size)
+ partition = create_partition (npartitions);
+ if (dump_file)
+ fprintf (dump_file, "Partition id: %d\n", partition->part_id);
+
+ add_node_to_partition (partition, node);
+ if (dump_file)
+ fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ());
+
+ for (cgraph_edge *edge = node->callees; edge; edge = edge->next_callee)
+ {
+ /* Recursively partition the callchain of edge->callee. */
+ partition_callchain (edge, partition, true, cloning_model, real_freq,
+ size, cl_num);
+ }
+ }
+
+ for (unsigned i = 0; i < order.length (); i++)
+ delete order[i];
+ order = vNULL;
+}
+
+/* Entry point to locality-clone pass. */
+static int
+lc_execute (void)
+{
+ symtab_node *node;
+ FOR_EACH_SYMBOL (node)
+ node->aux = NULL;
+
+ locality_partition_and_clone (param_max_locality_partition_size,
+ flag_lto_locality_cloning,
+ param_lto_locality_frequency,
+ param_lto_locality_size);
+
+ FOR_EACH_SYMBOL (node)
+ node->aux = NULL;
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_ipa_locality_clone = {
+ IPA_PASS, /* type */
+ "locality-clone", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_IPA_LC, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ (TODO_dump_symtab | TODO_remove_functions), /* todo_flags_finish */
+};
+
+class pass_ipa_locality_cloning : public ipa_opt_pass_d
+{
+public:
+ pass_ipa_locality_cloning (gcc::context *ctxt)
+ : ipa_opt_pass_d (pass_data_ipa_locality_clone, ctxt,
+ NULL, /* generate_summary */
+ NULL, /* write_summary */
+ NULL, /* read_summary */
+ NULL, /* write_optimization_summary */
+ NULL, /* read_optimization_summary */
+ NULL, /* stmt_fixup */
+ 0, /* function_transform_todo_flags_start */
+ NULL, /* function_transform */
+ NULL) /* variable_transform */
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return (flag_wpa && flag_ipa_reorder_for_locality);
+ }
+
+ virtual unsigned int execute (function *) { return lc_execute (); }
+
+}; // class pass_ipa_locality_cloning
+
+} // namespace
+
+ipa_opt_pass_d *
+make_pass_ipa_locality_cloning (gcc::context *ctxt)
+{
+ return new pass_ipa_locality_cloning (ctxt);
+}