Diffstat (limited to 'gcc/tree-loop-distribution.cc')
-rw-r--r-- | gcc/tree-loop-distribution.cc | 3911
1 file changed, 3911 insertions, 0 deletions
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
new file mode 100644
index 0000000..6fe59cd
--- /dev/null
+++ b/gcc/tree-loop-distribution.cc
@@ -0,0 +1,3911 @@
+/* Loop distribution.
+   Copyright (C) 2006-2022 Free Software Foundation, Inc.
+   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
+   and Sebastian Pop <sebastian.pop@amd.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* This pass performs loop distribution: for example, the loop
+
+   |DO I = 2, N
+   |  A(I) = B(I) + C
+   |  D(I) = A(I-1)*E
+   |ENDDO
+
+   is transformed to
+
+   |DOALL I = 2, N
+   |  A(I) = B(I) + C
+   |ENDDO
+   |
+   |DOALL I = 2, N
+   |  D(I) = A(I-1)*E
+   |ENDDO
+
+   Loop distribution is the dual of loop fusion.  It separates the
+   statements of a loop (or loop nest) into multiple loops (or loop nests)
+   with the same loop header.  The major goal is to separate statements
+   which may be vectorized from those that can't.  This pass implements
+   distribution in the following steps:
+
+     1) Seed partitions with statements of specific types.  For now we
+        support two types of seed statements: statements defining a
+        variable used outside of the loop, and statements storing to
+        memory.
+     2) Build the reduced dependence graph (RDG) for the loop to be
+        distributed.  The vertices (RDG:V) model all statements in the
+        loop and the edges (RDG:E) model flow and control dependencies
+        between statements.
+     3) Apart from the RDG, compute data dependencies between memory
+        references.
+     4) Starting from a seed statement, build up a partition by adding
+        dependent statements according to the RDG's dependence information.
+        A partition is classified as parallel if it can be executed in
+        parallel, or as sequential if it can't.  A parallel partition is
+        further classified into different builtin kinds if it can be
+        implemented as builtin function calls.
+     5) Build the partition dependence graph (PG) based on data
+        dependencies.  The vertices (PG:V) model all partitions and the
+        edges (PG:E) model all data dependencies between every pair of
+        partitions.  In general, a data dependence is either known or
+        unknown at compilation time.  In C family languages, there are
+        quite a few compile-time unknown dependencies because of possible
+        aliasing between data references.  We categorize PG edges into two
+        types: "true" edges, which represent compile-time known data
+        dependencies, and "alias" edges for all other data dependencies.
+     6) Traverse the subgraph of PG as if no "alias" edges existed.  Merge
+        the partitions in each strongly connected component (SCC)
+        correspondingly.  Build a new PG for the merged partitions.
+     7) Traverse the PG again, this time with both "true" and "alias"
+        edges included.  We try to break SCCs by removing some edges.
+        Because SCCs formed by "true" edges were all fused in step 6), we
+        can break SCCs by removing some "alias" edges.  It's NP-hard to
+        choose an optimal edge set; fortunately, a simple approximation is
+        good enough for us given the small problem scale.
+ 8) Collect all data dependencies of the removed "alias" edges. Create + runtime alias checks for collected data dependencies. + 9) Version loop under the condition of runtime alias checks. Given + loop distribution generally introduces additional overhead, it is + only useful if vectorization is achieved in distributed loop. We + version loop with internal function call IFN_LOOP_DIST_ALIAS. If + no distributed loop can be vectorized, we simply remove distributed + loops and recover to the original one. + + TODO: + 1) We only distribute innermost two-level loop nest now. We should + extend it for arbitrary loop nests in the future. + 2) We only fuse partitions in SCC now. A better fusion algorithm is + desired to minimize loop overhead, maximize parallelism and maximize + data reuse. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "cfghooks.h" +#include "tree-pass.h" +#include "ssa.h" +#include "gimple-pretty-print.h" +#include "fold-const.h" +#include "cfganal.h" +#include "gimple-iterator.h" +#include "gimplify-me.h" +#include "stor-layout.h" +#include "tree-cfg.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-ivopts.h" +#include "tree-ssa-loop.h" +#include "tree-into-ssa.h" +#include "tree-ssa.h" +#include "cfgloop.h" +#include "tree-scalar-evolution.h" +#include "tree-vectorizer.h" +#include "tree-eh.h" +#include "gimple-fold.h" +#include "tree-affine.h" +#include "intl.h" +#include "rtl.h" +#include "memmodel.h" +#include "optabs.h" + + +#define MAX_DATAREFS_NUM \ + ((unsigned) param_loop_max_datarefs_for_datadeps) + +/* Threshold controlling number of distributed partitions. Given it may + be unnecessary if a memory stream cost model is invented in the future, + we define it as a temporary macro, rather than a parameter. */ +#define NUM_PARTITION_THRESHOLD (4) + +/* Hashtable helpers. */ + +struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation> +{ + static inline hashval_t hash (const data_dependence_relation *); + static inline bool equal (const data_dependence_relation *, + const data_dependence_relation *); +}; + +/* Hash function for data dependence. */ + +inline hashval_t +ddr_hasher::hash (const data_dependence_relation *ddr) +{ + inchash::hash h; + h.add_ptr (DDR_A (ddr)); + h.add_ptr (DDR_B (ddr)); + return h.end (); +} + +/* Hash table equality function for data dependence. */ + +inline bool +ddr_hasher::equal (const data_dependence_relation *ddr1, + const data_dependence_relation *ddr2) +{ + return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2)); +} + + + +#define DR_INDEX(dr) ((uintptr_t) (dr)->aux) + +/* A Reduced Dependence Graph (RDG) vertex representing a statement. */ +struct rdg_vertex +{ + /* The statement represented by this vertex. */ + gimple *stmt; + + /* Vector of data-references in this statement. */ + vec<data_reference_p> datarefs; + + /* True when the statement contains a write to memory. */ + bool has_mem_write; + + /* True when the statement contains a read from memory. 
*/ + bool has_mem_reads; +}; + +#define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt +#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs +#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write +#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads +#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I])) +#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I])) +#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) +#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) + +/* Data dependence type. */ + +enum rdg_dep_type +{ + /* Read After Write (RAW). */ + flow_dd = 'f', + + /* Control dependence (execute conditional on). */ + control_dd = 'c' +}; + +/* Dependence information attached to an edge of the RDG. */ + +struct rdg_edge +{ + /* Type of the dependence. */ + enum rdg_dep_type type; +}; + +#define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type + +/* Kind of distributed loop. */ +enum partition_kind { + PKIND_NORMAL, + /* Partial memset stands for a paritition can be distributed into a loop + of memset calls, rather than a single memset call. It's handled just + like a normal parition, i.e, distributed as separate loop, no memset + call is generated. + + Note: This is a hacking fix trying to distribute ZERO-ing stmt in a + loop nest as deep as possible. As a result, parloop achieves better + parallelization by parallelizing deeper loop nest. This hack should + be unnecessary and removed once distributed memset can be understood + and analyzed in data reference analysis. See PR82604 for more. */ + PKIND_PARTIAL_MEMSET, + PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE +}; + +/* Type of distributed loop. */ +enum partition_type { + /* The distributed loop can be executed parallelly. */ + PTYPE_PARALLEL = 0, + /* The distributed loop has to be executed sequentially. */ + PTYPE_SEQUENTIAL +}; + +/* Builtin info for loop distribution. */ +struct builtin_info +{ + /* data-references a kind != PKIND_NORMAL partition is about. */ + data_reference_p dst_dr; + data_reference_p src_dr; + /* Base address and size of memory objects operated by the builtin. Note + both dest and source memory objects must have the same size. */ + tree dst_base; + tree src_base; + tree size; + /* Base and offset part of dst_base after stripping constant offset. This + is only used in memset builtin distribution for now. */ + tree dst_base_base; + unsigned HOST_WIDE_INT dst_base_offset; +}; + +/* Partition for loop distribution. */ +struct partition +{ + /* Statements of the partition. */ + bitmap stmts; + /* True if the partition defines variable which is used outside of loop. */ + bool reduction_p; + location_t loc; + enum partition_kind kind; + enum partition_type type; + /* Data references in the partition. */ + bitmap datarefs; + /* Information of builtin parition. */ + struct builtin_info *builtin; +}; + +/* Partitions are fused because of different reasons. */ +enum fuse_type +{ + FUSE_NON_BUILTIN = 0, + FUSE_REDUCTION = 1, + FUSE_SHARE_REF = 2, + FUSE_SAME_SCC = 3, + FUSE_FINALIZE = 4 +}; + +/* Description on different fusing reason. */ +static const char *fuse_message[] = { + "they are non-builtins", + "they have reductions", + "they have shared memory refs", + "they are in the same dependence scc", + "there is no point to distribute loop"}; + + +/* Dump vertex I in RDG to FILE. 
*/ + +static void +dump_rdg_vertex (FILE *file, struct graph *rdg, int i) +{ + struct vertex *v = &(rdg->vertices[i]); + struct graph_edge *e; + + fprintf (file, "(vertex %d: (%s%s) (in:", i, + RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "", + RDG_MEM_READS_STMT (rdg, i) ? "r" : ""); + + if (v->pred) + for (e = v->pred; e; e = e->pred_next) + fprintf (file, " %d", e->src); + + fprintf (file, ") (out:"); + + if (v->succ) + for (e = v->succ; e; e = e->succ_next) + fprintf (file, " %d", e->dest); + + fprintf (file, ")\n"); + print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); + fprintf (file, ")\n"); +} + +/* Call dump_rdg_vertex on stderr. */ + +DEBUG_FUNCTION void +debug_rdg_vertex (struct graph *rdg, int i) +{ + dump_rdg_vertex (stderr, rdg, i); +} + +/* Dump the reduced dependence graph RDG to FILE. */ + +static void +dump_rdg (FILE *file, struct graph *rdg) +{ + fprintf (file, "(rdg\n"); + for (int i = 0; i < rdg->n_vertices; i++) + dump_rdg_vertex (file, rdg, i); + fprintf (file, ")\n"); +} + +/* Call dump_rdg on stderr. */ + +DEBUG_FUNCTION void +debug_rdg (struct graph *rdg) +{ + dump_rdg (stderr, rdg); +} + +static void +dot_rdg_1 (FILE *file, struct graph *rdg) +{ + int i; + pretty_printer buffer; + pp_needs_newline (&buffer) = false; + buffer.buffer->stream = file; + + fprintf (file, "digraph RDG {\n"); + + for (i = 0; i < rdg->n_vertices; i++) + { + struct vertex *v = &(rdg->vertices[i]); + struct graph_edge *e; + + fprintf (file, "%d [label=\"[%d] ", i, i); + pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM); + pp_flush (&buffer); + fprintf (file, "\"]\n"); + + /* Highlight reads from memory. */ + if (RDG_MEM_READS_STMT (rdg, i)) + fprintf (file, "%d [style=filled, fillcolor=green]\n", i); + + /* Highlight stores to memory. */ + if (RDG_MEM_WRITE_STMT (rdg, i)) + fprintf (file, "%d [style=filled, fillcolor=red]\n", i); + + if (v->succ) + for (e = v->succ; e; e = e->succ_next) + switch (RDGE_TYPE (e)) + { + case flow_dd: + /* These are the most common dependences: don't print these. */ + fprintf (file, "%d -> %d \n", i, e->dest); + break; + + case control_dd: + fprintf (file, "%d -> %d [label=control] \n", i, e->dest); + break; + + default: + gcc_unreachable (); + } + } + + fprintf (file, "}\n\n"); +} + +/* Display the Reduced Dependence Graph using dotty. */ + +DEBUG_FUNCTION void +dot_rdg (struct graph *rdg) +{ + /* When debugging, you may want to enable the following code. */ +#ifdef HAVE_POPEN + FILE *file = popen ("dot -Tx11", "w"); + if (!file) + return; + dot_rdg_1 (file, rdg); + fflush (file); + close (fileno (file)); + pclose (file); +#else + dot_rdg_1 (stderr, rdg); +#endif +} + +/* Returns the index of STMT in RDG. */ + +static int +rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt) +{ + int index = gimple_uid (stmt); + gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt); + return index; +} + +/* Creates dependence edges in RDG for all the uses of DEF. IDEF is + the index of DEF in RDG. */ + +static void +create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) +{ + use_operand_p imm_use_p; + imm_use_iterator iterator; + + FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) + { + struct graph_edge *e; + int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); + + if (use < 0) + continue; + + e = add_edge (rdg, idef, use); + e->data = XNEW (struct rdg_edge); + RDGE_TYPE (e) = flow_dd; + } +} + +/* Creates an edge for the control dependences of BB to the vertex V. 
*/ + +static void +create_edge_for_control_dependence (struct graph *rdg, basic_block bb, + int v, control_dependences *cd) +{ + bitmap_iterator bi; + unsigned edge_n; + EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index), + 0, edge_n, bi) + { + basic_block cond_bb = cd->get_edge_src (edge_n); + gimple *stmt = last_stmt (cond_bb); + if (stmt && is_ctrl_stmt (stmt)) + { + struct graph_edge *e; + int c = rdg_vertex_for_stmt (rdg, stmt); + if (c < 0) + continue; + + e = add_edge (rdg, c, v); + e->data = XNEW (struct rdg_edge); + RDGE_TYPE (e) = control_dd; + } + } +} + +/* Creates the edges of the reduced dependence graph RDG. */ + +static void +create_rdg_flow_edges (struct graph *rdg) +{ + int i; + def_operand_p def_p; + ssa_op_iter iter; + + for (i = 0; i < rdg->n_vertices; i++) + FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i), + iter, SSA_OP_DEF) + create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i); +} + +/* Creates the edges of the reduced dependence graph RDG. */ + +static void +create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop) +{ + int i; + + for (i = 0; i < rdg->n_vertices; i++) + { + gimple *stmt = RDG_STMT (rdg, i); + if (gimple_code (stmt) == GIMPLE_PHI) + { + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds) + if (flow_bb_inside_loop_p (loop, e->src)) + create_edge_for_control_dependence (rdg, e->src, i, cd); + } + else + create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd); + } +} + + +class loop_distribution +{ + private: + /* The loop (nest) to be distributed. */ + vec<loop_p> loop_nest; + + /* Vector of data references in the loop to be distributed. */ + vec<data_reference_p> datarefs_vec; + + /* If there is nonaddressable data reference in above vector. */ + bool has_nonaddressable_dataref_p; + + /* Store index of data reference in aux field. */ + + /* Hash table for data dependence relation in the loop to be distributed. */ + hash_table<ddr_hasher> *ddrs_table; + + /* Array mapping basic block's index to its topological order. */ + int *bb_top_order_index; + /* And size of the array. */ + int bb_top_order_index_size; + + /* Build the vertices of the reduced dependence graph RDG. Return false + if that failed. */ + bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts, + loop_p loop); + + /* Initialize STMTS with all the statements of LOOP. We use topological + order to discover all statements. The order is important because + generate_loops_for_partition is using the same traversal for identifying + statements in loop copies. */ + void stmts_from_loop (class loop *loop, vec<gimple *> *stmts); + + + /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of + LOOP, and one edge per flow dependence or control dependence from control + dependence CD. During visiting each statement, data references are also + collected and recorded in global data DATAREFS_VEC. */ + struct graph * build_rdg (class loop *loop, control_dependences *cd); + +/* Merge PARTITION into the partition DEST. RDG is the reduced dependence + graph and we update type for result partition if it is non-NULL. */ + void partition_merge_into (struct graph *rdg, + partition *dest, partition *partition, + enum fuse_type ft); + + + /* Return data dependence relation for data references A and B. The two + data references must be in lexicographic order wrto reduced dependence + graph RDG. We firstly try to find ddr from global ddr hash table. 
If + it doesn't exist, compute the ddr and cache it. */ + data_dependence_relation * get_data_dependence (struct graph *rdg, + data_reference_p a, + data_reference_p b); + + + /* In reduced dependence graph RDG for loop distribution, return true if + dependence between references DR1 and DR2 leads to a dependence cycle + and such dependence cycle can't be resolved by runtime alias check. */ + bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1, + data_reference_p dr2); + + + /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update + PARTITION1's type after merging PARTITION2 into PARTITION1. */ + void update_type_for_merge (struct graph *rdg, + partition *partition1, partition *partition2); + + + /* Returns a partition with all the statements needed for computing + the vertex V of the RDG, also including the loop exit conditions. */ + partition *build_rdg_partition_for_vertex (struct graph *rdg, int v); + + /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify + if it forms builtin memcpy or memmove call. */ + void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, + data_reference_p dst_dr, data_reference_p src_dr); + + /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. + For the moment we detect memset, memcpy and memmove patterns. Bitmap + STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. + Returns true if there is a reduction in all partitions and we + possibly did not mark PARTITION as having one for this reason. */ + + bool + classify_partition (loop_p loop, + struct graph *rdg, partition *partition, + bitmap stmt_in_all_partitions); + + + /* Returns true when PARTITION1 and PARTITION2 access the same memory + object in RDG. */ + bool share_memory_accesses (struct graph *rdg, + partition *partition1, partition *partition2); + + /* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. + All partitions are recorded in PARTITIONS. */ + void rdg_build_partitions (struct graph *rdg, + vec<gimple *> starting_stmts, + vec<partition *> *partitions); + + /* Compute partition dependence created by the data references in DRS1 + and DRS2, modify and return DIR according to that. IF ALIAS_DDR is + not NULL, we record dependence introduced by possible alias between + two data references in ALIAS_DDRS; otherwise, we simply ignore such + dependence as if it doesn't exist at all. */ + int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1, + bitmap drs2, vec<ddr_p> *alias_ddrs); + + + /* Build and return partition dependence graph for PARTITIONS. RDG is + reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P + is true, data dependence caused by possible alias between references + is ignored, as if it doesn't exist at all; otherwise all depdendences + are considered. */ + struct graph *build_partition_graph (struct graph *rdg, + vec<struct partition *> *partitions, + bool ignore_alias_p); + + /* Given reduced dependence graph RDG merge strong connected components + of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by + possible alias between references is ignored, as if it doesn't exist + at all; otherwise all depdendences are considered. 
*/ + void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *> + *partitions, bool ignore_alias_p); + +/* This is the main function breaking strong conected components in + PARTITIONS giving reduced depdendence graph RDG. Store data dependence + relations for runtime alias check in ALIAS_DDRS. */ + void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *> + *partitions, vec<ddr_p> *alias_ddrs); + + + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ + void finalize_partitions (class loop *loop, vec<struct partition *> + *partitions, vec<ddr_p> *alias_ddrs); + + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of + distributed loops. Set NB_CALLS to number of generated builtin calls. + Set *DESTROY_P to whether LOOP needs to be destroyed. */ + int distribute_loop (class loop *loop, const vec<gimple *> &stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p); + + /* Transform loops which mimic the effects of builtins rawmemchr or strlen and + replace them accordingly. */ + bool transform_reduction_loop (loop_p loop); + + /* Compute topological order for basic blocks. Topological order is + needed because data dependence is computed for data references in + lexicographical order. */ + void bb_top_order_init (void); + + void bb_top_order_destroy (void); + + public: + + /* Getter for bb_top_order. */ + + inline int get_bb_top_order_index_size (void) + { + return bb_top_order_index_size; + } + + inline int get_bb_top_order_index (int i) + { + return bb_top_order_index[i]; + } + + unsigned int execute (function *fun); +}; + + +/* If X has a smaller topological sort number than Y, returns -1; + if greater, returns 1. */ +static int +bb_top_order_cmp_r (const void *x, const void *y, void *loop) +{ + loop_distribution *_loop = + (loop_distribution *) loop; + + basic_block bb1 = *(const basic_block *) x; + basic_block bb2 = *(const basic_block *) y; + + int bb_top_order_index_size = _loop->get_bb_top_order_index_size (); + + gcc_assert (bb1->index < bb_top_order_index_size + && bb2->index < bb_top_order_index_size); + gcc_assert (bb1 == bb2 + || _loop->get_bb_top_order_index(bb1->index) + != _loop->get_bb_top_order_index(bb2->index)); + + return (_loop->get_bb_top_order_index(bb1->index) - + _loop->get_bb_top_order_index(bb2->index)); +} + +bool +loop_distribution::create_rdg_vertices (struct graph *rdg, + const vec<gimple *> &stmts, + loop_p loop) +{ + int i; + gimple *stmt; + + FOR_EACH_VEC_ELT (stmts, i, stmt) + { + struct vertex *v = &(rdg->vertices[i]); + + /* Record statement to vertex mapping. 
*/ + gimple_set_uid (stmt, i); + + v->data = XNEW (struct rdg_vertex); + RDGV_STMT (v) = stmt; + RDGV_DATAREFS (v).create (0); + RDGV_HAS_MEM_WRITE (v) = false; + RDGV_HAS_MEM_READS (v) = false; + if (gimple_code (stmt) == GIMPLE_PHI) + continue; + + unsigned drp = datarefs_vec.length (); + if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec)) + return false; + for (unsigned j = drp; j < datarefs_vec.length (); ++j) + { + data_reference_p dr = datarefs_vec[j]; + if (DR_IS_READ (dr)) + RDGV_HAS_MEM_READS (v) = true; + else + RDGV_HAS_MEM_WRITE (v) = true; + RDGV_DATAREFS (v).safe_push (dr); + has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref); + } + } + return true; +} + +void +loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts) +{ + unsigned int i; + basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); + gsi_next (&bsi)) + if (!virtual_operand_p (gimple_phi_result (bsi.phi ()))) + stmts->safe_push (bsi.phi ()); + + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); + gsi_next (&bsi)) + { + gimple *stmt = gsi_stmt (bsi); + if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt)) + stmts->safe_push (stmt); + } + } + + free (bbs); +} + +/* Free the reduced dependence graph RDG. */ + +static void +free_rdg (struct graph *rdg) +{ + int i; + + for (i = 0; i < rdg->n_vertices; i++) + { + struct vertex *v = &(rdg->vertices[i]); + struct graph_edge *e; + + for (e = v->succ; e; e = e->succ_next) + free (e->data); + + if (v->data) + { + gimple_set_uid (RDGV_STMT (v), -1); + (RDGV_DATAREFS (v)).release (); + free (v->data); + } + } + + free_graph (rdg); +} + +struct graph * +loop_distribution::build_rdg (class loop *loop, control_dependences *cd) +{ + struct graph *rdg; + + /* Create the RDG vertices from the stmts of the loop nest. */ + auto_vec<gimple *, 10> stmts; + stmts_from_loop (loop, &stmts); + rdg = new_graph (stmts.length ()); + if (!create_rdg_vertices (rdg, stmts, loop)) + { + free_rdg (rdg); + return NULL; + } + stmts.release (); + + create_rdg_flow_edges (rdg); + if (cd) + create_rdg_cd_edges (rdg, cd, loop); + + return rdg; +} + + +/* Allocate and initialize a partition from BITMAP. */ + +static partition * +partition_alloc (void) +{ + partition *partition = XCNEW (struct partition); + partition->stmts = BITMAP_ALLOC (NULL); + partition->reduction_p = false; + partition->loc = UNKNOWN_LOCATION; + partition->kind = PKIND_NORMAL; + partition->type = PTYPE_PARALLEL; + partition->datarefs = BITMAP_ALLOC (NULL); + return partition; +} + +/* Free PARTITION. */ + +static void +partition_free (partition *partition) +{ + BITMAP_FREE (partition->stmts); + BITMAP_FREE (partition->datarefs); + if (partition->builtin) + free (partition->builtin); + + free (partition); +} + +/* Returns true if the partition can be generated as a builtin. */ + +static bool +partition_builtin_p (partition *partition) +{ + return partition->kind > PKIND_PARTIAL_MEMSET; +} + +/* Returns true if the partition contains a reduction. 
*/ + +static bool +partition_reduction_p (partition *partition) +{ + return partition->reduction_p; +} + +void +loop_distribution::partition_merge_into (struct graph *rdg, + partition *dest, partition *partition, enum fuse_type ft) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]); + fprintf (dump_file, " Part 1: "); + dump_bitmap (dump_file, dest->stmts); + fprintf (dump_file, " Part 2: "); + dump_bitmap (dump_file, partition->stmts); + } + + dest->kind = PKIND_NORMAL; + if (dest->type == PTYPE_PARALLEL) + dest->type = partition->type; + + bitmap_ior_into (dest->stmts, partition->stmts); + if (partition_reduction_p (partition)) + dest->reduction_p = true; + + /* Further check if any data dependence prevents us from executing the + new partition parallelly. */ + if (dest->type == PTYPE_PARALLEL && rdg != NULL) + update_type_for_merge (rdg, dest, partition); + + bitmap_ior_into (dest->datarefs, partition->datarefs); +} + + +/* Returns true when DEF is an SSA_NAME defined in LOOP and used after + the LOOP. */ + +static bool +ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) +{ + imm_use_iterator imm_iter; + use_operand_p use_p; + + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) + { + if (is_gimple_debug (USE_STMT (use_p))) + continue; + + basic_block use_bb = gimple_bb (USE_STMT (use_p)); + if (!flow_bb_inside_loop_p (loop, use_bb)) + return true; + } + + return false; +} + +/* Returns true when STMT defines a scalar variable used after the + loop LOOP. */ + +static bool +stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) +{ + def_operand_p def_p; + ssa_op_iter op_iter; + + if (gimple_code (stmt) == GIMPLE_PHI) + return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop); + + FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) + if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop)) + return true; + + return false; +} + +/* Return a copy of LOOP placed before LOOP. */ + +static class loop * +copy_loop_before (class loop *loop) +{ + class loop *res; + edge preheader = loop_preheader_edge (loop); + + initialize_original_copy_tables (); + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); + gcc_assert (res != NULL); + free_original_copy_tables (); + delete_update_ssa (); + + return res; +} + +/* Creates an empty basic block after LOOP. */ + +static void +create_bb_after_loop (class loop *loop) +{ + edge exit = single_exit (loop); + + if (!exit) + return; + + split_edge (exit); +} + +/* Generate code for PARTITION from the code in LOOP. The loop is + copied when COPY_P is true. All the statements not flagged in the + PARTITION bitmap are removed from the loop or from its copy. The + statements are indexed in sequence inside a basic block, and the + basic blocks of a loop are taken in dom order. */ + +static void +generate_loops_for_partition (class loop *loop, partition *partition, + bool copy_p) +{ + unsigned i; + basic_block *bbs; + + if (copy_p) + { + int orig_loop_num = loop->orig_loop_num; + loop = copy_loop_before (loop); + gcc_assert (loop != NULL); + loop->orig_loop_num = orig_loop_num; + create_preheader (loop, CP_SIMPLE_PREHEADERS); + create_bb_after_loop (loop); + } + else + { + /* Origin number is set to the new versioned loop's num. */ + gcc_assert (loop->orig_loop_num != loop->num); + } + + /* Remove stmts not in the PARTITION bitmap. 
*/ + bbs = get_loop_body_in_dom_order (loop); + + if (MAY_HAVE_DEBUG_BIND_STMTS) + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); + gsi_next (&bsi)) + { + gphi *phi = bsi.phi (); + if (!virtual_operand_p (gimple_phi_result (phi)) + && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) + reset_debug_uses (phi); + } + + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) + { + gimple *stmt = gsi_stmt (bsi); + if (gimple_code (stmt) != GIMPLE_LABEL + && !is_gimple_debug (stmt) + && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) + reset_debug_uses (stmt); + } + } + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + edge inner_exit = NULL; + + if (loop != bb->loop_father) + inner_exit = single_exit (bb->loop_father); + + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);) + { + gphi *phi = bsi.phi (); + if (!virtual_operand_p (gimple_phi_result (phi)) + && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) + remove_phi_node (&bsi, true); + else + gsi_next (&bsi); + } + + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);) + { + gimple *stmt = gsi_stmt (bsi); + if (gimple_code (stmt) != GIMPLE_LABEL + && !is_gimple_debug (stmt) + && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) + { + /* In distribution of loop nest, if bb is inner loop's exit_bb, + we choose its exit edge/path in order to avoid generating + infinite loop. For all other cases, we choose an arbitrary + path through the empty CFG part that this unnecessary + control stmt controls. */ + if (gcond *cond_stmt = dyn_cast <gcond *> (stmt)) + { + if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE) + gimple_cond_make_true (cond_stmt); + else + gimple_cond_make_false (cond_stmt); + update_stmt (stmt); + } + else if (gimple_code (stmt) == GIMPLE_SWITCH) + { + gswitch *switch_stmt = as_a <gswitch *> (stmt); + gimple_switch_set_index + (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1))); + update_stmt (stmt); + } + else + { + unlink_stmt_vdef (stmt); + gsi_remove (&bsi, true); + release_defs (stmt); + continue; + } + } + gsi_next (&bsi); + } + } + + free (bbs); +} + +/* If VAL memory representation contains the same value in all bytes, + return that value, otherwise return -1. + E.g. for 0x24242424 return 0x24, for IEEE double + 747708026454360457216.0 return 0x44, etc. */ + +static int +const_with_all_bytes_same (tree val) +{ + unsigned char buf[64]; + int i, len; + + if (integer_zerop (val) + || (TREE_CODE (val) == CONSTRUCTOR + && !TREE_CLOBBER_P (val) + && CONSTRUCTOR_NELTS (val) == 0)) + return 0; + + if (real_zerop (val)) + { + /* Only return 0 for +0.0, not for -0.0, which doesn't have + an all bytes same memory representation. Don't transform + -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. 
*/ + switch (TREE_CODE (val)) + { + case REAL_CST: + if (!real_isneg (TREE_REAL_CST_PTR (val))) + return 0; + break; + case COMPLEX_CST: + if (!const_with_all_bytes_same (TREE_REALPART (val)) + && !const_with_all_bytes_same (TREE_IMAGPART (val))) + return 0; + break; + case VECTOR_CST: + { + unsigned int count = vector_cst_encoded_nelts (val); + unsigned int j; + for (j = 0; j < count; ++j) + if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j))) + break; + if (j == count) + return 0; + break; + } + default: + break; + } + } + + if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) + return -1; + + len = native_encode_expr (val, buf, sizeof (buf)); + if (len == 0) + return -1; + for (i = 1; i < len; i++) + if (buf[i] != buf[0]) + return -1; + return buf[0]; +} + +/* Generate a call to memset for PARTITION in LOOP. */ + +static void +generate_memset_builtin (class loop *loop, partition *partition) +{ + gimple_stmt_iterator gsi; + tree mem, fn, nb_bytes; + tree val; + struct builtin_info *builtin = partition->builtin; + gimple *fn_call; + + /* The new statements will be placed before LOOP. */ + gsi = gsi_last_bb (loop_preheader_edge (loop)->src); + + nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); + nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + mem = rewrite_to_non_trapping_overflow (builtin->dst_base); + mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + + /* This exactly matches the pattern recognition in classify_partition. */ + val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr)); + /* Handle constants like 0x15151515 and similarly + floating point constants etc. where all bytes are the same. */ + int bytev = const_with_all_bytes_same (val); + if (bytev != -1) + val = build_int_cst (integer_type_node, bytev); + else if (TREE_CODE (val) == INTEGER_CST) + val = fold_convert (integer_type_node, val); + else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val))) + { + tree tem = make_ssa_name (integer_type_node); + gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val); + gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING); + val = tem; + } + + fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET)); + fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes); + gimple_set_location (fn_call, partition->loc); + gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); + fold_stmt (&gsi); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "generated memset"); + if (bytev == 0) + fprintf (dump_file, " zero\n"); + else + fprintf (dump_file, "\n"); + } +} + +/* Generate a call to memcpy for PARTITION in LOOP. */ + +static void +generate_memcpy_builtin (class loop *loop, partition *partition) +{ + gimple_stmt_iterator gsi; + gimple *fn_call; + tree dest, src, fn, nb_bytes; + enum built_in_function kind; + struct builtin_info *builtin = partition->builtin; + + /* The new statements will be placed before LOOP. */ + gsi = gsi_last_bb (loop_preheader_edge (loop)->src); + + nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); + nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + dest = rewrite_to_non_trapping_overflow (builtin->dst_base); + src = rewrite_to_non_trapping_overflow (builtin->src_base); + if (partition->kind == PKIND_MEMCPY + || ! ptr_derefs_may_alias_p (dest, src)) + kind = BUILT_IN_MEMCPY; + else + kind = BUILT_IN_MEMMOVE; + /* Try harder if we're copying a constant size. 
*/ + if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes)) + { + aff_tree asrc, adest; + tree_to_aff_combination (src, ptr_type_node, &asrc); + tree_to_aff_combination (dest, ptr_type_node, &adest); + aff_combination_scale (&adest, -1); + aff_combination_add (&asrc, &adest); + if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes), + wi::to_poly_widest (nb_bytes))) + kind = BUILT_IN_MEMCPY; + } + + dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + fn = build_fold_addr_expr (builtin_decl_implicit (kind)); + fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes); + gimple_set_location (fn_call, partition->loc); + gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); + fold_stmt (&gsi); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + if (kind == BUILT_IN_MEMCPY) + fprintf (dump_file, "generated memcpy\n"); + else + fprintf (dump_file, "generated memmove\n"); + } +} + +/* Remove and destroy the loop LOOP. */ + +static void +destroy_loop (class loop *loop) +{ + unsigned nbbs = loop->num_nodes; + edge exit = single_exit (loop); + basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest; + basic_block *bbs; + unsigned i; + + bbs = get_loop_body_in_dom_order (loop); + + gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest); + bool safe_p = single_pred_p (exit->dest); + for (unsigned i = 0; i < nbbs; ++i) + { + /* We have made sure to not leave any dangling uses of SSA + names defined in the loop. With the exception of virtuals. + Make sure we replace all uses of virtual defs that will remain + outside of the loop with the bare symbol as delete_basic_block + will release them. */ + for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + mark_virtual_phi_result_for_renaming (phi); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);) + { + gimple *stmt = gsi_stmt (gsi); + tree vdef = gimple_vdef (stmt); + if (vdef && TREE_CODE (vdef) == SSA_NAME) + mark_virtual_operand_for_renaming (vdef); + /* Also move and eventually reset debug stmts. We can leave + constant values in place in case the stmt dominates the exit. + ??? Non-constant values from the last iteration can be + replaced with final values if we can compute them. */ + if (gimple_debug_bind_p (stmt)) + { + tree val = gimple_debug_bind_get_value (stmt); + gsi_move_before (&gsi, &dst_gsi); + if (val + && (!safe_p + || !is_gimple_min_invariant (val) + || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i]))) + { + gimple_debug_bind_reset_value (stmt); + update_stmt (stmt); + } + } + else + gsi_next (&gsi); + } + } + + redirect_edge_pred (exit, src); + exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); + exit->flags |= EDGE_FALLTHRU; + cancel_loop_tree (loop); + rescan_loop_exit (exit, false, true); + + i = nbbs; + do + { + --i; + delete_basic_block (bbs[i]); + } + while (i != 0); + + free (bbs); + + set_immediate_dominator (CDI_DOMINATORS, dest, + recompute_dominator (CDI_DOMINATORS, dest)); +} + +/* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */ + +static bool +generate_code_for_partition (class loop *loop, + partition *partition, bool copy_p) +{ + switch (partition->kind) + { + case PKIND_NORMAL: + case PKIND_PARTIAL_MEMSET: + /* Reductions all have to be in the last partition. 
*/ + gcc_assert (!partition_reduction_p (partition) + || !copy_p); + generate_loops_for_partition (loop, partition, copy_p); + return false; + + case PKIND_MEMSET: + generate_memset_builtin (loop, partition); + break; + + case PKIND_MEMCPY: + case PKIND_MEMMOVE: + generate_memcpy_builtin (loop, partition); + break; + + default: + gcc_unreachable (); + } + + /* Common tail for partitions we turn into a call. If this was the last + partition for which we generate code, we have to destroy the loop. */ + if (!copy_p) + return true; + return false; +} + +data_dependence_relation * +loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a, + data_reference_p b) +{ + struct data_dependence_relation ent, **slot; + struct data_dependence_relation *ddr; + + gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b)); + gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a)) + <= rdg_vertex_for_stmt (rdg, DR_STMT (b))); + ent.a = a; + ent.b = b; + slot = ddrs_table->find_slot (&ent, INSERT); + if (*slot == NULL) + { + ddr = initialize_data_dependence_relation (a, b, loop_nest); + compute_affine_dependence (ddr, loop_nest[0]); + *slot = ddr; + } + + return *slot; +} + +bool +loop_distribution::data_dep_in_cycle_p (struct graph *rdg, + data_reference_p dr1, + data_reference_p dr2) +{ + struct data_dependence_relation *ddr; + + /* Re-shuffle data-refs to be in topological order. */ + if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) + > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) + std::swap (dr1, dr2); + + ddr = get_data_dependence (rdg, dr1, dr2); + + /* In case of no data dependence. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + return false; + /* For unknown data dependence or known data dependence which can't be + expressed in classic distance vector, we check if it can be resolved + by runtime alias check. If yes, we still consider data dependence + as won't introduce data dependence cycle. */ + else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know + || DDR_NUM_DIST_VECTS (ddr) == 0) + return !runtime_alias_check_p (ddr, NULL, true); + else if (DDR_NUM_DIST_VECTS (ddr) > 1) + return true; + else if (DDR_REVERSED_P (ddr) + || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) + return false; + + return true; +} + +void +loop_distribution::update_type_for_merge (struct graph *rdg, + partition *partition1, + partition *partition2) +{ + unsigned i, j; + bitmap_iterator bi, bj; + data_reference_p dr1, dr2; + + EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) + { + unsigned start = (partition1 == partition2) ? i + 1 : 0; + + dr1 = datarefs_vec[i]; + EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj) + { + dr2 = datarefs_vec[j]; + if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) + continue; + + /* Partition can only be executed sequentially if there is any + data dependence cycle. */ + if (data_dep_in_cycle_p (rdg, dr1, dr2)) + { + partition1->type = PTYPE_SEQUENTIAL; + return; + } + } + } +} + +partition * +loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v) +{ + partition *partition = partition_alloc (); + auto_vec<int, 3> nodes; + unsigned i, j; + int x; + data_reference_p dr; + + graphds_dfs (rdg, &v, 1, &nodes, false, NULL); + + FOR_EACH_VEC_ELT (nodes, i, x) + { + bitmap_set_bit (partition->stmts, x); + + for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j) + { + unsigned idx = (unsigned) DR_INDEX (dr); + gcc_assert (idx < datarefs_vec.length ()); + + /* Partition can only be executed sequentially if there is any + unknown data reference. 
*/ + if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) + || !DR_INIT (dr) || !DR_STEP (dr)) + partition->type = PTYPE_SEQUENTIAL; + + bitmap_set_bit (partition->datarefs, idx); + } + } + + if (partition->type == PTYPE_SEQUENTIAL) + return partition; + + /* Further check if any data dependence prevents us from executing the + partition parallelly. */ + update_type_for_merge (rdg, partition, partition); + + return partition; +} + +/* Given PARTITION of LOOP and RDG, record single load/store data references + for builtin partition in SRC_DR/DST_DR, return false if there is no such + data references. */ + +static bool +find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts, + data_reference_p *dst_dr, data_reference_p *src_dr) +{ + unsigned i; + data_reference_p single_ld = NULL, single_st = NULL; + bitmap_iterator bi; + + EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi) + { + gimple *stmt = RDG_STMT (rdg, i); + data_reference_p dr; + + if (gimple_code (stmt) == GIMPLE_PHI) + continue; + + /* Any scalar stmts are ok. */ + if (!gimple_vuse (stmt)) + continue; + + /* Otherwise just regular loads/stores. */ + if (!gimple_assign_single_p (stmt)) + return false; + + /* But exactly one store and/or load. */ + for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j) + { + tree type = TREE_TYPE (DR_REF (dr)); + + /* The memset, memcpy and memmove library calls are only + able to deal with generic address space. */ + if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type))) + return false; + + if (DR_IS_READ (dr)) + { + if (single_ld != NULL) + return false; + single_ld = dr; + } + else + { + if (single_st != NULL) + return false; + single_st = dr; + } + } + } + + if (!single_ld && !single_st) + return false; + + basic_block bb_ld = NULL; + basic_block bb_st = NULL; + + if (single_ld) + { + /* Bail out if this is a bitfield memory reference. */ + if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF + && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1))) + return false; + + /* Data reference must be executed exactly once per iteration of each + loop in the loop nest. We only need to check dominance information + against the outermost one in a perfect loop nest because a bb can't + dominate outermost loop's latch without dominating inner loop's. */ + bb_ld = gimple_bb (DR_STMT (single_ld)); + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) + return false; + } + + if (single_st) + { + /* Bail out if this is a bitfield memory reference. */ + if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF + && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) + return false; + + /* Data reference must be executed exactly once per iteration. + Same as single_ld, we only need to check against the outermost + loop. */ + bb_st = gimple_bb (DR_STMT (single_st)); + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) + return false; + } + + if (single_ld && single_st) + { + /* Load and store must be in the same loop nest. */ + if (bb_st->loop_father != bb_ld->loop_father) + return false; + + edge e = single_exit (bb_st->loop_father); + bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld); + bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st); + if (dom_ld != dom_st) + return false; + } + + *src_dr = single_ld; + *dst_dr = single_st; + return true; +} + +/* Given data reference DR in LOOP_NEST, this function checks the enclosing + loops from inner to outer to see if loop's step equals to access size at + each level of loop. 
Return 2 if we can prove this at all level loops; + record access base and size in BASE and SIZE; save loop's step at each + level of loop in STEPS if it is not null. For example: + + int arr[100][100][100]; + for (i = 0; i < 100; i++) ;steps[2] = 40000 + for (j = 100; j > 0; j--) ;steps[1] = -400 + for (k = 0; k < 100; k++) ;steps[0] = 4 + arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 + + Return 1 if we can prove the equality at the innermost loop, but not all + level loops. In this case, no information is recorded. + + Return 0 if no equality can be proven at any level loops. */ + +static int +compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, + tree *size, vec<tree> *steps = NULL) +{ + location_t loc = gimple_location (DR_STMT (dr)); + basic_block bb = gimple_bb (DR_STMT (dr)); + class loop *loop = bb->loop_father; + tree ref = DR_REF (dr); + tree access_base = build_fold_addr_expr (ref); + tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); + int res = 0; + + do { + tree scev_fn = analyze_scalar_evolution (loop, access_base); + if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) + return res; + + access_base = CHREC_LEFT (scev_fn); + if (tree_contains_chrecs (access_base, NULL)) + return res; + + tree scev_step = CHREC_RIGHT (scev_fn); + /* Only support constant steps. */ + if (TREE_CODE (scev_step) != INTEGER_CST) + return res; + + enum ev_direction access_dir = scev_direction (scev_fn); + if (access_dir == EV_DIR_UNKNOWN) + return res; + + if (steps != NULL) + steps->safe_push (scev_step); + + scev_step = fold_convert_loc (loc, sizetype, scev_step); + /* Compute absolute value of scev step. */ + if (access_dir == EV_DIR_DECREASES) + scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); + + /* At each level of loop, scev step must equal to access size. In other + words, DR must access consecutive memory between loop iterations. */ + if (!operand_equal_p (scev_step, access_size, 0)) + return res; + + /* Access stride can be computed for data reference at least for the + innermost loop. */ + res = 1; + + /* Compute DR's execution times in loop. */ + tree niters = number_of_latch_executions (loop); + niters = fold_convert_loc (loc, sizetype, niters); + if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) + niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node); + + /* Compute DR's overall access size in loop. */ + access_size = fold_build2_loc (loc, MULT_EXPR, sizetype, + niters, scev_step); + /* Adjust base address in case of negative step. */ + if (access_dir == EV_DIR_DECREASES) + { + tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype, + scev_step, access_size); + access_base = fold_build_pointer_plus_loc (loc, access_base, adj); + } + } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); + + *base = access_base; + *size = access_size; + /* Access stride can be computed for data reference at each level loop. */ + return 2; +} + +/* Allocate and return builtin struct. Record information like DST_DR, + SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. 
*/ + +static struct builtin_info * +alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr, + tree dst_base, tree src_base, tree size) +{ + struct builtin_info *builtin = XNEW (struct builtin_info); + builtin->dst_dr = dst_dr; + builtin->src_dr = src_dr; + builtin->dst_base = dst_base; + builtin->src_base = src_base; + builtin->size = size; + return builtin; +} + +/* Given data reference DR in loop nest LOOP, classify if it forms builtin + memset call. */ + +static void +classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) +{ + gimple *stmt = DR_STMT (dr); + tree base, size, rhs = gimple_assign_rhs1 (stmt); + + if (const_with_all_bytes_same (rhs) == -1 + && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs)) + || (TYPE_MODE (TREE_TYPE (rhs)) + != TYPE_MODE (unsigned_char_type_node)))) + return; + + if (TREE_CODE (rhs) == SSA_NAME + && !SSA_NAME_IS_DEFAULT_DEF (rhs) + && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) + return; + + int res = compute_access_range (loop, dr, &base, &size); + if (res == 0) + return; + if (res == 1) + { + partition->kind = PKIND_PARTIAL_MEMSET; + return; + } + + poly_uint64 base_offset; + unsigned HOST_WIDE_INT const_base_offset; + tree base_base = strip_offset (base, &base_offset); + if (!base_offset.is_constant (&const_base_offset)) + return; + + struct builtin_info *builtin; + builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); + builtin->dst_base_base = base_base; + builtin->dst_base_offset = const_base_offset; + partition->builtin = builtin; + partition->kind = PKIND_MEMSET; +} + +/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify + if it forms builtin memcpy or memmove call. */ + +void +loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg, + partition *partition, + data_reference_p dst_dr, + data_reference_p src_dr) +{ + tree base, size, src_base, src_size; + auto_vec<tree> dst_steps, src_steps; + + /* Compute access range of both load and store. */ + int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps); + if (res != 2) + return; + res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps); + if (res != 2) + return; + + /* They much have the same access size. */ + if (!operand_equal_p (size, src_size, 0)) + return; + + /* Load and store in loop nest must access memory in the same way, i.e, + their must have the same steps in each loop of the nest. */ + if (dst_steps.length () != src_steps.length ()) + return; + for (unsigned i = 0; i < dst_steps.length (); ++i) + if (!operand_equal_p (dst_steps[i], src_steps[i], 0)) + return; + + /* Now check that if there is a dependence. */ + ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr); + + /* Classify as memmove if no dependence between load and store. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + { + partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); + partition->kind = PKIND_MEMMOVE; + return; + } + + /* Can't do memmove in case of unknown dependence or dependence without + classical distance vector. */ + if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know + || DDR_NUM_DIST_VECTS (ddr) == 0) + return; + + unsigned i; + lambda_vector dist_v; + int num_lev = (DDR_LOOP_NEST (ddr)).length (); + FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) + { + unsigned dep_lev = dependence_level (dist_v, num_lev); + /* Can't do memmove if load depends on store. 
*/ + if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr)) + return; + } + + partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); + partition->kind = PKIND_MEMMOVE; + return; +} + +bool +loop_distribution::classify_partition (loop_p loop, + struct graph *rdg, partition *partition, + bitmap stmt_in_all_partitions) +{ + bitmap_iterator bi; + unsigned i; + data_reference_p single_ld = NULL, single_st = NULL; + bool volatiles_p = false, has_reduction = false; + + EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) + { + gimple *stmt = RDG_STMT (rdg, i); + + if (gimple_has_volatile_ops (stmt)) + volatiles_p = true; + + /* If the stmt is not included by all partitions and there is uses + outside of the loop, then mark the partition as reduction. */ + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + { + /* Due to limitation in the transform phase we have to fuse all + reduction partitions. As a result, this could cancel valid + loop distribution especially for loop that induction variable + is used outside of loop. To workaround this issue, we skip + marking partition as reudction if the reduction stmt belongs + to all partitions. In such case, reduction will be computed + correctly no matter how partitions are fused/distributed. */ + if (!bitmap_bit_p (stmt_in_all_partitions, i)) + partition->reduction_p = true; + else + has_reduction = true; + } + } + + /* Simple workaround to prevent classifying the partition as builtin + if it contains any use outside of loop. For the case where all + partitions have the reduction this simple workaround is delayed + to only affect the last partition. */ + if (partition->reduction_p) + return has_reduction; + + /* Perform general partition disqualification for builtins. */ + if (volatiles_p + || !flag_tree_loop_distribute_patterns) + return has_reduction; + + /* Find single load/store data references for builtin partition. */ + if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld) + || !single_st) + return has_reduction; + + if (single_ld && single_st) + { + gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); + /* Direct aggregate copy or via an SSA name temporary. */ + if (load != store + && gimple_assign_lhs (load) != gimple_assign_rhs1 (store)) + return has_reduction; + } + + partition->loc = gimple_location (DR_STMT (single_st)); + + /* Classify the builtin kind. */ + if (single_ld == NULL) + classify_builtin_st (loop, partition, single_st); + else + classify_builtin_ldst (loop, rdg, partition, single_st, single_ld); + return has_reduction; +} + +bool +loop_distribution::share_memory_accesses (struct graph *rdg, + partition *partition1, partition *partition2) +{ + unsigned i, j; + bitmap_iterator bi, bj; + data_reference_p dr1, dr2; + + /* First check whether in the intersection of the two partitions are + any loads or stores. Common loads are the situation that happens + most often. */ + EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi) + if (RDG_MEM_WRITE_STMT (rdg, i) + || RDG_MEM_READS_STMT (rdg, i)) + return true; + + /* Then check whether the two partitions access the same memory object. 
*/ + EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) + { + dr1 = datarefs_vec[i]; + + if (!DR_BASE_ADDRESS (dr1) + || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1)) + continue; + + EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj) + { + dr2 = datarefs_vec[j]; + + if (!DR_BASE_ADDRESS (dr2) + || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2)) + continue; + + if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) + && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) + && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) + && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) + return true; + } + } + + return false; +} + +/* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. + All partitions are recorded in PARTITIONS. */ + +void +loop_distribution::rdg_build_partitions (struct graph *rdg, + vec<gimple *> starting_stmts, + vec<partition *> *partitions) +{ + auto_bitmap processed; + int i; + gimple *stmt; + + FOR_EACH_VEC_ELT (starting_stmts, i, stmt) + { + int v = rdg_vertex_for_stmt (rdg, stmt); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "ldist asked to generate code for vertex %d\n", v); + + /* If the vertex is already contained in another partition so + is the partition rooted at it. */ + if (bitmap_bit_p (processed, v)) + continue; + + partition *partition = build_rdg_partition_for_vertex (rdg, v); + bitmap_ior_into (processed, partition->stmts); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ldist creates useful %s partition:\n", + partition->type == PTYPE_PARALLEL ? "parallel" : "sequent"); + bitmap_print (dump_file, partition->stmts, " ", "\n"); + } + + partitions->safe_push (partition); + } + + /* All vertices should have been assigned to at least one partition now, + other than vertices belonging to dead code. */ +} + +/* Dump to FILE the PARTITIONS. */ + +static void +dump_rdg_partitions (FILE *file, const vec<partition *> &partitions) +{ + int i; + partition *partition; + + FOR_EACH_VEC_ELT (partitions, i, partition) + debug_bitmap_file (file, partition->stmts); +} + +/* Debug PARTITIONS. */ +extern void debug_rdg_partitions (const vec<partition *> &); + +DEBUG_FUNCTION void +debug_rdg_partitions (const vec<partition *> &partitions) +{ + dump_rdg_partitions (stderr, partitions); +} + +/* Returns the number of read and write operations in the RDG. */ + +static int +number_of_rw_in_rdg (struct graph *rdg) +{ + int i, res = 0; + + for (i = 0; i < rdg->n_vertices; i++) + { + if (RDG_MEM_WRITE_STMT (rdg, i)) + ++res; + + if (RDG_MEM_READS_STMT (rdg, i)) + ++res; + } + + return res; +} + +/* Returns the number of read and write operations in a PARTITION of + the RDG. */ + +static int +number_of_rw_in_partition (struct graph *rdg, partition *partition) +{ + int res = 0; + unsigned i; + bitmap_iterator ii; + + EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii) + { + if (RDG_MEM_WRITE_STMT (rdg, i)) + ++res; + + if (RDG_MEM_READS_STMT (rdg, i)) + ++res; + } + + return res; +} + +/* Returns true when one of the PARTITIONS contains all the read or + write operations of RDG. 
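+ In that case distribution would not help: one of the resulting loops
+ would still contain every memory access of the original loop.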
*/ + +static bool +partition_contains_all_rw (struct graph *rdg, + const vec<partition *> &partitions) +{ + int i; + partition *partition; + int nrw = number_of_rw_in_rdg (rdg); + + FOR_EACH_VEC_ELT (partitions, i, partition) + if (nrw == number_of_rw_in_partition (rdg, partition)) + return true; + + return false; +} + +int +loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, + bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs) +{ + unsigned i, j; + bitmap_iterator bi, bj; + data_reference_p dr1, dr2, saved_dr1; + + /* dependence direction - 0 is no dependence, -1 is back, + 1 is forth, 2 is both (we can stop then, merging will occur). */ + EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi) + { + dr1 = datarefs_vec[i]; + + EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj) + { + int res, this_dir = 1; + ddr_p ddr; + + dr2 = datarefs_vec[j]; + + /* Skip all <read, read> data dependence. */ + if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) + continue; + + saved_dr1 = dr1; + /* Re-shuffle data-refs to be in topological order. */ + if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) + > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) + { + std::swap (dr1, dr2); + this_dir = -this_dir; + } + ddr = get_data_dependence (rdg, dr1, dr2); + if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) + { + this_dir = 0; + res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1), + DR_BASE_ADDRESS (dr2)); + /* Be conservative. If data references are not well analyzed, + or the two data references have the same base address and + offset, add dependence and consider it alias to each other. + In other words, the dependence cannot be resolved by + runtime alias check. */ + if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2) + || !DR_OFFSET (dr1) || !DR_OFFSET (dr2) + || !DR_INIT (dr1) || !DR_INIT (dr2) + || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1)) + || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2)) + || res == 0) + this_dir = 2; + /* Data dependence could be resolved by runtime alias check, + record it in ALIAS_DDRS. */ + else if (alias_ddrs != NULL) + alias_ddrs->safe_push (ddr); + /* Or simply ignore it. */ + } + else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE) + { + if (DDR_REVERSED_P (ddr)) + this_dir = -this_dir; + + /* Known dependences can still be unordered througout the + iteration space, see gcc.dg/tree-ssa/ldist-16.c and + gcc.dg/tree-ssa/pr94969.c. */ + if (DDR_NUM_DIST_VECTS (ddr) != 1) + this_dir = 2; + /* If the overlap is exact preserve stmt order. */ + else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), + DDR_NB_LOOPS (ddr))) + ; + /* Else as the distance vector is lexicographic positive swap + the dependence direction. */ + else + this_dir = -this_dir; + } + else + this_dir = 0; + if (this_dir == 2) + return 2; + else if (dir == 0) + dir = this_dir; + else if (this_dir != 0 && dir != this_dir) + return 2; + /* Shuffle "back" dr1. */ + dr1 = saved_dr1; + } + } + return dir; +} + +/* Compare postorder number of the partition graph vertices V1 and V2. */ + +static int +pgcmp (const void *v1_, const void *v2_) +{ + const vertex *v1 = (const vertex *)v1_; + const vertex *v2 = (const vertex *)v2_; + return v2->post - v1->post; +} + +/* Data attached to vertices of partition dependence graph. */ +struct pg_vdata +{ + /* ID of the corresponding partition. */ + int id; + /* The partition. */ + struct partition *partition; +}; + +/* Data attached to edges of partition dependence graph. 
*/
+struct pg_edata
+{
+ /* If the dependence edge can be resolved by runtime alias check,
+ this vector contains data dependence relations for runtime alias
+ check. On the other hand, if the dependence edge is introduced
+ because of compilation time known data dependence, this vector
+ contains nothing. */
+ vec<ddr_p> alias_ddrs;
+};
+
+/* Callback data for traversing edges in graph. */
+struct pg_edge_callback_data
+{
+ /* Bitmap contains strong connected components that should be merged. */
+ bitmap sccs_to_merge;
+ /* Array contains component information for all vertices. */
+ int *vertices_component;
+ /* Array contains postorder information for all vertices. */
+ int *vertices_post;
+ /* Vector to record all data dependence relations which are needed
+ to break strong connected components by runtime alias checks. */
+ vec<ddr_p> *alias_ddrs;
+};
+
+/* Initialize vertices' data for partition dependence graph PG with
+ PARTITIONS. */
+
+static void
+init_partition_graph_vertices (struct graph *pg,
+ vec<struct partition *> *partitions)
+{
+ int i;
+ partition *partition;
+ struct pg_vdata *data;
+
+ for (i = 0; partitions->iterate (i, &partition); ++i)
+ {
+ data = new pg_vdata;
+ pg->vertices[i].data = data;
+ data->id = i;
+ data->partition = partition;
+ }
+}
+
+/* Add edge <I, J> to partition dependence graph PG. Attach vector of data
+ dependence relations to the EDGE if DDRS isn't NULL. */
+
+static void
+add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
+{
+ struct graph_edge *e = add_edge (pg, i, j);
+
+ /* If the edge is attached with data dependence relations, it means this
+ dependence edge can be resolved by runtime alias checks. */
+ if (ddrs != NULL)
+ {
+ struct pg_edata *data = new pg_edata;
+
+ gcc_assert (ddrs->length () > 0);
+ e->data = data;
+ data->alias_ddrs = vNULL;
+ data->alias_ddrs.safe_splice (*ddrs);
+ }
+}
+
+/* Callback function for graph traversal algorithm. It returns true
+ if edge E should be skipped when traversing the graph. */
+
+static bool
+pg_skip_alias_edge (struct graph_edge *e)
+{
+ struct pg_edata *data = (struct pg_edata *)e->data;
+ return (data != NULL && data->alias_ddrs.length () > 0);
+}
+
+/* Callback function freeing data attached to edge E of graph. */
+
+static void
+free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
+{
+ if (e->data != NULL)
+ {
+ struct pg_edata *data = (struct pg_edata *)e->data;
+ data->alias_ddrs.release ();
+ delete data;
+ }
+}
+
+/* Free data attached to vertices of partition dependence graph PG. */
+
+static void
+free_partition_graph_vdata (struct graph *pg)
+{
+ int i;
+ struct pg_vdata *data;
+
+ for (i = 0; i < pg->n_vertices; ++i)
+ {
+ data = (struct pg_vdata *)pg->vertices[i].data;
+ delete data;
+ }
+}
+
+/* Build and return partition dependence graph for PARTITIONS. RDG is
+ reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
+ is true, data dependence caused by possible alias between references
+ is ignored, as if it doesn't exist at all; otherwise all dependences
+ are considered. */
+
+struct graph *
+loop_distribution::build_partition_graph (struct graph *rdg,
+ vec<struct partition *> *partitions,
+ bool ignore_alias_p)
+{
+ int i, j;
+ struct partition *partition1, *partition2;
+ graph *pg = new_graph (partitions->length ());
+ auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
+
+ alias_ddrs_p = ignore_alias_p ? 
NULL : &alias_ddrs; + + init_partition_graph_vertices (pg, partitions); + + for (i = 0; partitions->iterate (i, &partition1); ++i) + { + for (j = i + 1; partitions->iterate (j, &partition2); ++j) + { + /* dependence direction - 0 is no dependence, -1 is back, + 1 is forth, 2 is both (we can stop then, merging will occur). */ + int dir = 0; + + /* If the first partition has reduction, add back edge; if the + second partition has reduction, add forth edge. This makes + sure that reduction partition will be sorted as the last one. */ + if (partition_reduction_p (partition1)) + dir = -1; + else if (partition_reduction_p (partition2)) + dir = 1; + + /* Cleanup the temporary vector. */ + alias_ddrs.truncate (0); + + dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs, + partition2->datarefs, alias_ddrs_p); + + /* Add edge to partition graph if there exists dependence. There + are two types of edges. One type edge is caused by compilation + time known dependence, this type cannot be resolved by runtime + alias check. The other type can be resolved by runtime alias + check. */ + if (dir == 1 || dir == 2 + || alias_ddrs.length () > 0) + { + /* Attach data dependence relations to edge that can be resolved + by runtime alias check. */ + bool alias_edge_p = (dir != 1 && dir != 2); + add_partition_graph_edge (pg, i, j, + (alias_edge_p) ? &alias_ddrs : NULL); + } + if (dir == -1 || dir == 2 + || alias_ddrs.length () > 0) + { + /* Attach data dependence relations to edge that can be resolved + by runtime alias check. */ + bool alias_edge_p = (dir != -1 && dir != 2); + add_partition_graph_edge (pg, j, i, + (alias_edge_p) ? &alias_ddrs : NULL); + } + } + } + return pg; +} + +/* Sort partitions in PG in descending post order and store them in + PARTITIONS. */ + +static void +sort_partitions_by_post_order (struct graph *pg, + vec<struct partition *> *partitions) +{ + int i; + struct pg_vdata *data; + + /* Now order the remaining nodes in descending postorder. */ + qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp); + partitions->truncate (0); + for (i = 0; i < pg->n_vertices; ++i) + { + data = (struct pg_vdata *)pg->vertices[i].data; + if (data->partition) + partitions->safe_push (data->partition); + } +} + +void +loop_distribution::merge_dep_scc_partitions (struct graph *rdg, + vec<struct partition *> *partitions, + bool ignore_alias_p) +{ + struct partition *partition1, *partition2; + struct pg_vdata *data; + graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p); + int i, j, num_sccs = graphds_scc (pg, NULL); + + /* Strong connected compoenent means dependence cycle, we cannot distribute + them. So fuse them together. */ + if ((unsigned) num_sccs < partitions->length ()) + { + for (i = 0; i < num_sccs; ++i) + { + for (j = 0; partitions->iterate (j, &partition1); ++j) + if (pg->vertices[j].component == i) + break; + for (j = j + 1; partitions->iterate (j, &partition2); ++j) + if (pg->vertices[j].component == i) + { + partition_merge_into (NULL, partition1, + partition2, FUSE_SAME_SCC); + partition1->type = PTYPE_SEQUENTIAL; + (*partitions)[j] = NULL; + partition_free (partition2); + data = (struct pg_vdata *)pg->vertices[j].data; + data->partition = NULL; + } + } + } + + sort_partitions_by_post_order (pg, partitions); + gcc_assert (partitions->length () == (unsigned)num_sccs); + free_partition_graph_vdata (pg); + for_each_edge (pg, free_partition_graph_edata_cb, NULL); + free_graph (pg); +} + +/* Callback function for traversing edge E in graph G. 
DATA is private
+ callback data. */
+
+static void
+pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
+{
+ int i, j, component;
+ struct pg_edge_callback_data *cbdata;
+ struct pg_edata *edata = (struct pg_edata *) e->data;
+
+ /* If the edge doesn't have attached data dependence, it represents
+ compilation time known dependences. This type of dependence cannot
+ be resolved by runtime alias check. */
+ if (edata == NULL || edata->alias_ddrs.length () == 0)
+ return;
+
+ cbdata = (struct pg_edge_callback_data *) data;
+ i = e->src;
+ j = e->dest;
+ component = cbdata->vertices_component[i];
+ /* Vertices are topologically sorted according to compilation time
+ known dependences, so we can break strong connected components
+ by removing edges of the opposite direction, i.e., edges pointing
+ from the vertex with smaller post number to the vertex with bigger
+ post number. */
+ if (g->vertices[i].post < g->vertices[j].post
+ /* We only need to remove edges connecting vertices in the same
+ strong connected component to break it. */
+ && component == cbdata->vertices_component[j]
+ /* Check if we want to break the strong connected component or not. */
+ && !bitmap_bit_p (cbdata->sccs_to_merge, component))
+ cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
+}
+
+/* This is the main function breaking strong connected components in
+ PARTITIONS given reduced dependence graph RDG. Store data dependence
+ relations for runtime alias check in ALIAS_DDRS. */
+void
+loop_distribution::break_alias_scc_partitions (struct graph *rdg,
+ vec<struct partition *> *partitions,
+ vec<ddr_p> *alias_ddrs)
+{
+ int i, j, k, num_sccs, num_sccs_no_alias = 0;
+ /* Build partition dependence graph. */
+ graph *pg = build_partition_graph (rdg, partitions, false);
+
+ alias_ddrs->truncate (0);
+ /* Find strong connected components in the graph, with all dependence edges
+ considered. */
+ num_sccs = graphds_scc (pg, NULL);
+ /* All SCCs now can be broken by runtime alias checks because SCCs caused by
+ compilation time known dependences are merged before this function. */
+ if ((unsigned) num_sccs < partitions->length ())
+ {
+ struct pg_edge_callback_data cbdata;
+ auto_bitmap sccs_to_merge;
+ auto_vec<enum partition_type> scc_types;
+ struct partition *partition, *first;
+
+ /* If all partitions in an SCC have the same type, we can simply merge
+ the SCC. This loop finds such SCCs and records them in a bitmap. */
+ bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
+ for (i = 0; i < num_sccs; ++i)
+ {
+ for (j = 0; partitions->iterate (j, &first); ++j)
+ if (pg->vertices[j].component == i)
+ break;
+
+ bool same_type = true, all_builtins = partition_builtin_p (first);
+ for (++j; partitions->iterate (j, &partition); ++j)
+ {
+ if (pg->vertices[j].component != i)
+ continue;
+
+ if (first->type != partition->type)
+ {
+ same_type = false;
+ break;
+ }
+ all_builtins &= partition_builtin_p (partition);
+ }
+ /* Merge the SCC if all partitions in the SCC have the same type, even
+ though the result partition is sequential, because the vectorizer
+ can do better runtime alias checks. One exception is when all
+ partitions in the SCC are builtins. */
+ if (!same_type || all_builtins)
+ bitmap_clear_bit (sccs_to_merge, i);
+ }
+
+ /* Initialize callback data for traversing. 
*/ + cbdata.sccs_to_merge = sccs_to_merge; + cbdata.alias_ddrs = alias_ddrs; + cbdata.vertices_component = XNEWVEC (int, pg->n_vertices); + cbdata.vertices_post = XNEWVEC (int, pg->n_vertices); + /* Record the component information which will be corrupted by next + graph scc finding call. */ + for (i = 0; i < pg->n_vertices; ++i) + cbdata.vertices_component[i] = pg->vertices[i].component; + + /* Collect data dependences for runtime alias checks to break SCCs. */ + if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs) + { + /* Record the postorder information which will be corrupted by next + graph SCC finding call. */ + for (i = 0; i < pg->n_vertices; ++i) + cbdata.vertices_post[i] = pg->vertices[i].post; + + /* Run SCC finding algorithm again, with alias dependence edges + skipped. This is to topologically sort partitions according to + compilation time known dependence. Note the topological order + is stored in the form of pg's post order number. */ + num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge); + gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias); + /* With topological order, we can construct two subgraphs L and R. + L contains edge <x, y> where x < y in terms of post order, while + R contains edge <x, y> where x > y. Edges for compilation time + known dependence all fall in R, so we break SCCs by removing all + (alias) edges of in subgraph L. */ + for_each_edge (pg, pg_collect_alias_ddrs, &cbdata); + } + + /* For SCC that doesn't need to be broken, merge it. */ + for (i = 0; i < num_sccs; ++i) + { + if (!bitmap_bit_p (sccs_to_merge, i)) + continue; + + for (j = 0; partitions->iterate (j, &first); ++j) + if (cbdata.vertices_component[j] == i) + break; + for (k = j + 1; partitions->iterate (k, &partition); ++k) + { + struct pg_vdata *data; + + if (cbdata.vertices_component[k] != i) + continue; + + partition_merge_into (NULL, first, partition, FUSE_SAME_SCC); + (*partitions)[k] = NULL; + partition_free (partition); + data = (struct pg_vdata *)pg->vertices[k].data; + gcc_assert (data->id == k); + data->partition = NULL; + /* The result partition of merged SCC must be sequential. */ + first->type = PTYPE_SEQUENTIAL; + } + } + /* Restore the postorder information if it's corrupted in finding SCC + with alias dependence edges skipped. If reduction partition's SCC is + broken by runtime alias checks, we force a negative post order to it + making sure it will be scheduled in the last. */ + if (num_sccs_no_alias > 0) + { + j = -1; + for (i = 0; i < pg->n_vertices; ++i) + { + pg->vertices[i].post = cbdata.vertices_post[i]; + struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data; + if (data->partition && partition_reduction_p (data->partition)) + { + gcc_assert (j == -1); + j = i; + } + } + if (j >= 0) + pg->vertices[j].post = -1; + } + + free (cbdata.vertices_component); + free (cbdata.vertices_post); + } + + sort_partitions_by_post_order (pg, partitions); + free_partition_graph_vdata (pg); + for_each_edge (pg, free_partition_graph_edata_cb, NULL); + free_graph (pg); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Possible alias data dependence to break:\n"); + dump_data_dependence_relations (dump_file, *alias_ddrs); + } +} + +/* Compute and return an expression whose value is the segment length which + will be accessed by DR in NITERS iterations. 
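+ That is DR_STEP * (NITERS - 1); the size of the access itself is
+ accounted for separately as the access size of each dr_with_seg_len.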
*/ + +static tree +data_ref_segment_size (struct data_reference *dr, tree niters) +{ + niters = size_binop (MINUS_EXPR, + fold_convert (sizetype, niters), + size_one_node); + return size_binop (MULT_EXPR, + fold_convert (sizetype, DR_STEP (dr)), + fold_convert (sizetype, niters)); +} + +/* Return true if LOOP's latch is dominated by statement for data reference + DR. */ + +static inline bool +latch_dominated_by_data_ref (class loop *loop, data_reference *dr) +{ + return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, + gimple_bb (DR_STMT (dr))); +} + +/* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's + data dependence relations ALIAS_DDRS. */ + +static void +compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs, + vec<dr_with_seg_len_pair_t> *comp_alias_pairs) +{ + unsigned int i; + unsigned HOST_WIDE_INT factor = 1; + tree niters_plus_one, niters = number_of_latch_executions (loop); + + gcc_assert (niters != NULL_TREE && niters != chrec_dont_know); + niters = fold_convert (sizetype, niters); + niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Creating alias check pairs:\n"); + + /* Iterate all data dependence relations and compute alias check pairs. */ + for (i = 0; i < alias_ddrs->length (); i++) + { + ddr_p ddr = (*alias_ddrs)[i]; + struct data_reference *dr_a = DDR_A (ddr); + struct data_reference *dr_b = DDR_B (ddr); + tree seg_length_a, seg_length_b; + + if (latch_dominated_by_data_ref (loop, dr_a)) + seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); + else + seg_length_a = data_ref_segment_size (dr_a, niters); + + if (latch_dominated_by_data_ref (loop, dr_b)) + seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); + else + seg_length_b = data_ref_segment_size (dr_b, niters); + + unsigned HOST_WIDE_INT access_size_a + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); + unsigned HOST_WIDE_INT access_size_b + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); + unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); + unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); + + dr_with_seg_len_pair_t dr_with_seg_len_pair + (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), + dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), + /* ??? Would WELL_ORDERED be safe? */ + dr_with_seg_len_pair_t::REORDERED); + + comp_alias_pairs->safe_push (dr_with_seg_len_pair); + } + + if (tree_fits_uhwi_p (niters)) + factor = tree_to_uhwi (niters); + + /* Prune alias check pairs. */ + prune_runtime_alias_test_list (comp_alias_pairs, factor); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Improved number of alias checks from %d to %d\n", + alias_ddrs->length (), comp_alias_pairs->length ()); +} + +/* Given data dependence relations in ALIAS_DDRS, generate runtime alias + checks and version LOOP under condition of these runtime alias checks. */ + +static void +version_loop_by_alias_check (vec<struct partition *> *partitions, + class loop *loop, vec<ddr_p> *alias_ddrs) +{ + profile_probability prob; + basic_block cond_bb; + class loop *nloop; + tree lhs, arg0, cond_expr = NULL_TREE; + gimple_seq cond_stmts = NULL; + gimple *call_stmt = NULL; + auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs; + + /* Generate code for runtime alias checks if necessary. 
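+ The loop is then versioned: the copy that will be distributed runs
+ when the generated alias checks succeed, otherwise the unmodified
+ copy of the loop is executed.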
*/ + gcc_assert (alias_ddrs->length () > 0); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Version loop <%d> with runtime alias check\n", loop->num); + + compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs); + create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); + cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, + is_gimple_val, NULL_TREE); + + /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ + bool cancelable_p = flag_tree_loop_vectorize; + if (cancelable_p) + { + unsigned i = 0; + struct partition *partition; + for (; partitions->iterate (i, &partition); ++i) + if (!partition_builtin_p (partition)) + break; + + /* If all partitions are builtins, distributing it would be profitable and + we don't want to cancel the runtime alias checks. */ + if (i == partitions->length ()) + cancelable_p = false; + } + + /* Generate internal function call for loop distribution alias check if the + runtime alias check should be cancelable. */ + if (cancelable_p) + { + call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, + 2, NULL_TREE, cond_expr); + lhs = make_ssa_name (boolean_type_node); + gimple_call_set_lhs (call_stmt, lhs); + } + else + lhs = cond_expr; + + prob = profile_probability::guessed_always ().apply_scale (9, 10); + initialize_original_copy_tables (); + nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (), + prob, prob.invert (), true); + free_original_copy_tables (); + /* Record the original loop number in newly generated loops. In case of + distribution, the original loop will be distributed and the new loop + is kept. */ + loop->orig_loop_num = nloop->num; + nloop->orig_loop_num = nloop->num; + nloop->dont_vectorize = true; + nloop->force_vectorize = false; + + if (call_stmt) + { + /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original + loop could be destroyed. */ + arg0 = build_int_cst (integer_type_node, loop->orig_loop_num); + gimple_call_set_arg (call_stmt, 0, arg0); + gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt); + } + + if (cond_stmts) + { + gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb); + gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT); + } + update_ssa (TODO_update_ssa); +} + +/* Return true if loop versioning is needed to distrubute PARTITIONS. + ALIAS_DDRS are data dependence relations for runtime alias check. */ + +static inline bool +version_for_distribution_p (vec<struct partition *> *partitions, + vec<ddr_p> *alias_ddrs) +{ + /* No need to version loop if we have only one partition. */ + if (partitions->length () == 1) + return false; + + /* Need to version loop if runtime alias check is necessary. */ + return (alias_ddrs->length () > 0); +} + +/* Compare base offset of builtin mem* partitions P1 and P2. */ + +static int +offset_cmp (const void *vp1, const void *vp2) +{ + struct partition *p1 = *(struct partition *const *) vp1; + struct partition *p2 = *(struct partition *const *) vp2; + unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset; + unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset; + return (o2 < o1) - (o1 < o2); +} + +/* Fuse adjacent memset builtin PARTITIONS if possible. 
This is a special + case optimization transforming below code: + + __builtin_memset (&obj, 0, 100); + _1 = &obj + 100; + __builtin_memset (_1, 0, 200); + _2 = &obj + 300; + __builtin_memset (_2, 0, 100); + + into: + + __builtin_memset (&obj, 0, 400); + + Note we don't have dependence information between different partitions + at this point, as a result, we can't handle nonadjacent memset builtin + partitions since dependence might be broken. */ + +static void +fuse_memset_builtins (vec<struct partition *> *partitions) +{ + unsigned i, j; + struct partition *part1, *part2; + tree rhs1, rhs2; + + for (i = 0; partitions->iterate (i, &part1);) + { + if (part1->kind != PKIND_MEMSET) + { + i++; + continue; + } + + /* Find sub-array of memset builtins of the same base. Index range + of the sub-array is [i, j) with "j > i". */ + for (j = i + 1; partitions->iterate (j, &part2); ++j) + { + if (part2->kind != PKIND_MEMSET + || !operand_equal_p (part1->builtin->dst_base_base, + part2->builtin->dst_base_base, 0)) + break; + + /* Memset calls setting different values can't be merged. */ + rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); + rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); + if (!operand_equal_p (rhs1, rhs2, 0)) + break; + } + + /* Stable sort is required in order to avoid breaking dependence. */ + gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i], + offset_cmp); + /* Continue with next partition. */ + i = j; + } + + /* Merge all consecutive memset builtin partitions. */ + for (i = 0; i < partitions->length () - 1;) + { + part1 = (*partitions)[i]; + if (part1->kind != PKIND_MEMSET) + { + i++; + continue; + } + + part2 = (*partitions)[i + 1]; + /* Only merge memset partitions of the same base and with constant + access sizes. */ + if (part2->kind != PKIND_MEMSET + || TREE_CODE (part1->builtin->size) != INTEGER_CST + || TREE_CODE (part2->builtin->size) != INTEGER_CST + || !operand_equal_p (part1->builtin->dst_base_base, + part2->builtin->dst_base_base, 0)) + { + i++; + continue; + } + rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); + rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); + int bytev1 = const_with_all_bytes_same (rhs1); + int bytev2 = const_with_all_bytes_same (rhs2); + /* Only merge memset partitions of the same value. */ + if (bytev1 != bytev2 || bytev1 == -1) + { + i++; + continue; + } + wide_int end1 = wi::add (part1->builtin->dst_base_offset, + wi::to_wide (part1->builtin->size)); + /* Only merge adjacent memset partitions. */ + if (wi::ne_p (end1, part2->builtin->dst_base_offset)) + { + i++; + continue; + } + /* Merge partitions[i] and partitions[i+1]. 
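+ The first partition simply grows by the size of the second, which is
+ then released; I is intentionally not advanced so that further
+ adjacent memset partitions can be merged into the same call.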
*/ + part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype, + part1->builtin->size, + part2->builtin->size); + partition_free (part2); + partitions->ordered_remove (i + 1); + } +} + +void +loop_distribution::finalize_partitions (class loop *loop, + vec<struct partition *> *partitions, + vec<ddr_p> *alias_ddrs) +{ + unsigned i; + struct partition *partition, *a; + + if (partitions->length () == 1 + || alias_ddrs->length () > 0) + return; + + unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; + bool same_type_p = true; + enum partition_type type = ((*partitions)[0])->type; + for (i = 0; partitions->iterate (i, &partition); ++i) + { + same_type_p &= (type == partition->type); + if (partition_builtin_p (partition)) + { + num_builtin++; + continue; + } + num_normal++; + if (partition->kind == PKIND_PARTIAL_MEMSET) + num_partial_memset++; + } + + /* Don't distribute current loop into too many loops given we don't have + memory stream cost model. Be even more conservative in case of loop + nest distribution. */ + if ((same_type_p && num_builtin == 0 + && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) + || (loop->inner != NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) + || (loop->inner == NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) + { + a = (*partitions)[0]; + for (i = 1; partitions->iterate (i, &partition); ++i) + { + partition_merge_into (NULL, a, partition, FUSE_FINALIZE); + partition_free (partition); + } + partitions->truncate (1); + } + + /* Fuse memset builtins if possible. */ + if (partitions->length () > 1) + fuse_memset_builtins (partitions); +} + +/* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of + distributed loops. Set NB_CALLS to number of generated builtin calls. + Set *DESTROY_P to whether LOOP needs to be destroyed. 
*/ + +int +loop_distribution::distribute_loop (class loop *loop, + const vec<gimple *> &stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p) +{ + ddrs_table = new hash_table<ddr_hasher> (389); + struct graph *rdg; + partition *partition; + int i, nbp; + + *destroy_p = false; + *nb_calls = 0; + loop_nest.create (0); + if (!find_loop_nest (loop, &loop_nest)) + { + loop_nest.release (); + delete ddrs_table; + return 0; + } + + datarefs_vec.create (20); + has_nonaddressable_dataref_p = false; + rdg = build_rdg (loop, cd); + if (!rdg) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop %d not distributed: failed to build the RDG.\n", + loop->num); + + loop_nest.release (); + free_data_refs (datarefs_vec); + delete ddrs_table; + return 0; + } + + if (datarefs_vec.length () > MAX_DATAREFS_NUM) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop %d not distributed: too many memory references.\n", + loop->num); + + free_rdg (rdg); + loop_nest.release (); + free_data_refs (datarefs_vec); + delete ddrs_table; + return 0; + } + + data_reference_p dref; + for (i = 0; datarefs_vec.iterate (i, &dref); ++i) + dref->aux = (void *) (uintptr_t) i; + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_rdg (dump_file, rdg); + + auto_vec<struct partition *, 3> partitions; + rdg_build_partitions (rdg, stmts, &partitions); + + auto_vec<ddr_p> alias_ddrs; + + auto_bitmap stmt_in_all_partitions; + bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); + for (i = 1; partitions.iterate (i, &partition); ++i) + bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); + + bool any_builtin = false; + bool reduction_in_all = false; + FOR_EACH_VEC_ELT (partitions, i, partition) + { + reduction_in_all + |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); + any_builtin |= partition_builtin_p (partition); + } + + /* If we are only distributing patterns but did not detect any, + simply bail out. */ + if (only_patterns_p + && !any_builtin) + { + nbp = 0; + goto ldist_done; + } + + /* If we are only distributing patterns fuse all partitions that + were not classified as builtins. This also avoids chopping + a loop into pieces, separated by builtin calls. That is, we + only want no or a single loop body remaining. */ + struct partition *into; + if (only_patterns_p) + { + for (i = 0; partitions.iterate (i, &into); ++i) + if (!partition_builtin_p (into)) + break; + for (++i; partitions.iterate (i, &partition); ++i) + if (!partition_builtin_p (partition)) + { + partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN); + partitions.unordered_remove (i); + partition_free (partition); + i--; + } + } + + /* Due to limitations in the transform phase we have to fuse all + reduction partitions into the last partition so the existing + loop will contain all loop-closed PHI nodes. */ + for (i = 0; partitions.iterate (i, &into); ++i) + if (partition_reduction_p (into)) + break; + for (i = i + 1; partitions.iterate (i, &partition); ++i) + if (partition_reduction_p (partition)) + { + partition_merge_into (rdg, into, partition, FUSE_REDUCTION); + partitions.unordered_remove (i); + partition_free (partition); + i--; + } + + /* Apply our simple cost model - fuse partitions with similar + memory accesses. 
*/ + for (i = 0; partitions.iterate (i, &into); ++i) + { + bool changed = false; + if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) + continue; + for (int j = i + 1; + partitions.iterate (j, &partition); ++j) + { + if (share_memory_accesses (rdg, into, partition)) + { + partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); + partitions.unordered_remove (j); + partition_free (partition); + j--; + changed = true; + } + } + /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar + accesses when 1 and 2 have similar accesses but not 0 and 1 + then in the next iteration we will fail to consider merging + 1 into 0,2. So try again if we did any merging into 0. */ + if (changed) + i--; + } + + /* Put a non-builtin partition last if we need to preserve a reduction. + ??? This is a workaround that makes sort_partitions_by_post_order do + the correct thing while in reality it should sort each component + separately and then put the component with a reduction or a non-builtin + last. */ + if (reduction_in_all + && partition_builtin_p (partitions.last())) + FOR_EACH_VEC_ELT (partitions, i, partition) + if (!partition_builtin_p (partition)) + { + partitions.unordered_remove (i); + partitions.quick_push (partition); + break; + } + + /* Build the partition dependency graph and fuse partitions in strong + connected component. */ + if (partitions.length () > 1) + { + /* Don't support loop nest distribution under runtime alias check + since it's not likely to enable many vectorization opportunities. + Also if loop has any data reference which may be not addressable + since alias check needs to take, compare address of the object. */ + if (loop->inner || has_nonaddressable_dataref_p) + merge_dep_scc_partitions (rdg, &partitions, false); + else + { + merge_dep_scc_partitions (rdg, &partitions, true); + if (partitions.length () > 1) + break_alias_scc_partitions (rdg, &partitions, &alias_ddrs); + } + } + + finalize_partitions (loop, &partitions, &alias_ddrs); + + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. 
*/ + if (reduction_in_all) + { + partition = partitions.last (); + if (only_patterns_p + && partition_builtin_p (partition) + && !partition_builtin_p (partitions[0])) + { + nbp = 0; + goto ldist_done; + } + partition->kind = PKIND_NORMAL; + } + + nbp = partitions.length (); + if (nbp == 0 + || (nbp == 1 && !partition_builtin_p (partitions[0])) + || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) + { + nbp = 0; + goto ldist_done; + } + + if (version_for_distribution_p (&partitions, &alias_ddrs)) + version_loop_by_alias_check (&partitions, loop, &alias_ddrs); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "distribute loop <%d> into partitions:\n", loop->num); + dump_rdg_partitions (dump_file, partitions); + } + + FOR_EACH_VEC_ELT (partitions, i, partition) + { + if (partition_builtin_p (partition)) + (*nb_calls)++; + *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1); + } + + ldist_done: + loop_nest.release (); + free_data_refs (datarefs_vec); + for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin (); + iter != ddrs_table->end (); ++iter) + { + free_dependence_relation (*iter); + *iter = NULL; + } + delete ddrs_table; + + FOR_EACH_VEC_ELT (partitions, i, partition) + partition_free (partition); + + free_rdg (rdg); + return nbp - *nb_calls; +} + + +void loop_distribution::bb_top_order_init (void) +{ + int rpo_num; + int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS); + edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + bitmap exit_bbs = BITMAP_ALLOC (NULL); + + bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); + bb_top_order_index_size = last_basic_block_for_fn (cfun); + + entry->flags &= ~EDGE_DFS_BACK; + bitmap_set_bit (exit_bbs, EXIT_BLOCK); + rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true, + rpo, NULL); + BITMAP_FREE (exit_bbs); + + for (int i = 0; i < rpo_num; i++) + bb_top_order_index[rpo[i]] = i; + + free (rpo); +} + +void loop_distribution::bb_top_order_destroy () +{ + free (bb_top_order_index); + bb_top_order_index = NULL; + bb_top_order_index_size = 0; +} + + +/* Given LOOP, this function records seed statements for distribution in + WORK_LIST. Return false if there is nothing for distribution. */ + +static bool +find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list) +{ + basic_block *bbs = get_loop_body_in_dom_order (loop); + + /* Initialize the worklist with stmts we seed the partitions with. */ + for (unsigned i = 0; i < loop->num_nodes; ++i) + { + /* In irreducible sub-regions we don't know how to redirect + conditions, so fail. See PR100492. */ + if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop %d contains an irreducible region.\n", + loop->num); + work_list->truncate (0); + break; + } + for (gphi_iterator gsi = gsi_start_phis (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) + continue; + work_list->safe_push (phi); + } + for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + /* Ignore clobbers, they do not have true side effects. 
*/ + if (gimple_clobber_p (stmt)) + continue; + + /* If there is a stmt with side-effects bail out - we + cannot and should not distribute this loop. */ + if (gimple_has_side_effects (stmt)) + { + free (bbs); + return false; + } + + /* Distribute stmts which have defs that are used outside of + the loop. */ + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + ; + /* Otherwise only distribute stores for now. */ + else if (!gimple_vdef (stmt)) + continue; + + work_list->safe_push (stmt); + } + } + free (bbs); + return work_list->length () > 0; +} + +/* A helper function for generate_{rawmemchr,strlen}_builtin functions in order + to place new statements SEQ before LOOP and replace the old reduction + variable with the new one. */ + +static void +generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq, + tree reduction_var_old, tree reduction_var_new, + const char *info, machine_mode load_mode) +{ + /* Place new statements before LOOP. */ + gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src); + gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING); + + /* Replace old reduction variable with new one. */ + imm_use_iterator iter; + gimple *stmt; + use_operand_p use_p; + FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old) + { + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, reduction_var_new); + + update_stmt (stmt); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, info, GET_MODE_NAME (load_mode)); +} + +/* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is + replaced with a fresh SSA name representing the result of the call. */ + +static void +generate_rawmemchr_builtin (loop_p loop, tree reduction_var, + data_reference_p store_dr, tree base, tree pattern, + location_t loc) +{ + gimple_seq seq = NULL; + + tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern); + tree reduction_var_new = copy_ssa_name (reduction_var); + gimple_call_set_lhs (fn_call, reduction_var_new); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + if (store_dr) + { + gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new); + gimple_seq_add_stmt (&seq, g); + } + + generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new, + "generated rawmemchr%s\n", + TYPE_MODE (TREE_TYPE (TREE_TYPE (base)))); +} + +/* Helper function for generate_strlen_builtin(,_using_rawmemchr) */ + +static void +generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq, + tree reduction_var_old, tree reduction_var_new, + machine_mode mode, tree start_len) +{ + /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be + converted if types of old and new reduction variable are not compatible. */ + reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old), + reduction_var_new); + + /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start + length. */ + if (!integer_zerop (start_len)) + { + tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new)); + gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new, + start_len); + gimple_seq_add_stmt (&seq, g); + reduction_var_new = lhs; + } + + generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new, + "generated strlen%s\n", mode); +} + +/* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is + replaced with a fresh SSA name representing the result of the call. 
*/ + +static void +generate_strlen_builtin (loop_p loop, tree reduction_var, tree base, + tree start_len, location_t loc) +{ + gimple_seq seq = NULL; + + tree reduction_var_new = make_ssa_name (size_type_node); + + tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); + tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN)); + gimple *fn_call = gimple_build_call (fn, 1, mem); + gimple_call_set_lhs (fn_call, reduction_var_new); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new, + QImode, start_len); +} + +/* Generate code in order to mimic the behaviour of strlen but this time over + an array of elements with mode different than QI. REDUCTION_VAR is replaced + with a fresh SSA name representing the result, i.e., the length. */ + +static void +generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var, + tree base, tree load_type, + tree start_len, location_t loc) +{ + gimple_seq seq = NULL; + + tree start = force_gimple_operand (base, &seq, true, NULL_TREE); + tree zero = build_zero_cst (load_type); + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero); + tree end = make_ssa_name (TREE_TYPE (base)); + gimple_call_set_lhs (fn_call, end); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + /* Determine the number of elements between START and END by + evaluating (END - START) / sizeof (*START). */ + tree diff = make_ssa_name (ptrdiff_type_node); + gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start); + gimple_seq_add_stmt (&seq, diff_stmt); + /* Let SIZE be the size of each character. */ + tree size = gimple_convert (&seq, ptrdiff_type_node, + TYPE_SIZE_UNIT (load_type)); + tree count = make_ssa_name (ptrdiff_type_node); + gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size); + gimple_seq_add_stmt (&seq, count_stmt); + + generate_strlen_builtin_1 (loop, seq, reduction_var, count, + TYPE_MODE (load_type), + start_len); +} + +/* Return true if we can count at least as many characters by taking pointer + difference as we can count via reduction_var without an overflow. Thus + compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type), + m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */ +static bool +reduction_var_overflows_first (tree reduction_var_type, tree load_type) +{ + widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));; + widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1); + widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type)); + return wi::ltu_p (n2, wi::udiv_trunc (m2, s)); +} + +static gimple * +determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs) +{ + gimple *reduction_stmt = NULL; + + for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i) + { + basic_block bb = bbs[i]; + + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); + gsi_next_nondebug (&bsi)) + { + gphi *phi = bsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + if (stmt_has_scalar_dependences_outside_loop (loop, phi)) + { + if (reduction_stmt) + return NULL; + reduction_stmt = phi; + } + } + + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); + gsi_next_nondebug (&bsi), ++ninsns) + { + /* Bail out early for loops which are unlikely to match. 
*/ + if (ninsns > 16) + return NULL; + gimple *stmt = gsi_stmt (bsi); + if (gimple_clobber_p (stmt)) + continue; + if (gimple_code (stmt) == GIMPLE_LABEL) + continue; + if (gimple_has_volatile_ops (stmt)) + return NULL; + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + { + if (reduction_stmt) + return NULL; + reduction_stmt = stmt; + } + } + } + + return reduction_stmt; +} + +/* If LOOP has a single non-volatile reduction statement, then return a pointer + to it. Otherwise return NULL. */ +static gimple * +determine_reduction_stmt (const loop_p loop) +{ + basic_block *bbs = get_loop_body (loop); + gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs); + XDELETEVEC (bbs); + return reduction_stmt; +} + +/* Transform loops which mimic the effects of builtins rawmemchr or strlen and + replace them accordingly. For example, a loop of the form + + for (; *p != 42; ++p); + + is replaced by + + p = rawmemchr<MODE> (p, 42); + + under the assumption that rawmemchr is available for a particular MODE. + Another example is + + int i; + for (i = 42; s[i]; ++i); + + which is replaced by + + i = (int)strlen (&s[42]) + 42; + + for some character array S. In case array S is not of type character array + we end up with + + i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42; + + assuming that rawmemchr is available for a particular MODE. */ + +bool +loop_distribution::transform_reduction_loop (loop_p loop) +{ + gimple *reduction_stmt; + data_reference_p load_dr = NULL, store_dr = NULL; + + edge e = single_exit (loop); + gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src)); + if (!cond) + return false; + /* Ensure loop condition is an (in)equality test and loop is exited either if + the inequality test fails or the equality test succeeds. */ + if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR) + && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR)) + return false; + /* A limitation of the current implementation is that we only support + constant patterns in (in)equality tests. */ + tree pattern = gimple_cond_rhs (cond); + if (TREE_CODE (pattern) != INTEGER_CST) + return false; + + reduction_stmt = determine_reduction_stmt (loop); + + /* A limitation of the current implementation is that we require a reduction + statement. Therefore, loops without a reduction statement as in the + following are not recognized: + int *p; + void foo (void) { for (; *p; ++p); } */ + if (reduction_stmt == NULL) + return false; + + /* Reduction variables are guaranteed to be SSA names. */ + tree reduction_var; + switch (gimple_code (reduction_stmt)) + { + case GIMPLE_ASSIGN: + case GIMPLE_PHI: + reduction_var = gimple_get_lhs (reduction_stmt); + break; + default: + /* Bail out e.g. for GIMPLE_CALL. */ + return false; + } + + struct graph *rdg = build_rdg (loop, NULL); + if (rdg == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop %d not transformed: failed to build the RDG.\n", + loop->num); + + return false; + } + auto_bitmap partition_stmts; + bitmap_set_range (partition_stmts, 0, rdg->n_vertices); + find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr); + free_rdg (rdg); + + /* Bail out if there is no single load. */ + if (load_dr == NULL) + return false; + + /* Reaching this point we have a loop with a single reduction variable, + a single load, and an optional single store. 
*/ + + tree load_ref = DR_REF (load_dr); + tree load_type = TREE_TYPE (load_ref); + tree load_access_base = build_fold_addr_expr (load_ref); + tree load_access_size = TYPE_SIZE_UNIT (load_type); + affine_iv load_iv, reduction_iv; + + if (!INTEGRAL_TYPE_P (load_type) + || !type_has_mode_precision_p (load_type)) + return false; + + /* We already ensured that the loop condition tests for (in)equality where the + rhs is a constant pattern. Now ensure that the lhs is the result of the + load. */ + if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr))) + return false; + + /* Bail out if no affine induction variable with constant step can be + determined. */ + if (!simple_iv (loop, loop, load_access_base, &load_iv, false)) + return false; + + /* Bail out if memory accesses are not consecutive or not growing. */ + if (!operand_equal_p (load_iv.step, load_access_size, 0)) + return false; + + if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false)) + return false; + + /* Handle rawmemchr like loops. */ + if (operand_equal_p (load_iv.base, reduction_iv.base) + && operand_equal_p (load_iv.step, reduction_iv.step)) + { + if (store_dr) + { + /* Ensure that we store to X and load from X+I where I>0. */ + if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR + || !integer_onep (TREE_OPERAND (load_iv.base, 1))) + return false; + tree ptr_base = TREE_OPERAND (load_iv.base, 0); + if (TREE_CODE (ptr_base) != SSA_NAME) + return false; + gimple *def = SSA_NAME_DEF_STMT (ptr_base); + if (!gimple_assign_single_p (def) + || gimple_assign_rhs1 (def) != DR_REF (store_dr)) + return false; + /* Ensure that the reduction value is stored. */ + if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var) + return false; + } + /* Bail out if target does not provide rawmemchr for a certain mode. */ + machine_mode mode = TYPE_MODE (load_type); + if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing) + return false; + location_t loc = gimple_location (DR_STMT (load_dr)); + generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base, + pattern, loc); + return true; + } + + /* Handle strlen like loops. */ + if (store_dr == NULL + && integer_zerop (pattern) + && TREE_CODE (reduction_iv.base) == INTEGER_CST + && TREE_CODE (reduction_iv.step) == INTEGER_CST + && integer_onep (reduction_iv.step)) + { + location_t loc = gimple_location (DR_STMT (load_dr)); + tree reduction_var_type = TREE_TYPE (reduction_var); + /* While determining the length of a string an overflow might occur. + If an overflow only occurs in the loop implementation and not in the + strlen implementation, then either the overflow is undefined or the + truncated result of strlen equals the one of the loop. Otherwise if + an overflow may also occur in the strlen implementation, then + replacing a loop by a call to strlen is sound whenever we ensure that + if an overflow occurs in the strlen implementation, then also an + overflow occurs in the loop implementation which is undefined. It + seems reasonable to relax this and assume that the strlen + implementation cannot overflow in case sizetype is big enough in the + sense that an overflow can only happen for string objects which are + bigger than half of the address space; at least for 32-bit targets and + up. + + For strlen which makes use of rawmemchr the maximal length of a string + which can be determined without an overflow is PTRDIFF_MAX / S where + each character has size S. 
Since an overflow for ptrdiff type is + undefined we have to make sure that if an overflow occurs, then an + overflow occurs in the loop implementation, too, and this is + undefined, too. Similar as before we relax this and assume that no + string object is larger than half of the address space; at least for + 32-bit targets and up. */ + if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node) + && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node) + && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1 + && TYPE_PRECISION (ptr_type_node) >= 32) + || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type) + && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype))) + && builtin_decl_implicit (BUILT_IN_STRLEN)) + generate_strlen_builtin (loop, reduction_var, load_iv.base, + reduction_iv.base, loc); + else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type)) + != CODE_FOR_nothing + && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node) + && TYPE_PRECISION (ptrdiff_type_node) >= 32) + || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type) + && reduction_var_overflows_first (reduction_var_type, load_type)))) + generate_strlen_builtin_using_rawmemchr (loop, reduction_var, + load_iv.base, + load_type, + reduction_iv.base, loc); + else + return false; + return true; + } + + return false; +} + +/* Given innermost LOOP, return the outermost enclosing loop that forms a + perfect loop nest. */ + +static class loop * +prepare_perfect_loop_nest (class loop *loop) +{ + class loop *outer = loop_outer (loop); + tree niters = number_of_latch_executions (loop); + + /* TODO: We only support the innermost 3-level loop nest distribution + because of compilation time issue for now. This should be relaxed + in the future. Note we only allow 3-level loop nest distribution + when parallelizing loops. */ + while ((loop->inner == NULL + || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) + && loop_outer (outer) + && outer->inner == loop && loop->next == NULL + && single_exit (outer) + && !chrec_contains_symbols_defined_in_loop (niters, outer->num) + && (niters = number_of_latch_executions (outer)) != NULL_TREE + && niters != chrec_dont_know) + { + loop = outer; + outer = loop_outer (loop); + } + + return loop; +} + + +unsigned int +loop_distribution::execute (function *fun) +{ + bool changed = false; + basic_block bb; + control_dependences *cd = NULL; + auto_vec<loop_p> loops_to_be_destroyed; + + if (number_of_loops (fun) <= 1) + return 0; + + bb_top_order_init (); + + FOR_ALL_BB_FN (bb, fun) + { + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); + } + + /* We can at the moment only distribute non-nested loops, thus restrict + walking to innermost loops. */ + for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) + { + /* Don't distribute multiple exit edges loop, or cold loop when + not doing pattern detection. */ + if (!single_exit (loop) + || (!flag_tree_loop_distribute_patterns + && !optimize_loop_for_speed_p (loop))) + continue; + + /* If niters is unknown don't distribute loop but rather try to transform + it to a call to a builtin. 
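+ A typical candidate is a rawmemchr-like loop such as
+
+   for (; *p != 42; ++p);
+
+ whose number of iterations cannot be computed at compile time.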
*/ + tree niters = number_of_latch_executions (loop); + if (niters == NULL_TREE || niters == chrec_dont_know) + { + datarefs_vec.create (20); + if (transform_reduction_loop (loop)) + { + changed = true; + loops_to_be_destroyed.safe_push (loop); + if (dump_enabled_p ()) + { + dump_user_location_t loc = find_loop_location (loop); + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, + loc, "Loop %d transformed into a builtin.\n", + loop->num); + } + } + free_data_refs (datarefs_vec); + continue; + } + + /* Get the perfect loop nest for distribution. */ + loop = prepare_perfect_loop_nest (loop); + for (; loop; loop = loop->inner) + { + auto_vec<gimple *> work_list; + if (!find_seed_stmts_for_distribution (loop, &work_list)) + break; + + const char *str = loop->inner ? " nest" : ""; + dump_user_location_t loc = find_loop_location (loop); + if (!cd) + { + calculate_dominance_info (CDI_DOMINATORS); + calculate_dominance_info (CDI_POST_DOMINATORS); + cd = new control_dependences (); + free_dominance_info (CDI_POST_DOMINATORS); + } + + bool destroy_p; + int nb_generated_loops, nb_generated_calls; + nb_generated_loops + = distribute_loop (loop, work_list, cd, &nb_generated_calls, + &destroy_p, (!optimize_loop_for_speed_p (loop) + || !flag_tree_loop_distribution)); + if (destroy_p) + loops_to_be_destroyed.safe_push (loop); + + if (nb_generated_loops + nb_generated_calls > 0) + { + changed = true; + if (dump_enabled_p ()) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, + loc, "Loop%s %d distributed: split to %d loops " + "and %d library calls.\n", str, loop->num, + nb_generated_loops, nb_generated_calls); + + break; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); + } + } + + if (cd) + delete cd; + + if (bb_top_order_index != NULL) + bb_top_order_destroy (); + + if (changed) + { + /* Destroy loop bodies that could not be reused. Do this late as we + otherwise can end up refering to stale data in control dependences. */ + unsigned i; + class loop *loop; + FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) + destroy_loop (loop); + + /* Cached scalar evolutions now may refer to wrong or non-existing + loops. */ + scev_reset_htab (); + mark_virtual_operands_for_renaming (fun); + rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); + } + + checking_verify_loop_structure (); + + return changed ? TODO_cleanup_cfg : 0; +} + + +/* Distribute all loops in the current function. */ + +namespace { + +const pass_data pass_data_loop_distribution = +{ + GIMPLE_PASS, /* type */ + "ldist", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_TREE_LOOP_DISTRIBUTION, /* tv_id */ + ( PROP_cfg | PROP_ssa ), /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_loop_distribution : public gimple_opt_pass +{ +public: + pass_loop_distribution (gcc::context *ctxt) + : gimple_opt_pass (pass_data_loop_distribution, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return flag_tree_loop_distribution + || flag_tree_loop_distribute_patterns; + } + + virtual unsigned int execute (function *); + +}; // class pass_loop_distribution + +unsigned int +pass_loop_distribution::execute (function *fun) +{ + return loop_distribution ().execute (fun); +} + +} // anon namespace + +gimple_opt_pass * +make_pass_loop_distribution (gcc::context *ctxt) +{ + return new pass_loop_distribution (ctxt); +} |