/* OpenACC worker partitioning via middle end neutering/broadcasting scheme

   Copyright (C) 2015-2021 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "pretty-print.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "tree-inline.h"
#include "langhooks.h"
#include "omp-general.h"
#include "omp-low.h"
#include "gimple-pretty-print.h"
#include "cfghooks.h"
#include "insn-config.h"
#include "recog.h"
#include "internal-fn.h"
#include "bitmap.h"
#include "tree-nested.h"
#include "stor-layout.h"
#include "tree-ssa-threadupdate.h"
#include "tree-into-ssa.h"
#include "splay-tree.h"
#include "target.h"
#include "cfgloop.h"
#include "tree-cfg.h"
#include "omp-offload.h"
#include "attribs.h"

/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* Adapted from 'gcc/config/nvptx/nvptx.c:struct parallel'.  */

struct parallel_g
{
  /* Parent parallel.  */
  parallel_g *parent;

  /* Next sibling parallel.  */
  parallel_g *next;

  /* First child parallel.  */
  parallel_g *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first block
     in the parallel and the join is the first block after the
     partition.  */
  basic_block forked_block;
  basic_block join_block;

  gimple *forked_stmt;
  gimple *join_stmt;

  gimple *fork_stmt;
  gimple *joining_stmt;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

  tree record_type;
  tree sender_decl;
  tree receiver_decl;

public:
  parallel_g (parallel_g *parent, unsigned mode);
  ~parallel_g ();
};

/* Constructor links the new parallel into its parent's chain of
   children.  */

parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
  : parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
  forked_block = join_block = 0;
  forked_stmt = join_stmt = NULL;
  fork_stmt = joining_stmt = NULL;

  record_type = NULL_TREE;
  sender_decl = NULL_TREE;
  receiver_decl = NULL_TREE;

  if (parent)
    {
      next = parent->inner;
      parent->inner = this;
    }
}

parallel_g::~parallel_g ()
{
  delete inner;
  delete next;
}

static bool
local_var_based_p (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      return !is_global_var (decl);
    case COMPONENT_REF:
    case BIT_FIELD_REF:
    case ARRAY_REF:
      return local_var_based_p (TREE_OPERAND (decl, 0));
    default:
      return false;
    }
}

/* Map of basic blocks to gimple stmts.  */

typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
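/* Overview, as an illustrative sketch only (the precise GIMPLE is built by
   worker_single_simple and worker_single_copy further down; the names
   "sender" and "x" below are just placeholders).  For a block that must run
   in worker-single mode but that computes values needed by
   worker-partitioned code, the pass generates approximately:

     if (GOACC_single_copy_start (&sender) == NULL)  // one active worker
       {
	 ... original work ...
	 sender.x = x;			// fill broadcast record
	 GOACC_single_copy_end (&sender);
       }
     GOACC_barrier ();
     ... idle workers read x back from the broadcast record ...

   Blocks with no such definitions get the simpler GOACC_single_start /
   GOACC_barrier form shown in the diagram before worker_single_simple.  */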
/* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
   the routine likely contains partitioned loops (else will do its own
   neutering and variable propagation).  Return TRUE if a function call CALL
   should be made in (worker) single mode, rather than redundant mode.  */

static bool
omp_sese_active_worker_call (gcall *call)
{
#define GOMP_DIM_SEQ GOMP_DIM_MAX
  tree fndecl = gimple_call_fndecl (call);

  if (!fndecl)
    return true;

  tree attrs = oacc_get_fn_attrib (fndecl);

  if (!attrs)
    return true;

  int level = oacc_fn_attrib_level (attrs);

  /* Neither regular functions nor "seq" routines should be run by all
     threads in worker-single mode.  */
  return level == -1 || level == GOMP_DIM_SEQ;
#undef GOMP_DIM_SEQ
}

/* Split basic blocks so that each forked and join unspec is at the start of
   its basic block.  Afterwards each block will thus have a single
   partitioning mode.  We also do the same for return statements, as they
   are executed by every thread.  Populate MAP with head and tail blocks.
   We also clear the BB visited flag, which is used when finding
   partitions.  */
/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_split_blocks'.  */

static void
omp_sese_split_blocks (bb_stmt_map_t *map)
{
  auto_vec<gimple *> worklist;
  basic_block block;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      /* Clear visited flag, for use by parallel locator.  */
      block->flags &= ~BB_VISITED;

      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	      if (k == IFN_UNIQUE_OACC_JOIN)
		worklist.safe_push (stmt);
	      else if (k == IFN_UNIQUE_OACC_FORK)
		{
		  gcc_assert (gsi_one_before_end_p (gsi));
		  basic_block forked_block = single_succ (block);
		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);

		  /* We push a NOP as a placeholder for the "forked" stmt.
		     This is then recognized in omp_sese_find_par.  */
		  gimple *nop = gimple_build_nop ();
		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
		  worklist.safe_push (nop);
		}
	    }
	  else if (gimple_code (stmt) == GIMPLE_RETURN
		   || gimple_code (stmt) == GIMPLE_COND
		   || gimple_code (stmt) == GIMPLE_SWITCH
		   || (gimple_code (stmt) == GIMPLE_CALL
		       && !gimple_call_internal_p (stmt)
		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
	    worklist.safe_push (stmt);
	  else if (is_gimple_assign (stmt))
	    {
	      tree lhs = gimple_assign_lhs (stmt);

	      /* Force assignments to components/fields/elements of local
		 aggregates into fully-partitioned (redundant) mode.  This
		 avoids having to broadcast the whole aggregate.  The RHS of
		 the assignment will be propagated using the normal
		 mechanism.  */

	      switch (TREE_CODE (lhs))
		{
		case COMPONENT_REF:
		case BIT_FIELD_REF:
		case ARRAY_REF:
		  {
		    tree aggr = TREE_OPERAND (lhs, 0);

		    if (local_var_based_p (aggr))
		      worklist.safe_push (stmt);
		  }
		  break;
		default:
		  ;
		}
	    }
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  gimple *stmt;

  for (ix = 0; worklist.iterate (ix, &stmt); ix++)
    {
      basic_block block = gimple_bb (stmt);

      if (gimple_code (stmt) == GIMPLE_COND)
	{
	  gcond *orig_cond = as_a <gcond *> (stmt);
	  tree_code code = gimple_expr_code (orig_cond);
	  tree pred = make_ssa_name (boolean_type_node);
	  gimple *asgn = gimple_build_assign (pred, code,
					      gimple_cond_lhs (orig_cond),
					      gimple_cond_rhs (orig_cond));
	  gcond *new_cond
	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
				 gimple_cond_true_label (orig_cond),
				 gimple_cond_false_label (orig_cond));

	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
	  gsi_replace (&gsi, new_cond, true);

	  edge e = split_block (block, asgn);
	  block = e->dest;
	  map->get_or_insert (block) = new_cond;
	}
      else if ((gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt))
	       || is_gimple_assign (stmt))
	{
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  edge call = split_block (block, gsi_stmt (gsi));

	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));

	  edge call_to_ret = split_block (call->dest, call_stmt);

	  map->get_or_insert (call_to_ret->src) = call_stmt;
	}
      else
	{
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  if (gsi_end_p (gsi))
	    map->get_or_insert (block) = stmt;
	  else
	    {
	      /* Split block before insn.  The insn is in the new block.  */
	      edge e = split_block (block, gsi_stmt (gsi));

	      block = e->dest;
	      map->get_or_insert (block) = stmt;
	    }
	}
    }
}

static const char *
mask_name (unsigned mask)
{
  switch (mask)
    {
    case 0: return "gang redundant";
    case 1: return "gang partitioned";
    case 2: return "worker partitioned";
    case 3: return "gang+worker partitioned";
    case 4: return "vector partitioned";
    case 5: return "gang+vector partitioned";
    case 6: return "worker+vector partitioned";
    case 7: return "fully partitioned";
    default: return "";
    }
}

/* Dump this parallel and all its inner parallels.  */
/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_dump_pars'.  */

static void
omp_sese_dump_pars (parallel_g *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
	   depth, par->mask, mask_name (par->mask),
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, "    blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");

  if (par->inner)
    omp_sese_dump_pars (par->inner, depth + 1);

  if (par->next)
    omp_sese_dump_pars (par->next, depth);
}
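/* For orientation, an illustrative example only: a routine body containing a
   single worker-partitioned loop would typically yield a nesting like

     NULL parallel (mask 0, the whole function)
       `- parallel with mask GOMP_DIM_MASK (GOMP_DIM_WORKER)
	    (the blocks between the OACC_FORK and OACC_JOIN markers)

   omp_sese_find_par below builds this structure from the IFN_UNIQUE
   fork/join markers left by earlier OpenACC lowering.  */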
/* If BLOCK contains a fork/join marker, process it to create or terminate a
   loop structure.  Add this block to the current loop, and then walk
   successor blocks.  */
/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_find_par'.  */

static parallel_g *
omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (gimple **stmtp = map->get (block))
    {
      gimple *stmt = *stmtp;

      if (gimple_code (stmt) == GIMPLE_COND
	  || gimple_code (stmt) == GIMPLE_SWITCH
	  || gimple_code (stmt) == GIMPLE_RETURN
	  || (gimple_code (stmt) == GIMPLE_CALL
	      && !gimple_call_internal_p (stmt))
	  || is_gimple_assign (stmt))
	{
	  /* A single block that is forced to be at the maximum partition
	     level.  Make a singleton par for it.  */
	  par = new parallel_g (par,
				GOMP_DIM_MASK (GOMP_DIM_GANG)
				| GOMP_DIM_MASK (GOMP_DIM_WORKER)
				| GOMP_DIM_MASK (GOMP_DIM_VECTOR));
	  par->forked_block = block;
	  par->forked_stmt = stmt;
	  par->blocks.safe_push (block);
	  par = par->parent;
	  goto walk_successors;
	}
      else if (gimple_nop_p (stmt))
	{
	  basic_block pred = single_pred (block);
	  gcc_assert (pred);
	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
	  gimple *final_stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
	    {
	      gcall *call = as_a <gcall *> (final_stmt);
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      if (k == IFN_UNIQUE_OACC_FORK)
		{
		  HOST_WIDE_INT dim
		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

		  par = new parallel_g (par, mask);
		  par->forked_block = block;
		  par->forked_stmt = final_stmt;
		  par->fork_stmt = stmt;
		}
	      else
		gcc_unreachable ();
	    }
	  else
	    gcc_unreachable ();
	}
      else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  gcall *call = as_a <gcall *> (stmt);
	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
	  if (k == IFN_UNIQUE_OACC_JOIN)
	    {
	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

	      gcc_assert (par->mask == mask);
	      par->join_block = block;
	      par->join_stmt = stmt;
	      par = par->parent;
	    }
	  else
	    gcc_unreachable ();
	}
      else
	gcc_unreachable ();
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel_g (0, 0);

walk_successors:
  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    omp_sese_find_par (map, par, e->dest);

  return par;
}

/* DFS walk the CFG looking for fork & join markers.  Construct loop
   structures as we go.  MAP is a mapping of basic blocks to head & tail
   markers, discovered when splitting blocks.  This speeds up the
   discovery.  We rely on the BB visited flag having been cleared when
   splitting blocks.  */
/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_discover_pars'.  */

static parallel_g *
omp_sese_discover_pars (bb_stmt_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  parallel_g *par = omp_sese_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      omp_sese_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}

static void
populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
			      bitmap vector_single, unsigned outer_mask,
			      int depth)
{
  unsigned mask = outer_mask | par->mask;

  basic_block block;

  for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
    {
      if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
	bitmap_set_bit (worker_single, block->index);

      if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
	bitmap_set_bit (vector_single, block->index);
    }

  if (par->inner)
    populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
				  mask, depth + 1);
  if (par->next)
    populate_single_mode_bitmaps (par->next, worker_single, vector_single,
				  outer_mask, depth);
}

/* A map from SSA names or var decls to record fields.  */

typedef hash_map<tree, tree> field_map_t;

/* For each propagation record type, this is a map from SSA names or var
   decls to propagate, to the field in the record type that should be used
   for transmission and reception.  */
typedef hash_map<tree, field_map_t *> record_field_map_t;

static void
install_var_field (tree var, tree record_type, field_map_t *fields)
{
  tree name;
  char tmp[20];

  if (TREE_CODE (var) == SSA_NAME)
    {
      name = SSA_NAME_IDENTIFIER (var);
      if (!name)
	{
	  sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
	  name = get_identifier (tmp);
	}
    }
  else if (TREE_CODE (var) == VAR_DECL)
    {
      name = DECL_NAME (var);
      if (!name)
	{
	  sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
	  name = get_identifier (tmp);
	}
    }
  else
    gcc_unreachable ();

  gcc_assert (!fields->get (var));

  tree type = TREE_TYPE (var);

  if (POINTER_TYPE_P (type)
      && TYPE_RESTRICT (type))
    type = build_qualified_type (type,
				 TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);

  tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);

  if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
    {
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
    }
  else
    SET_DECL_ALIGN (field, TYPE_ALIGN (type));

  fields->put (var, field);

  insert_field_into_struct (record_type, field);
}

/* Sets of SSA_NAMES or VAR_DECLs to propagate.  */
typedef hash_set<tree> propagation_set;

static void
find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
			     bitmap worker_single, bitmap vector_single,
			     vec<propagation_set *> *prop_set)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_ssa_names_to_propagate (par->inner, mask, worker_single,
				 vector_single, prop_set);
  if (par->next)
    find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
				 vector_single, prop_set);

  if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	{
	  for (gphi_iterator psi = gsi_start_phis (block);
	       !gsi_end_p (psi); gsi_next (&psi))
	    {
	      gphi *phi = psi.phi ();
	      use_operand_p use;
	      ssa_op_iter iter;

	      FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
		{
		  tree var = USE_FROM_PTR (use);

		  if (TREE_CODE (var) != SSA_NAME)
		    continue;

		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

		  if (gimple_nop_p (def_stmt))
		    continue;

		  basic_block def_bb = gimple_bb (def_stmt);

		  if (bitmap_bit_p (worker_single, def_bb->index))
		    {
		      if (!(*prop_set)[def_bb->index])
			(*prop_set)[def_bb->index] = new propagation_set;

		      propagation_set *ws_prop = (*prop_set)[def_bb->index];

		      ws_prop->add (var);
		    }
		}
	    }

	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      use_operand_p use;
	      ssa_op_iter iter;
	      gimple *stmt = gsi_stmt (gsi);

	      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
		{
		  tree var = USE_FROM_PTR (use);

		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

		  if (gimple_nop_p (def_stmt))
		    continue;

		  basic_block def_bb = gimple_bb (def_stmt);

		  if (bitmap_bit_p (worker_single, def_bb->index))
		    {
		      if (!(*prop_set)[def_bb->index])
			(*prop_set)[def_bb->index] = new propagation_set;

		      propagation_set *ws_prop = (*prop_set)[def_bb->index];

		      ws_prop->add (var);
		    }
		}
	    }
	}
    }
}
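/* Example (illustrative only; names are placeholders): given

     worker-single block B:	  x_1 = a_2 * 2;
     worker-partitioned block:	  ... = x_1 + 1;

   the routine above records x_1 in PROP_SET[B->index], because its
   definition in a worker-single block is consumed by worker-partitioned
   code and must therefore be broadcast to the other workers.  */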
/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
   statement.  */

static tree
find_partitioned_var_uses_1 (tree *node, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;

  if (!wi->is_lhs && VAR_P (*node))
    partitioned_var_uses->add (*node);

  return NULL_TREE;
}

static void
find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
			   hash_set<tree> *partitioned_var_uses)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
  if (par->next)
    find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);

  if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  {
	    walk_stmt_info wi;
	    memset (&wi, 0, sizeof (wi));
	    wi.info = (void *) partitioned_var_uses;
	    walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
	  }
    }
}

/* Gang-private variables (typically placed in a GPU's shared memory) do not
   need to be processed by the worker-propagation mechanism.  Populate the
   GANG_PRIVATE_VARS set with any such variables found in the current
   function.  */

static void
find_gang_private_vars (hash_set<tree> *gang_private_vars)
{
  basic_block block;

  FOR_EACH_BB_FN (block, cfun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
	      if (k == IFN_UNIQUE_OACC_PRIVATE)
		{
		  HOST_WIDE_INT level
		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
		  if (level != GOMP_DIM_GANG)
		    continue;
		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
		    {
		      tree arg = gimple_call_arg (stmt, i);
		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
		      tree decl = TREE_OPERAND (arg, 0);
		      gang_private_vars->add (decl);
		    }
		}
	    }
	}
    }
}

static void
find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
			      hash_set<tree> *partitioned_var_uses,
			      hash_set<tree> *gang_private_vars,
			      vec<propagation_set *> *prop_set)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
				  gang_private_vars, prop_set);
  if (par->next)
    find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
				  gang_private_vars, prop_set);

  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	{
	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      tree var;
	      unsigned i;

	      FOR_EACH_LOCAL_DECL (cfun, i, var)
		{
		  if (!VAR_P (var)
		      || is_global_var (var)
		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
		      || !partitioned_var_uses->contains (var)
		      || gang_private_vars->contains (var))
		    continue;

		  if (stmt_may_clobber_ref_p (stmt, var))
		    {
		      if (dump_file)
			{
			  fprintf (dump_file, "bb %u: local variable may be "
				   "clobbered in %s mode: ", block->index,
				   mask_name (mask));
			  print_generic_expr (dump_file, var, TDF_SLIM);
			  fprintf (dump_file, "\n");
			}

		      if (!(*prop_set)[block->index])
			(*prop_set)[block->index] = new propagation_set;

		      propagation_set *ws_prop = (*prop_set)[block->index];

		      ws_prop->add (var);
		    }
		}
	    }
	}
    }
}
/* Transform basic blocks FROM, TO (which may be the same block) into:
   if (GOACC_single_start ())
     BLOCK;
   GOACC_barrier ();
			      \  |  /
			      +----+
			      |    |        (new) predicate block
			      +----+--
   \  |  /   \  |  /            |t   \
   +----+    +----+	      +----+  |
   |    |    |    |   ===>    |    |  | f   (old) from block
   +----+    +----+	      +----+  |
     |       t/  \f             |    /
			      +----+/
  (split     (split before    |    |        skip block
   at end)    condition)      +----+
			      t/  \f  */

static void
worker_single_simple (basic_block from, basic_block to,
		      hash_set<tree> *def_escapes_block)
{
  gimple *call, *cond;
  tree lhs, decl;
  basic_block skip_block;

  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    {
      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
      gsi_prev (&gsi);
    }
  edge e = split_block (to, gsi_stmt (gsi));
  skip_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
  lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
  call = gimple_build_call (decl, 0);
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  cond = gimple_build_cond (EQ_EXPR, lhs,
			    fold_convert_loc (UNKNOWN_LOCATION,
					      TREE_TYPE (lhs),
					      boolean_true_node),
			    NULL_TREE, NULL_TREE);
  gsi_insert_after (&start, cond, GSI_NEW_STMT);
  update_stmt (cond);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  basic_block neutered = split_edge (ef);
  gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);

  for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      ssa_op_iter iter;
      tree var;

      FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
	{
	  if (def_escapes_block->contains (var))
	    {
	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
	      create_new_def_for (var, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);

	      tree neutered_def = copy_ssa_name (var, NULL);
	      /* We really want "don't care" or some value representing
		 undefined here, but optimizers will probably get rid of the
		 zero-assignments anyway.  */
	      gassign *zero
		= gimple_build_assign (neutered_def,
				       build_zero_cst
					 (TREE_TYPE (neutered_def)));

	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
	      update_stmt (zero);

	      add_phi_arg (join_phi, neutered_def,
			   single_succ_edge (neutered), UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }
	}
    }

  gsi = gsi_start_bb (skip_block);

  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
  gimple *acc_bar = gimple_build_call (decl, 0);

  gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
  update_stmt (acc_bar);
}
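/* Sketch of the result for one escaping definition D (illustrative only;
   D_active and D_neut are placeholder names):

     if (GOACC_single_start ())
       D_active = ...;		// original work, active worker only
     else
       D_neut = 0;		// placeholder on the neutered path
     D = PHI <D_active, D_neut>
     GOACC_barrier ();

   Idle workers thus see a dummy value; whenever the real value must be
   visible to all workers, worker_single_copy below is used instead.  */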
/* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it
   as appropriate.  */
/* Adapted from 'gcc/omp-low.c:omp_build_component_ref'.  */

static tree
oacc_build_component_ref (tree obj, tree field)
{
  tree field_type = TREE_TYPE (field);
  tree obj_type = TREE_TYPE (obj);
  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
    field_type
      = build_qualified_type (field_type,
			      KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));

  tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
  if (TREE_THIS_VOLATILE (field))
    TREE_THIS_VOLATILE (ret) |= 1;
  if (TREE_READONLY (field))
    TREE_READONLY (ret) |= 1;
  return ret;
}

static tree
build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
{
  tree x = build_simple_mem_ref (receiver_decl);
  tree field = *fields->get (var);
  TREE_THIS_NOTRAP (x) = 1;
  x = oacc_build_component_ref (x, field);
  return x;
}

static tree
build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
{
  tree field = *fields->get (var);
  return oacc_build_component_ref (sender_decl, field);
}

static int
sort_by_ssa_version_or_uid (const void *p1, const void *p2)
{
  const tree t1 = *(const tree *) p1;
  const tree t2 = *(const tree *) p2;

  if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
    return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
  else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
    return -1;
  else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
    return 1;
  else
    return DECL_UID (t1) - DECL_UID (t2);
}

static int
sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
{
  const tree t1 = *(const tree *) p1;
  const tree t2 = *(const tree *) p2;
  unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
  unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
  if (s1 != s2)
    return s2 - s1;
  else
    return sort_by_ssa_version_or_uid (p1, p2);
}
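/* Note on intent (illustration only): the comparator above orders the
   broadcast-record fields by decreasing size, with ties broken by SSA
   version or DECL_UID, so that e.g. a double, a pointer and an int end up
   in that order.  Sorting largest-first tends to reduce padding in the
   record, and the deterministic tie-break keeps the layout stable from one
   compilation to the next.  The record itself is assembled in
   execute_omp_oacc_neuter_broadcast via install_var_field / layout_type.  */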
static void
worker_single_copy (basic_block from, basic_block to,
		    hash_set<tree> *def_escapes_block,
		    hash_set<tree> *worker_partitioned_uses,
		    tree record_type, record_field_map_t *record_field_map)
{
  /* If we only have virtual defs, we'll have no record type, but we still
     want to emit single_copy_start and (particularly) single_copy_end to
     act as a vdef source on the neutered edge representing memory writes on
     the non-neutered edge.  */
  if (!record_type)
    record_type = char_type_node;

  tree sender_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, true,
						    ".oacc_worker_o");
  tree receiver_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, false,
						    ".oacc_worker_i");

  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    gsi_prev (&gsi);
  edge e = split_block (to, gsi_stmt (gsi));
  basic_block barrier_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);

  tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));

  gimple *call
    = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));

  gimple *conv
    = gimple_build_assign (conv_tmp,
			   fold_convert (TREE_TYPE (receiver_decl), lhs));
  update_stmt (conv);
  gsi_insert_after (&start, conv, GSI_NEW_STMT);
  gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);

  tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
  asgn = gimple_build_assign (recv_tmp, receiver_decl);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
				    NULL_TREE);
  update_stmt (cond);
  gsi_insert_after (&start, cond, GSI_NEW_STMT);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  basic_block body = et->dest;

  edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
  gimple *acc_bar = gimple_build_call (decl, 0);

  gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
  gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);

  cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE,
			    NULL_TREE);
  gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);

  edge et2 = split_block (barrier_block, cond);
  et2->flags &= ~EDGE_FALLTHRU;
  et2->flags |= EDGE_TRUE_VALUE;
  et2->probability = profile_probability::unlikely ();

  basic_block exit_block = et2->dest;

  basic_block copyout_block = split_edge (et2);
  edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
  ef2->probability = et2->probability.invert ();

  gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);

  edge copyout_to_exit = single_succ_edge (copyout_block);

  gimple_seq sender_seq = NULL;

  /* Make sure we iterate over definitions in a stable order.  */
  auto_vec<tree> escape_vec (def_escapes_block->elements ());
  for (hash_set<tree>::iterator it = def_escapes_block->begin ();
       it != def_escapes_block->end (); ++it)
    escape_vec.quick_push (*it);
  escape_vec.qsort (sort_by_ssa_version_or_uid);

  for (unsigned i = 0; i < escape_vec.length (); i++)
    {
      tree var = escape_vec[i];

      if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
	continue;

      tree barrier_def = 0;

      if (TREE_CODE (var) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	  if (gimple_nop_p (def_stmt))
	    continue;

	  /* The barrier phi takes one result from the actual work of the
	     block we're neutering, and the other result is constant zero of
	     the same type.  */
	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
	  barrier_def
	    = create_new_def_for (var, barrier_phi,
				  gimple_phi_result_ptr (barrier_phi));
	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
		       UNKNOWN_LOCATION);

	  update_stmt (barrier_phi);
	}
      else
	gcc_assert (TREE_CODE (var) == VAR_DECL);

      /* If we had no record type, we will have no fields map.  */
      field_map_t **fields_p = record_field_map->get (record_type);
      field_map_t *fields = fields_p ? *fields_p : NULL;

      if (worker_partitioned_uses->contains (var)
	  && fields
	  && fields->get (var))
	{
	  tree neutered_def = make_ssa_name (TREE_TYPE (var));

	  /* Receive definition from shared memory block.  */

	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
	  gassign *recv = gimple_build_assign (neutered_def, receiver_ref);
	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
	  update_stmt (recv);

	  if (TREE_CODE (var) == VAR_DECL)
	    {
	      /* If it's a VAR_DECL, we only copied to an SSA temporary.
		 Copy to the final location now.  */
	      gassign *asgn = gimple_build_assign (var, neutered_def);
	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
	      update_stmt (asgn);
	    }
	  else
	    {
	      /* If it's an SSA name, create a new phi at the join node to
		 represent either the output from the active worker (the
		 barrier) or the inactive workers (the copyout block).  */
	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
	      create_new_def_for (barrier_def, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }

	  /* Send definition to shared memory block.  */

	  tree sender_ref = build_sender_ref (var, sender_decl, fields);

	  if (TREE_CODE (var) == SSA_NAME)
	    {
	      gassign *send = gimple_build_assign (sender_ref, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else if (TREE_CODE (var) == VAR_DECL)
	    {
	      tree tmp = make_ssa_name (TREE_TYPE (var));
	      gassign *send = gimple_build_assign (tmp, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	      send = gimple_build_assign (sender_ref, tmp);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  /* It's possible for the ET->DEST block (the work done by the active
     thread) to finish with a control-flow insn, e.g. a UNIQUE function
     call.  Split the block and add SENDER_SEQ in the latter part to avoid
     having control flow in the middle of a BB.  */
  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
  call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
  gimple_seq_add_stmt (&sender_seq, call);

  gsi = gsi_last_bb (body);
  gimple *last = gsi_stmt (gsi);

  basic_block sender_block = split_block (body, last)->dest;
  gsi = gsi_last_bb (sender_block);
  gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}

static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
		      bitmap worker_single, bitmap vector_single,
		      vec<propagation_set *> *prop_set,
		      hash_set<tree> *partitioned_var_uses,
		      record_field_map_t *record_field_map)
{
  unsigned mask = outer_mask | par->mask;

  if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	{
	  bool has_defs = false;
	  hash_set<tree> def_escapes_block;
	  hash_set<tree> worker_partitioned_uses;
	  unsigned j;
	  tree var;

	  FOR_EACH_SSA_NAME (j, var, cfun)
	    {
	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
		{
		  has_defs = true;
		  continue;
		}

	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	      if (gimple_nop_p (def_stmt))
		continue;

	      if (gimple_bb (def_stmt)->index != block->index)
		continue;

	      gimple *use_stmt;
	      imm_use_iterator use_iter;
	      bool uses_outside_block = false;
	      bool worker_partitioned_use = false;

	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
		{
		  int blocknum = gimple_bb (use_stmt)->index;

		  /* Don't propagate SSA names that are only used in the
		     current block, unless the usage is in a phi node: that
		     means the name left the block, then came back in at the
		     top.  */
		  if (blocknum != block->index
		      || gimple_code (use_stmt) == GIMPLE_PHI)
		    uses_outside_block = true;
		  if (!bitmap_bit_p (worker_single, blocknum))
		    worker_partitioned_use = true;
		}

	      if (uses_outside_block)
		def_escapes_block.add (var);

	      if (worker_partitioned_use)
		{
		  worker_partitioned_uses.add (var);
		  has_defs = true;
		}
	    }

	  propagation_set *ws_prop = (*prop_set)[block->index];

	  if (ws_prop)
	    {
	      for (propagation_set::iterator it = ws_prop->begin ();
		   it != ws_prop->end (); ++it)
		{
		  tree var = *it;

		  if (TREE_CODE (var) == VAR_DECL)
		    {
		      def_escapes_block.add (var);
		      if (partitioned_var_uses->contains (var))
			{
			  worker_partitioned_uses.add (var);
			  has_defs = true;
			}
		    }
		}

	      delete ws_prop;
	      (*prop_set)[block->index] = 0;
	    }

	  tree record_type = (tree) block->aux;

	  if (has_defs)
	    worker_single_copy (block, block, &def_escapes_block,
				&worker_partitioned_uses, record_type,
				record_field_map);
	  else
	    worker_single_simple (block, block, &def_escapes_block);
	}
    }

  if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);

	    if (gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt)
		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
	      {
		/* If we have an OpenACC routine call in worker-single mode,
		   place barriers before and afterwards to prevent clobbering
		   re-used shared memory regions (as are used for AMDGCN at
		   present, for example).  */
		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
				   GSI_SAME_STMT);
		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
				  GSI_NEW_STMT);
	      }
	  }
    }

  if (par->inner)
    neuter_worker_single (par->inner, mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map);
  if (par->next)
    neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map);
}

static int
execute_omp_oacc_neuter_broadcast ()
{
  bb_stmt_map_t bb_stmt_map;
  auto_bitmap worker_single, vector_single;

  omp_sese_split_blocks (&bb_stmt_map);

  if (dump_file)
    {
      fprintf (dump_file, "\n\nAfter splitting:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }

  unsigned mask = 0;

  /* If this is a routine, calculate MASK as if the outer levels are already
     partitioned.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      tree dims = TREE_VALUE (attr);
      unsigned ix;
      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	{
	  tree allowed = TREE_PURPOSE (dims);
	  if (allowed && integer_zerop (allowed))
	    mask |= GOMP_DIM_MASK (ix);
	}
    }

  parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
  populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);

  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    bb->aux = NULL;

  vec<propagation_set *> prop_set (vNULL);
  prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);

  find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
			       &prop_set);

  hash_set<tree> partitioned_var_uses;
  hash_set<tree> gang_private_vars;

  find_gang_private_vars (&gang_private_vars);
  find_partitioned_var_uses (par, mask, &partitioned_var_uses);
  find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
				&gang_private_vars, &prop_set);

  record_field_map_t record_field_map;

  FOR_ALL_BB_FN (bb, cfun)
    {
      propagation_set *ws_prop = prop_set[bb->index];
      if (ws_prop)
	{
	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
	  tree name = create_tmp_var_name (".oacc_ws_data_s");
	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
	  DECL_ARTIFICIAL (name) = 1;
	  DECL_NAMELESS (name) = 1;
	  TYPE_NAME (record_type) = name;
	  TYPE_ARTIFICIAL (record_type) = 1;

	  auto_vec<tree> field_vec (ws_prop->elements ());
	  for (hash_set<tree>::iterator it = ws_prop->begin ();
	       it != ws_prop->end (); ++it)
	    field_vec.quick_push (*it);

	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);

	  field_map_t *fields = new field_map_t;

	  bool existed;
	  existed = record_field_map.put (record_type, fields);
	  gcc_checking_assert (!existed);

	  /* Insert var fields in reverse order, so the last inserted element
	     is the first in the structure.  */
	  for (int i = field_vec.length () - 1; i >= 0; i--)
	    install_var_field (field_vec[i], record_type, fields);

	  layout_type (record_type);

	  bb->aux = (tree) record_type;
	}
    }

  neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
			&partitioned_var_uses, &record_field_map);

  for (auto it : record_field_map)
    delete it.second;
  record_field_map.empty ();

  /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
  for (auto it : prop_set)
    gcc_checking_assert (!it);
  prop_set.release ();

  delete par;

  /* This doesn't seem to make a difference.  */
  loops_state_clear (LOOP_CLOSED_SSA);

  /* Neutering worker-single neutered blocks will invalidate dominance info.
     It may be possible to incrementally update just the affected blocks, but
     obliterate everything for now.  */
  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);

  if (dump_file)
    {
      fprintf (dump_file, "\n\nAfter neutering:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }

  return 0;
}

namespace {

const pass_data pass_data_omp_oacc_neuter_broadcast =
{
  GIMPLE_PASS, /* type */
  "omp_oacc_neuter_broadcast", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
{
public:
  pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return (flag_openacc
	    && targetm.goacc.create_worker_broadcast_record);
  };

  virtual unsigned int execute (function *)
  {
    return execute_omp_oacc_neuter_broadcast ();
  }

}; // class pass_omp_oacc_neuter_broadcast

} // anon namespace

gimple_opt_pass *
make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
{
  return new pass_omp_oacc_neuter_broadcast (ctxt);
}