Diffstat (limited to 'gcc/tree-vect-slp.cc')
-rw-r--r-- | gcc/tree-vect-slp.cc | 673 |
1 file changed, 486 insertions, 187 deletions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 794a073..5236eac 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -120,11 +120,14 @@ _slp_tree::_slp_tree ()
   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
+  SLP_TREE_GS_SCALE (this) = 0;
+  SLP_TREE_GS_BASE (this) = NULL_TREE;
   this->ldst_lanes = false;
   this->avoid_stlf_fail = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
-  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_UNINITIALIZED;
+  this->cycle_info.id = -1;
+  this->cycle_info.reduc_idx = -1;
   SLP_TREE_REF_COUNT (this) = 1;
   this->failed = NULL;
   this->max_nunits = 1;
@@ -680,6 +683,15 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 	{
 	  internal_fn ifn = gimple_call_internal_fn (stmt);
 	  commutative_op = first_commutative_argument (ifn);
+	  if (internal_gather_scatter_fn_p (ifn))
+	    {
+	      vect_describe_gather_scatter_call
+		(stmt_info,
+		 first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
+	      if (first)
+		(*oprnds_info)[0]->first_gs_p = true;
+	      gs_op = 0;
+	    }
 	}
     }
   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
@@ -1379,7 +1391,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 		   && (TREE_CODE_CLASS (tree_code (first_stmt_code))
 		       == tcc_comparison)
 		   && (swap_tree_comparison (tree_code (first_stmt_code))
-		       == tree_code (rhs_code))))
+		       == tree_code (rhs_code))
+		   && (first_reduc_idx == -1
+		       || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
 	      || (ldst_p
 		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
@@ -1595,8 +1609,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	    }
 	}
 
-      if (rhs_code.is_tree_code ()
-	  && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
+      if (i != 0
+	  && first_stmt_code != rhs_code
+	  && first_stmt_code.is_tree_code ()
+	  && rhs_code.is_tree_code ()
+	  && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
 	  && (swap_tree_comparison ((tree_code)first_stmt_code)
 	      == (tree_code)rhs_code))
 	swap[i] = 1;
@@ -2720,6 +2737,10 @@ out:
 
   stmt_info = stmts[0];
 
+  int reduc_idx = -1;
+  int gs_scale = 0;
+  tree gs_base = NULL_TREE;
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2742,6 +2763,12 @@ out:
 	  continue;
 	}
 
+      if (oprnd_info->first_gs_p)
+	{
+	  gs_scale = oprnd_info->first_gs_info.scale;
+	  gs_base = oprnd_info->first_gs_info.base;
+	}
+
       if (is_a <bb_vec_info> (vinfo)
 	  && oprnd_info->first_dt == vect_internal_def
 	  && !oprnd_info->any_pattern)
@@ -2802,6 +2829,33 @@ out:
 	  continue;
 	}
 
+      /* See which SLP operand a reduction chain continues on.  We want
+	 to chain even PHIs but not backedges.  */
+      if (VECTORIZABLE_CYCLE_DEF (oprnd_info->first_dt)
+	  || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
+	{
+	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
+	    {
+	      if (oprnd_info->first_dt == vect_double_reduction_def)
+		reduc_idx = i;
+	    }
+	  else if (is_a <gphi *> (stmt_info->stmt)
+		   && gimple_phi_num_args
+			(as_a <gphi *> (stmt_info->stmt)) != 1)
+	    ;
+	  else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
+		   && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
+	    ;
+	  else if (reduc_idx == -1)
+	    reduc_idx = i;
+	  else
+	    /* For .COND_* reduction operations the else value can be the
+	       same as one of the operation operands.  The other def
+	       stmts have been moved, so we can't check easily.  Check
+	       it's a call at least.  */
+	    gcc_assert (is_a <gcall *> (stmt_info->stmt));
+	}
+
       /* When we have a masked load with uniform mask discover this
	 as a single-lane mask with a splat permute.  This way we can
	 recognize this as a masked load-lane by stripping the splat.  */
@@ -3131,6 +3185,43 @@ fail:
   node = vect_create_new_slp_node (node, stmts, nops);
   SLP_TREE_VECTYPE (node) = vectype;
   SLP_TREE_CHILDREN (node).splice (children);
+  SLP_TREE_GS_SCALE (node) = gs_scale;
+  SLP_TREE_GS_BASE (node) = gs_base;
+  if (reduc_idx != -1)
+    {
+      gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
+		  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+		  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
+      SLP_TREE_REDUC_IDX (node) = reduc_idx;
+      node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
+    }
+  /* When reaching the reduction PHI, create a vect_reduc_info.  */
+  else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+	    || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
+	   && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
+    {
+      loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
+      gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
+      node->cycle_info.id = loop_vinfo->reduc_infos.length ();
+      vect_reduc_info reduc_info = new vect_reduc_info_s ();
+      loop_vinfo->reduc_infos.safe_push (reduc_info);
+      stmt_vec_info reduc_phi = stmt_info;
+      /* ??? For double reductions vect_is_simple_reduction stores the
+	 reduction type and code on the inner loop header PHI.  */
+      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
+	{
+	  use_operand_p use_p;
+	  gimple *use_stmt;
+	  bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
+				     &use_p, &use_stmt);
+	  gcc_assert (res);
+	  reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
+	}
+      VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
+      VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
+      VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
+      VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
+    }
 
   return node;
 }
@@ -3159,11 +3250,15 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		   SLP_TREE_REF_COUNT (node));
   if (SLP_TREE_VECTYPE (node))
     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
-  dump_printf (metadata, "%s\n",
+  dump_printf (metadata, "%s",
" (avoid-stlf-fail)" : ""); + if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1) + dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id, + node->cycle_info.reduc_idx); + dump_printf (metadata, "\n"); if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) { - if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) + if (SLP_TREE_PERMUTE_P (node)) dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n"); else dump_printf_loc (metadata, user_loc, "op template: %G", @@ -3415,7 +3510,7 @@ vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) return; - if (SLP_TREE_CODE (node) != VEC_PERM_EXPR) + if (!SLP_TREE_PERMUTE_P (node)) { stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); if (STMT_VINFO_DATA_REF (stmt_info) @@ -3529,7 +3624,7 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) static inline bool vect_is_slp_load_node (slp_tree root) { - return (SLP_TREE_CODE (root) != VEC_PERM_EXPR + return (!SLP_TREE_PERMUTE_P (root) && SLP_TREE_DEF_TYPE (root) == vect_internal_def && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root)) && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)))); @@ -3554,7 +3649,7 @@ optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map, /* For now, we don't know anything about externals so do not do anything. */ if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def) return NULL; - else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR) + else if (SLP_TREE_PERMUTE_P (root)) { /* First convert this node into a load node and add it to the leaves list and flatten the permute from a lane to a load one. If it's @@ -3959,8 +4054,6 @@ vect_build_slp_instance (vec_info *vinfo, vec<tree> &remain, unsigned max_tree_size, unsigned *limit, scalar_stmts_to_slp_tree_map_t *bst_map, - /* ??? We need stmt_info for group splitting. */ - stmt_vec_info stmt_info_, bool force_single_lane) { /* If there's no budget left bail out early. */ @@ -3996,7 +4089,6 @@ vect_build_slp_instance (vec_info *vinfo, bool *matches = XALLOCAVEC (bool, group_size); poly_uint64 max_nunits = 1; unsigned tree_size = 0; - unsigned i; slp_tree node = NULL; if (group_size > 1 && force_single_lane) @@ -4056,68 +4148,345 @@ vect_build_slp_instance (vec_info *vinfo, "SLP size %u vs. limit %u.\n", tree_size, max_tree_size); - /* Fixup SLP reduction chains. */ - if (kind == slp_inst_kind_reduc_chain) + vinfo->slp_instances.safe_push (new_instance); + + /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with + the number of scalar stmts in the root in a few places. + Verify that assumption holds. */ + gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance)) + .length () == group_size); + + if (dump_enabled_p ()) { - /* If this is a reduction chain with a conversion in front - amend the SLP tree with a node for that. */ - gimple *scalar_def - = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt; - if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) + dump_printf_loc (MSG_NOTE, vect_location, + "Final SLP tree for instance %p:\n", + (void *) new_instance); + vect_print_slp_graph (MSG_NOTE, vect_location, + SLP_INSTANCE_TREE (new_instance)); + } + + return true; + } + } + /* Failed to SLP. */ + + /* While we arrive here even with slp_inst_kind_store we should only + for group_size == 1. The code to split store groups is only in + vect_analyze_slp_instance now. */ + gcc_assert (kind != slp_inst_kind_store || group_size == 1); + + /* Free the allocated memory. 
+  scalar_stmts.release ();
+
+  /* Failed to SLP.  */
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
+  return false;
+}
+
+/* Analyze an SLP instance starting from the start of a reduction chain.
+   Call vect_build_slp_tree to build a tree of packed stmts if possible.
+   Return FALSE if SLP build fails.  */
+
+static bool
+vect_analyze_slp_reduc_chain (vec_info *vinfo,
+			      scalar_stmts_to_slp_tree_map_t *bst_map,
+			      stmt_vec_info stmt_info,
+			      unsigned max_tree_size, unsigned *limit)
+{
+  vec<stmt_vec_info> scalar_stmts;
+
+  /* Collect the reduction stmts and store them in scalar_stmts.  */
+  scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
+  stmt_vec_info next_info = stmt_info;
+  while (next_info)
+    {
+      scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+      next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
+    }
+  /* Mark the first element of the reduction chain as reduction to properly
+     transform the node.  In the reduction analysis phase only the last
+     element of the chain is marked as reduction.  */
+  STMT_VINFO_DEF_TYPE (stmt_info)
+    = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
+  STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
+    = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
+
+  /* Build the tree for the SLP instance.  */
+  vec<stmt_vec_info> root_stmt_infos = vNULL;
+  vec<tree> remain = vNULL;
+
+  /* If there's no budget left bail out early.  */
+  if (*limit == 0)
+    return false;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "Starting SLP discovery for\n");
+      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "  %G", scalar_stmts[i]->stmt);
+    }
+
+  /* Build the tree for the SLP instance.  */
+  unsigned int group_size = scalar_stmts.length ();
+  bool *matches = XALLOCAVEC (bool, group_size);
+  poly_uint64 max_nunits = 1;
+  unsigned tree_size = 0;
+
+  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+				       &max_nunits, matches, limit,
+				       &tree_size, bst_map);
+  if (node != NULL)
+    {
+      /* Calculate the unrolling factor based on the smallest type.  */
+      poly_uint64 unrolling_factor
+	= calculate_unrolling_factor (max_nunits, group_size);
+
+      if (maybe_ne (unrolling_factor, 1U)
+	  && is_a <bb_vec_info> (vinfo))
+	{
+	  unsigned HOST_WIDE_INT const_max_nunits;
+	  if (!max_nunits.is_constant (&const_max_nunits)
+	      || const_max_nunits > group_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Build SLP failed: store group "
+				 "size not a multiple of the vector size "
+				 "in basic block SLP\n");
+	      vect_free_slp_tree (node);
+	      return false;
+	    }
+	  /* Fatal mismatch.  */
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "SLP discovery succeeded but node needs "
+			     "splitting\n");
+	  memset (matches, true, group_size);
+	  matches[group_size / const_max_nunits * const_max_nunits] = false;
+	  vect_free_slp_tree (node);
+	}
+      else
+	{
+	  /* Create a new SLP instance.  */
+	  slp_instance new_instance = XNEW (class _slp_instance);
+	  SLP_INSTANCE_TREE (new_instance) = node;
+	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+	  SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
+	  new_instance->reduc_phis = NULL;
+	  new_instance->cost_vec = vNULL;
+	  new_instance->subgraph_entries = vNULL;
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "SLP size %u vs. limit %u.\n",
+			     tree_size, max_tree_size);
+
+	  /* Fixup SLP reduction chains.  If this is a reduction chain with
+	     a conversion in front amend the SLP tree with a node for that.  */
+	  gimple *scalar_def
+	    = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
+	  if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
+	    {
+	      /* Get at the conversion stmt - we know it's the single use
+		 of the last stmt of the reduction chain.  */
+	      use_operand_p use_p;
+	      bool r = single_imm_use (gimple_assign_lhs (scalar_def),
+				       &use_p, &scalar_def);
+	      gcc_assert (r);
+	      stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
+	      next_info = vect_stmt_to_vectorize (next_info);
+	      scalar_stmts = vNULL;
+	      scalar_stmts.create (group_size);
+	      for (unsigned i = 0; i < group_size; ++i)
+		scalar_stmts.quick_push (next_info);
+	      slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
+	      SLP_TREE_VECTYPE (conv)
+		= get_vectype_for_scalar_type (vinfo,
+					       TREE_TYPE
+						 (gimple_assign_lhs (scalar_def)),
+					       group_size);
+	      SLP_TREE_REDUC_IDX (conv) = 0;
+	      conv->cycle_info.id = node->cycle_info.id;
+	      SLP_TREE_CHILDREN (conv).quick_push (node);
+	      SLP_INSTANCE_TREE (new_instance) = conv;
+	      /* We also have to fake this conversion stmt as SLP reduction
+		 group so we don't have to mess with too much code
+		 elsewhere.  */
+	      REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
+	      REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
+	    }
+	  /* Fill the backedge child of the PHI SLP node.  The
+	     general matching code cannot find it because the
+	     scalar code does not reflect how we vectorize the
+	     reduction.  */
+	  use_operand_p use_p;
+	  imm_use_iterator imm_iter;
+	  class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
+	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
+				 gimple_get_lhs (scalar_def))
+	    /* There are exactly two non-debug uses, the reduction
+	       PHI and the loop-closed PHI node.  */
+	    if (!is_gimple_debug (USE_STMT (use_p))
+		&& gimple_bb (USE_STMT (use_p)) == loop->header)
 	      {
-	  /* Get at the conversion stmt - we know it's the single use
-	     of the last stmt of the reduction chain.  */
-	  use_operand_p use_p;
-	  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
-				   &use_p, &scalar_def);
-	  gcc_assert (r);
-	  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
-	  next_info = vect_stmt_to_vectorize (next_info);
-	  scalar_stmts = vNULL;
-	  scalar_stmts.create (group_size);
+		auto_vec<stmt_vec_info, 64> phis (group_size);
+		stmt_vec_info phi_info
+		  = vinfo->lookup_stmt (USE_STMT (use_p));
 		for (unsigned i = 0; i < group_size; ++i)
-	    scalar_stmts.quick_push (next_info);
-	  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
-	  SLP_TREE_VECTYPE (conv)
-	    = get_vectype_for_scalar_type (vinfo,
-					   TREE_TYPE
-					     (gimple_assign_lhs
						(scalar_def)),
-					   group_size);
-	  SLP_TREE_CHILDREN (conv).quick_push (node);
-	  SLP_INSTANCE_TREE (new_instance) = conv;
-	  /* We also have to fake this conversion stmt as SLP reduction
-	     group so we don't have to mess with too much code
-	     elsewhere.  */
-	  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
-	  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
+		  phis.quick_push (phi_info);
+		slp_tree *phi_node = bst_map->get (phis);
+		unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
+		SLP_TREE_CHILDREN (*phi_node)[dest_idx]
+		  = SLP_INSTANCE_TREE (new_instance);
+		SLP_INSTANCE_TREE (new_instance)->refcnt++;
 	      }
-	/* Fill the backedge child of the PHI SLP node.  The
-	   general matching code cannot find it because the
-	   scalar code does not reflect how we vectorize the
-	   reduction.  */
-	use_operand_p use_p;
-	imm_use_iterator imm_iter;
-	class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
-	FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
-			       gimple_get_lhs (scalar_def))
-	  /* There are exactly two non-debug uses, the reduction
-	     PHI and the loop-closed PHI node.  */
-	  if (!is_gimple_debug (USE_STMT (use_p))
-	      && gimple_bb (USE_STMT (use_p)) == loop->header)
-	    {
-	      auto_vec<stmt_vec_info, 64> phis (group_size);
-	      stmt_vec_info phi_info
-		= vinfo->lookup_stmt (USE_STMT (use_p));
-	      for (unsigned i = 0; i < group_size; ++i)
-		phis.quick_push (phi_info);
-	      slp_tree *phi_node = bst_map->get (phis);
-	      unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
-	      SLP_TREE_CHILDREN (*phi_node)[dest_idx]
-		= SLP_INSTANCE_TREE (new_instance);
-	      SLP_INSTANCE_TREE (new_instance)->refcnt++;
-	    }
+
+	  vinfo->slp_instances.safe_push (new_instance);
+
+	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+	     the number of scalar stmts in the root in a few places.
+	     Verify that assumption holds.  */
+	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+		      .length () == group_size);
+
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "Final SLP tree for instance %p:\n",
+			       (void *) new_instance);
+	      vect_print_slp_graph (MSG_NOTE, vect_location,
+				    SLP_INSTANCE_TREE (new_instance));
+	    }
+
+	  return true;
+	}
+    }
+  /* Failed to SLP.  */
+
+  /* Free the allocated memory.  */
+  scalar_stmts.release ();
+
+  /* Failed to SLP.  */
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
+  return false;
+}
+
+/* Analyze an SLP instance starting from a group of grouped stores.  Call
+   vect_build_slp_tree to build a tree of packed stmts if possible.
+   Return FALSE if it's impossible to SLP any stmt in the group.  */
+
+static bool
+vect_analyze_slp_instance (vec_info *vinfo,
+			   scalar_stmts_to_slp_tree_map_t *bst_map,
+			   stmt_vec_info stmt_info,
+			   slp_instance_kind kind,
+			   unsigned max_tree_size, unsigned *limit,
+			   bool force_single_lane)
+{
+  vec<stmt_vec_info> scalar_stmts;
+
+  if (is_a <bb_vec_info> (vinfo))
+    vect_location = stmt_info->stmt;
+
+  gcc_assert (kind == slp_inst_kind_store);
+
+  /* Collect the stores and store them in scalar_stmts.  */
+  scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
+  stmt_vec_info next_info = stmt_info;
+  while (next_info)
+    {
+      scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+      next_info = DR_GROUP_NEXT_ELEMENT (next_info);
+    }
+
+  vec<stmt_vec_info> root_stmt_infos = vNULL;
+  vec<tree> remain = vNULL;
+
+  /* Build the tree for the SLP instance.  */
+
+  /* If there's no budget left bail out early.  */
+  if (*limit == 0)
+    return false;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "Starting SLP discovery for\n");
+      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "  %G", scalar_stmts[i]->stmt);
+    }
+
+  /* Build the tree for the SLP instance.  */
+  unsigned int group_size = scalar_stmts.length ();
+  bool *matches = XALLOCAVEC (bool, group_size);
+  poly_uint64 max_nunits = 1;
+  unsigned tree_size = 0;
+  unsigned i;
+
+  slp_tree node = NULL;
+  if (group_size > 1 && force_single_lane)
+    {
+      matches[0] = true;
+      matches[1] = false;
+    }
+  else
+    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+				&max_nunits, matches, limit,
+				&tree_size, bst_map);
+  if (node != NULL)
+    {
+      /* Calculate the unrolling factor based on the smallest type.  */
+      poly_uint64 unrolling_factor
+	= calculate_unrolling_factor (max_nunits, group_size);
+
+      if (maybe_ne (unrolling_factor, 1U)
+	  && is_a <bb_vec_info> (vinfo))
+	{
+	  unsigned HOST_WIDE_INT const_max_nunits;
+	  if (!max_nunits.is_constant (&const_max_nunits)
+	      || const_max_nunits > group_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Build SLP failed: store group "
+				 "size not a multiple of the vector size "
+				 "in basic block SLP\n");
+	      vect_free_slp_tree (node);
+	      return false;
+	    }
+	  /* Fatal mismatch.  */
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "SLP discovery succeeded but node needs "
+			     "splitting\n");
+	  memset (matches, true, group_size);
+	  matches[group_size / const_max_nunits * const_max_nunits] = false;
+	  vect_free_slp_tree (node);
+	}
+      else
+	{
+	  /* Create a new SLP instance.  */
+	  slp_instance new_instance = XNEW (class _slp_instance);
+	  SLP_INSTANCE_TREE (new_instance) = node;
+	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+	  SLP_INSTANCE_KIND (new_instance) = kind;
+	  new_instance->reduc_phis = NULL;
+	  new_instance->cost_vec = vNULL;
+	  new_instance->subgraph_entries = vNULL;
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "SLP size %u vs. limit %u.\n",
+			     tree_size, max_tree_size);
 
 	  vinfo->slp_instances.safe_push (new_instance);
@@ -4141,7 +4510,6 @@ vect_build_slp_instance (vec_info *vinfo,
     }
 
   /* Failed to SLP.  */
-  stmt_vec_info stmt_info = stmt_info_;
   /* Try to break the group up into pieces.  */
   if (*limit > 0 && kind == slp_inst_kind_store)
     {
@@ -4402,70 +4770,6 @@ vect_build_slp_instance (vec_info *vinfo,
   return false;
 }
 
-
-/* Analyze an SLP instance starting from a group of grouped stores.  Call
-   vect_build_slp_tree to build a tree of packed stmts if possible.
-   Return FALSE if it's impossible to SLP any stmt in the loop.  */
-
-static bool
-vect_analyze_slp_instance (vec_info *vinfo,
-			   scalar_stmts_to_slp_tree_map_t *bst_map,
-			   stmt_vec_info stmt_info,
-			   slp_instance_kind kind,
-			   unsigned max_tree_size, unsigned *limit,
-			   bool force_single_lane)
-{
-  vec<stmt_vec_info> scalar_stmts;
-
-  if (is_a <bb_vec_info> (vinfo))
-    vect_location = stmt_info->stmt;
-
-  stmt_vec_info next_info = stmt_info;
-  if (kind == slp_inst_kind_store)
-    {
-      /* Collect the stores and store them in scalar_stmts.  */
-      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
-      while (next_info)
-	{
-	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
-	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
-	}
-    }
-  else if (kind == slp_inst_kind_reduc_chain)
-    {
-      /* Collect the reduction stmts and store them in scalar_stmts.  */
-      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
-      while (next_info)
-	{
-	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
-	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
-	}
-      /* Mark the first element of the reduction chain as reduction to properly
-	 transform the node.  In the reduction analysis phase only the last
-	 element of the chain is marked as reduction.  */
-      STMT_VINFO_DEF_TYPE (stmt_info)
-	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
-      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
-	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
-    }
-  else
-    gcc_unreachable ();
-
-  vec<stmt_vec_info> roots = vNULL;
-  vec<tree> remain = vNULL;
-  /* Build the tree for the SLP instance.  */
-  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
-				      roots, remain,
-				      max_tree_size, limit, bst_map,
-				      kind == slp_inst_kind_store
-				      ? stmt_info : NULL, force_single_lane);
-
-  /* ??? If this is slp_inst_kind_store and the above succeeded here's
-     where we should do store group splitting.  */
-
-  return res;
-}
-
 /* qsort comparator ordering SLP load nodes.  */
 
 static int
@@ -4909,8 +5213,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	      stmts.quick_push (stmt_info);
	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
					 stmts, roots, remain, max_tree_size,
-					 &limit, bst_map, NULL,
-					 force_single_lane))
+					 &limit, bst_map, force_single_lane))
	    return opt_result::failure_at (vect_location,
					   "SLP build failed.\n");
	}
@@ -4925,8 +5228,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	  stmts.quick_push (stmt_info);
	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
					 stmts, roots, remain, max_tree_size,
-					 &limit, bst_map, NULL,
-					 force_single_lane))
+					 &limit, bst_map, force_single_lane))
	    return opt_result::failure_at (vect_location,
					   "SLP build failed.\n");
	}
@@ -4945,8 +5247,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       bb_vinfo->roots[i].remain,
-				       max_tree_size, &limit, bst_map, NULL,
-				       false))
+				       max_tree_size, &limit, bst_map, false))
	  {
	    bb_vinfo->roots[i].stmts = vNULL;
	    bb_vinfo->roots[i].roots = vNULL;
@@ -4963,10 +5264,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	    && ! STMT_VINFO_LIVE_P (first_element))
	  ;
	else if (force_single_lane
-		 || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
-						 slp_inst_kind_reduc_chain,
-						 max_tree_size, &limit,
-						 force_single_lane))
+		 || ! vect_analyze_slp_reduc_chain (vinfo, bst_map,
						    first_element,
+						    max_tree_size, &limit))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -4983,6 +5283,11 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
		vinfo = next;
	      }
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
+	    /* ??? When there's a conversion around the reduction
+	       chain 'last' isn't the entry of the reduction.  */
+	    if (STMT_VINFO_DEF_TYPE (last) != vect_reduction_def)
+	      return opt_result::failure_at (vect_location,
+					     "SLP build failed.\n");
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);
	  }
@@ -5024,7 +5329,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
					    slp_inst_kind_reduc_group,
					    stmts, roots, remain,
					    max_tree_size, &limit,
-					    bst_map, NULL,
+					    bst_map,
					    force_single_lane))
	      return opt_result::failure_at (vect_location,
					     "SLP build failed.\n");
@@ -5040,7 +5345,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
					  slp_inst_kind_reduc_group,
					  scalar_stmts, roots, remain,
					  max_tree_size, &limit, bst_map,
-					  NULL, force_single_lane))
+					  force_single_lane))
	    {
	      if (scalar_stmts.length () <= 1)
		scalar_stmts.release ();
@@ -5056,8 +5361,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
					    slp_inst_kind_reduc_group,
					    stmts, roots, remain,
					    max_tree_size, &limit,
-					    bst_map, NULL,
-					    force_single_lane))
+					    bst_map, force_single_lane))
	      return opt_result::failure_at (vect_location,
					     "SLP build failed.\n");
	}
@@ -5079,9 +5383,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	  && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
	  && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
	  && STMT_VINFO_LIVE_P (stmt_info)
-	  && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
-	      || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
-		  && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
+	  && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
+	  && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
	{
	  vec<stmt_vec_info> stmts;
	  vec<stmt_vec_info> roots = vNULL;
@@ -5092,8 +5395,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
					 slp_inst_kind_reduc_group,
					 stmts, roots, remain,
					 max_tree_size, &limit,
-					 bst_map, NULL,
-					 force_single_lane))
+					 bst_map, force_single_lane))
	    return opt_result::failure_at (vect_location,
					   "SLP build failed.\n");
	}
@@ -5136,7 +5438,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
					 stmts, roots, remain,
					 max_tree_size, &limit,
-					 bst_map, NULL, force_single_lane))
+					 bst_map, force_single_lane))
	    {
	      roots.release ();
	      return opt_result::failure_at (vect_location,
@@ -5162,8 +5464,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
					 stmts, roots, remain,
					 max_tree_size, &limit,
-					 bst_map, NULL,
-					 force_single_lane))
+					 bst_map, force_single_lane))
	    return opt_result::failure_at (vect_location,
					   "SLP build failed.\n");
	}
@@ -5181,8 +5482,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
					 stmts, roots, remain,
					 max_tree_size, &limit,
-					 bst_map, NULL,
-					 force_single_lane))
+					 bst_map, force_single_lane))
	    return opt_result::failure_at (vect_location,
					   "SLP build failed.\n");
	}
@@ -5924,7 +6224,7 @@ vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
   slp_tree use = m_vertices[ud->src].node;
   slp_tree def = m_vertices[ud->dest].node;
   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
-       || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
+       || SLP_TREE_PERMUTE_P (use))
       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
     return false;
 
@@ -6257,7 +6557,7 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
 {
   const int fallback_cost = 1;
 
-  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+  if (SLP_TREE_PERMUTE_P (node))
     {
       auto_lane_permutation_t tmp_perm;
       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
@@ -6392,7 +6692,7 @@ vect_optimize_slp_pass::start_choosing_layouts ()
	      imin = DR_GROUP_SIZE (dr_stmt) + 1;
	      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
	    }
-	  else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
+	  else if (SLP_TREE_PERMUTE_P (node)
		   && SLP_TREE_CHILDREN (node).length () == 1
		   && (child = SLP_TREE_CHILDREN (node)[0])
		   && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
@@ -6490,10 +6790,12 @@ vect_optimize_slp_pass::start_choosing_layouts ()
	{
	  stmt_vec_info stmt_info
	    = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
-	  stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
+	  vect_reduc_info reduc_info
+	    = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
+				  SLP_INSTANCE_TREE (instance));
	  if (needs_fold_left_reduction_p
		(TREE_TYPE (gimple_get_lhs (stmt_info->stmt)),
-		 STMT_VINFO_REDUC_CODE (reduc_info)))
+		 VECT_REDUC_INFO_CODE (reduc_info)))
	    {
	      unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
	      m_partitions[m_vertices[node_i].partition].layout = 0;
@@ -6710,7 +7012,7 @@ vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
   auto &to_costs = partition_layout_costs (to_partition_i,
					    to_partition.layout);
   if (ud->src == int (to_node_i)
-      && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
+      && SLP_TREE_PERMUTE_P (to_vertex.node))
     {
       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
       auto old_layout = from_partition.layout;
@@ -6765,7 +7067,7 @@ vect_optimize_slp_pass::forward_pass ()
	{
	  unsigned int node_i = m_partitioned_nodes[partition.node_begin];
	  single_node = m_vertices[node_i].node;
-	  if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+	  if (SLP_TREE_PERMUTE_P (single_node))
	    in_cost = total_in_cost (node_i);
	}
 
@@ -6862,8 +7164,7 @@ vect_optimize_slp_pass::forward_pass ()
	     if the VEC_PERM_EXPR can be changed to support output layout
	     LAYOUT_I while keeping all the provisional choices of input
	     layout.  */
-	  if (single_node
-	      && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+	  if (single_node && SLP_TREE_PERMUTE_P (single_node))
	    {
	      int factor = internal_node_cost (single_node, -1, layout_i);
	      if (factor >= 0)
@@ -7022,7 +7323,7 @@ vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
     in TMP_PERM on success.  */
   auto_lane_permutation_t tmp_perm;
   unsigned int num_inputs = 1;
-  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+  if (SLP_TREE_PERMUTE_P (node))
     {
      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
      if (from_layout_i != 0)
@@ -7116,7 +7417,7 @@ vect_optimize_slp_pass::materialize ()
				      SLP_TREE_SCALAR_STMTS (node), true);
 
      /* Update load and lane permutations.  */
-      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+      if (SLP_TREE_PERMUTE_P (node))
	{
	  /* First try to absorb the input vector layouts.  If that fails,
	     force the inputs to have layout LAYOUT_I too.  We checked that
@@ -7320,7 +7621,7 @@ vect_optimize_slp_pass::dump ()
			   "    out weight: %f (degree %d)\n",
			   vertex.out_weight.to_double (),
			   vertex.out_degree);
-	  if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
+	  if (SLP_TREE_PERMUTE_P (vertex.node))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "    op: VEC_PERM_EXPR\n");
	  else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
@@ -7399,7 +7700,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
     {
      slp_tree node = v.node;
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
-	  || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+	  || SLP_TREE_PERMUTE_P (node))
	continue;
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
@@ -7419,7 +7720,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
 
      /* Uniform masks need to be suitably represented.  */
      slp_tree mask = SLP_TREE_CHILDREN (node)[0];
-      if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
+      if (!SLP_TREE_PERMUTE_P (mask)
	  || SLP_TREE_CHILDREN (mask).length () != 1)
	continue;
      bool match = true;
@@ -7438,7 +7739,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
	{
	  slp_tree pred_node = m_vertices[pred->src].node;
	  /* All consumers should be a permute with a single outgoing lane.  */
-	  if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
+	  if (!SLP_TREE_PERMUTE_P (pred_node)
	      || SLP_TREE_LANES (pred_node) != 1)
	    {
	      match = false;
@@ -7599,8 +7900,7 @@ vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
   /* For permute nodes that are fed from externs or constants we have to
     consider their number of lanes as well.  Likewise for
     store-lanes.  */
-  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
-      || node->ldst_lanes)
+  if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    for (slp_tree child : SLP_TREE_CHILDREN (node))
      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	{
@@ -7755,7 +8055,7 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
   SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
 
   /* Handle purely internal nodes.  */
-  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+  if (SLP_TREE_PERMUTE_P (node))
     {
      if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
	return false;
@@ -8466,7 +8766,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
	       && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
	      /* Check we can vectorize the gcond.  */
	      || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
-		  && !vectorizable_early_exit (vinfo,
+		  && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
					       SLP_INSTANCE_ROOT_STMTS (instance)[0],
					       NULL,
					       SLP_INSTANCE_TREE (instance),
@@ -8830,7 +9130,7 @@ next_lane:
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
-	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+	  if (SLP_TREE_PERMUTE_P (node))
	    {
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
@@ -11114,8 +11414,7 @@ vect_schedule_slp_node (vec_info *vinfo,
   if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0)
     SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
 
-  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
-      && STMT_VINFO_DATA_REF (stmt_info))
+  if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
     {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
@@ -11127,7 +11426,7 @@ vect_schedule_slp_node (vec_info *vinfo,
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
     }
-  else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+  else if (!SLP_TREE_PERMUTE_P (node)
	   && (SLP_TREE_TYPE (node) == cycle_phi_info_type
	       || SLP_TREE_TYPE (node) == induc_vec_info_type
	       || SLP_TREE_TYPE (node) == phi_info_type))
@@ -11251,7 +11550,7 @@ vect_schedule_slp_node (vec_info *vinfo,
	  si = gsi_after_labels (vinfo->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
-	       && SLP_TREE_CODE (node) != VEC_PERM_EXPR
+	       && !SLP_TREE_PERMUTE_P (node)
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
@@ -11296,7 +11595,7 @@ vect_schedule_slp_node (vec_info *vinfo,
     }
 
   /* Handle purely internal nodes.  */
-  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+  if (SLP_TREE_PERMUTE_P (node))
     {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
@@ -11499,8 +11798,8 @@ vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance i
      auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
      gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
      gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
-      bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
-					  node, NULL);
+      bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
+					  root_stmt_info, &rgsi, node, NULL);
      gcc_assert (res);
      return;
     }
@@ -11575,7 +11874,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
	  stack.pop ();
	  info->on_stack = false;
	  vect_schedule_slp_node (vinfo, node, instance);
-	  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+	  if (!SLP_TREE_PERMUTE_P (node)
	      && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
	    phis_to_fixup.quick_push (node);
	}
@@ -11598,7 +11897,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
	  slp_tree entry = stack[idx];
	  if (!entry)
	    continue;
-	  bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
+	  bool phi = (!SLP_TREE_PERMUTE_P (entry)
		      && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	  bool ready = !phi;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
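
Note: the header side of this change lives in gcc/tree-vectorizer.h and is outside
this diffstat, so the definitions behind the new accessors do not appear above.
The sketch below is inferred from the call sites in this diff, not copied from the
committed header; field names and macro bodies are assumptions for orientation only.

/* Assumed shape of the new SLP node bookkeeping used throughout the diff
   (see gcc/tree-vectorizer.h for the authoritative definitions).  */

/* Per-node reduction-cycle info; _slp_tree::_slp_tree above initializes
   both fields to -1, meaning "not part of a reduction cycle".  */
struct slp_tree_cycle_info_s {
  int id;		/* Index into loop_vinfo->reduc_infos, or -1.  */
  int reduc_idx;	/* Child index continuing the reduction, or -1.  */
};

/* The patch systematically replaces SLP_TREE_CODE (node) == VEC_PERM_EXPR
   tests, so the new predicate presumably reduces to exactly that check.  */
#define SLP_TREE_PERMUTE_P(NODE) (SLP_TREE_CODE (NODE) == VEC_PERM_EXPR)

/* Accessors used by the discovery code above; gather/scatter base and
   scale move from per-stmt info onto the SLP node, and the reduction
   link is stored in the cycle info.  */
#define SLP_TREE_GS_SCALE(NODE)  (NODE)->gs_scale
#define SLP_TREE_GS_BASE(NODE)   (NODE)->gs_base
#define SLP_TREE_REDUC_IDX(NODE) (NODE)->cycle_info.reduc_idx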