Diffstat (limited to 'gcc/tree-vect-slp.cc')
-rw-r--r--  gcc/tree-vect-slp.cc | 1248
1 file changed, 770 insertions, 478 deletions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b5a9604..5236eac 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -118,17 +118,22 @@ _slp_tree::_slp_tree ()
SLP_TREE_CHILDREN (this) = vNULL;
SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
SLP_TREE_LANE_PERMUTATION (this) = vNULL;
- SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ SLP_TREE_GS_SCALE (this) = 0;
+ SLP_TREE_GS_BASE (this) = NULL_TREE;
this->ldst_lanes = false;
+ this->avoid_stlf_fail = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
- SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
+ this->cycle_info.id = -1;
+ this->cycle_info.reduc_idx = -1;
SLP_TREE_REF_COUNT (this) = 1;
this->failed = NULL;
this->max_nunits = 1;
this->lanes = 0;
+ SLP_TREE_TYPE (this) = undef_vec_info_type;
+ this->data = NULL;
}
/* Tear down a SLP node. */
@@ -147,9 +152,10 @@ _slp_tree::~_slp_tree ()
SLP_TREE_VEC_DEFS (this).release ();
SLP_TREE_LOAD_PERMUTATION (this).release ();
SLP_TREE_LANE_PERMUTATION (this).release ();
- SLP_TREE_SIMD_CLONE_INFO (this).release ();
if (this->failed)
free (failed);
+ if (this->data)
+ delete this->data;
}
/* Push the single SSA definition in DEF to the vector of vector defs. */
@@ -506,24 +512,21 @@ vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
&& (dtb == vect_external_def || dtb == vect_constant_def)));
}
-static const int cond_expr_maps[3][5] = {
- { 4, -1, -2, 1, 2 },
- { 4, -2, -1, 1, 2 },
- { 4, -1, -2, 2, 1 }
-};
+#define GATHER_SCATTER_OFFSET (-3)
+
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
-static const int arg1_map[] = { 1, 1 };
+static const int arg2_map[] = { 1, 2 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
-static const int arg1_arg3_map[] = { 2, 1, 3 };
-static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
-static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
+static const int arg2_arg4_map[] = { 2, 2, 4 };
+static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
+static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
-static const int off_map[] = { 1, -3 };
-static const int off_op0_map[] = { 2, -3, 0 };
-static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
-static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
+static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
+static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
+static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
+static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
static const int mask_call_maps[6][7] = {
{ 1, 1, },
{ 2, 1, 2, },
@@ -572,18 +575,18 @@ vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
case IFN_GATHER_LOAD:
- return arg1_map;
+ return arg2_map;
case IFN_MASK_GATHER_LOAD:
case IFN_MASK_LEN_GATHER_LOAD:
- return arg1_arg4_arg5_map;
+ return arg2_arg5_arg6_map;
case IFN_SCATTER_STORE:
- return arg1_arg3_map;
+ return arg2_arg4_map;
case IFN_MASK_SCATTER_STORE:
case IFN_MASK_LEN_SCATTER_STORE:
- return arg1_arg3_arg4_map;
+ return arg2_arg4_arg5_map;
case IFN_MASK_STORE:
return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
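
As a side note on the encoding used by these operand maps (a sketch, not the exact consumer loop in vect_get_and_check_slp_defs): the leading element is the number of SLP operands and each following element is the gimple call argument index that operand comes from, with negative values such as GATHER_SCATTER_OFFSET standing for synthesized operands.  The helper name below is hypothetical.

static unsigned
decode_operand_map (const int *map, int *opnos, unsigned max_ops)
{
  unsigned nops = map[0];		/* e.g. arg2_arg5_arg6_map[0] == 3 */
  for (unsigned i = 0; i < nops && i < max_ops; ++i)
    opnos[i] = map[1 + i];		/* 2, 5 and 6 for IFN_MASK_GATHER_LOAD */
  return nops;
}
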
@@ -680,6 +683,15 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
{
internal_fn ifn = gimple_call_internal_fn (stmt);
commutative_op = first_commutative_argument (ifn);
+ if (internal_gather_scatter_fn_p (ifn))
+ {
+ vect_describe_gather_scatter_call
+ (stmt_info,
+ first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
+ if (first)
+ (*oprnds_info)[0]->first_gs_p = true;
+ gs_op = 0;
+ }
}
}
else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
@@ -695,7 +707,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
{
oprnd_info = (*oprnds_info)[i];
int opno = map ? map[1 + i] : int (i);
- if (opno == -3)
+ if (opno == GATHER_SCATTER_OFFSET)
{
gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
if (!is_a <loop_vec_info> (vinfo)
@@ -985,13 +997,18 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
to be combined into the same SLP group. */
bool
-compatible_calls_p (gcall *call1, gcall *call2)
+compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
{
unsigned int nargs = gimple_call_num_args (call1);
if (nargs != gimple_call_num_args (call2))
return false;
- if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
+ auto cfn1 = gimple_call_combined_fn (call1);
+ auto cfn2 = gimple_call_combined_fn (call2);
+ if (cfn1 != cfn2
+ && (!allow_two_operators
+ || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
+ && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
return false;
if (gimple_call_internal_p (call1))
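
For illustration only, a scalar shape the relaxed check is aimed at (assuming the target has fused multiply-add and contraction is enabled, so the two lanes become .FMA and .FMS calls): previously the differing combined functions made compatible_calls_p fail, while with allow_two_operators they can be merged and blended like the existing plus/minus two-operator case.

void
addsub_fma (double *restrict a, const double *restrict b,
	    const double *restrict c)
{
  a[0] = a[0] * b[0] + c[0];	/* may become .FMA (a[0], b[0], c[0]) */
  a[1] = a[1] * b[1] - c[1];	/* may become .FMS (a[1], b[1], c[1]) */
}
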
@@ -1113,6 +1130,16 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
matches[0] = false;
return false;
}
+ if (is_a <bb_vec_info> (vinfo)
+ && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Build SLP failed: not using single lane "
+ "vector type %T\n", vectype);
+ matches[0] = false;
+ return false;
+ }
/* Record nunits required but continue analysis, producing matches[]
as if nunits was not an issue. This allows splitting of groups
to happen. */
@@ -1125,7 +1152,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
soft_fail_nunits_vectype = nunits_vectype;
}
- gcc_assert (vectype);
+ gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
*node_vectype = vectype;
/* For every stmt in NODE find its def stmt/s. */
@@ -1172,10 +1199,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
gcall *call_stmt = dyn_cast <gcall *> (stmt);
tree lhs = gimple_get_lhs (stmt);
- if (lhs == NULL_TREE
- && (!call_stmt
- || !gimple_call_internal_p (stmt)
- || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
+ if (lhs == NULL_TREE && !call_stmt)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1353,17 +1377,23 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
|| rhs_code != IMAGPART_EXPR)
/* Handle mismatches in plus/minus by computing both
and merging the results. */
- && !((first_stmt_code == PLUS_EXPR
- || first_stmt_code == MINUS_EXPR)
- && (alt_stmt_code == PLUS_EXPR
- || alt_stmt_code == MINUS_EXPR)
+ && !((((first_stmt_code == PLUS_EXPR
+ || first_stmt_code == MINUS_EXPR)
+ && (alt_stmt_code == PLUS_EXPR
+ || alt_stmt_code == MINUS_EXPR))
+ || ((first_stmt_code == CFN_FMA
+ || first_stmt_code == CFN_FMS)
+ && (alt_stmt_code == CFN_FMA
+ || alt_stmt_code == CFN_FMS)))
&& rhs_code == alt_stmt_code)
&& !(first_stmt_code.is_tree_code ()
&& rhs_code.is_tree_code ()
&& (TREE_CODE_CLASS (tree_code (first_stmt_code))
== tcc_comparison)
&& (swap_tree_comparison (tree_code (first_stmt_code))
- == tree_code (rhs_code))))
+ == tree_code (rhs_code))
+ && (first_reduc_idx == -1
+ || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
|| (ldst_p
&& (STMT_VINFO_GROUPED_ACCESS (stmt_info)
!= STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
@@ -1405,7 +1435,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
{
if (!is_a <gcall *> (stmts[0]->stmt)
|| !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
- call_stmt))
+ call_stmt, true))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1579,8 +1609,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
}
}
- if (rhs_code.is_tree_code ()
- && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
+ if (i != 0
+ && first_stmt_code != rhs_code
+ && first_stmt_code.is_tree_code ()
+ && rhs_code.is_tree_code ()
+ && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
&& (swap_tree_comparison ((tree_code)first_stmt_code)
== (tree_code)rhs_code))
swap[i] = 1;
@@ -2616,13 +2649,14 @@ out:
if (oprnds_info[0]->def_stmts[0]
&& is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+ basic_block bb = nullptr;
for (unsigned j = 0; j < group_size; ++j)
{
FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
{
stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
- if (!stmt_info || !stmt_info->stmt
+ if (!stmt_info
|| !is_a<gassign *> (stmt_info->stmt)
|| gimple_assign_rhs_code (stmt_info->stmt) != code
|| skip_args[i])
@@ -2630,6 +2664,14 @@ out:
success = false;
break;
}
+ /* Avoid mixing lanes with defs in different basic-blocks. */
+ if (!bb)
+ bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
+ else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
+ {
+ success = false;
+ break;
+ }
bool exists;
unsigned &stmt_idx
@@ -2695,6 +2737,10 @@ out:
stmt_info = stmts[0];
+ int reduc_idx = -1;
+ int gs_scale = 0;
+ tree gs_base = NULL_TREE;
+
/* Create SLP_TREE nodes for the definition node/s. */
FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
{
@@ -2717,6 +2763,12 @@ out:
continue;
}
+ if (oprnd_info->first_gs_p)
+ {
+ gs_scale = oprnd_info->first_gs_info.scale;
+ gs_base = oprnd_info->first_gs_info.base;
+ }
+
if (is_a <bb_vec_info> (vinfo)
&& oprnd_info->first_dt == vect_internal_def
&& !oprnd_info->any_pattern)
@@ -2777,6 +2829,33 @@ out:
continue;
}
+ /* See which SLP operand a reduction chain continues on. We want
+ to chain even PHIs but not backedges. */
+ if (VECTORIZABLE_CYCLE_DEF (oprnd_info->first_dt)
+ || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
+ {
+ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
+ {
+ if (oprnd_info->first_dt == vect_double_reduction_def)
+ reduc_idx = i;
+ }
+ else if (is_a <gphi *> (stmt_info->stmt)
+ && gimple_phi_num_args
+ (as_a <gphi *> (stmt_info->stmt)) != 1)
+ ;
+ else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
+ ;
+ else if (reduc_idx == -1)
+ reduc_idx = i;
+ else
+ /* For .COND_* reduction operations the else value can be the
+ same as one of the operation operands. The other def
+ stmts have been moved, so we can't check easily. Check
+ it's a call at least. */
+ gcc_assert (is_a <gcall *> (stmt_info->stmt));
+ }
+
/* When we have a masked load with uniform mask discover this
as a single-lane mask with a splat permute. This way we can
recognize this as a masked load-lane by stripping the splat. */
@@ -2835,9 +2914,10 @@ out:
&& matches[0]
/* ??? For COND_EXPRs we can swap the comparison operands
as well as the arms under some constraints. */
- && nops == 2
+ && (nops == 2 || nops == 3)
&& oprnds_info[1]->first_dt == vect_internal_def
- && is_gimple_assign (stmt_info->stmt)
+ && (is_gimple_assign (stmt_info->stmt)
+ || is_gimple_call (stmt_info->stmt))
/* Swapping operands for reductions breaks assumptions later on. */
&& STMT_VINFO_REDUC_IDX (stmt_info) == -1)
{
@@ -2852,14 +2932,32 @@ out:
continue;
stmt_vec_info stmt_info = stmts[j];
/* Verify if we can swap operands of this stmt. */
- gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
- if (!stmt
- || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
+ if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
{
- if (!swap_not_matching)
- goto fail;
- swap_not_matching = false;
- break;
+ tree_code code = gimple_assign_rhs_code (stmt);
+ if (! commutative_tree_code (code)
+ && ! commutative_ternary_tree_code (code))
+ {
+ if (!swap_not_matching)
+ goto fail;
+ swap_not_matching = false;
+ break;
+ }
+ }
+ else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
+ {
+ internal_fn fn = (gimple_call_internal_p (call)
+ ? gimple_call_internal_fn (call)
+ : IFN_LAST);
+ if ((! commutative_binary_fn_p (fn)
+ && ! commutative_ternary_fn_p (fn))
+ || first_commutative_argument (fn) != 0)
+ {
+ if (!swap_not_matching)
+ goto fail;
+ swap_not_matching = false;
+ break;
+ }
}
}
}
@@ -3045,24 +3143,35 @@ fail:
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_CHILDREN (node).quick_push (one);
SLP_TREE_CHILDREN (node).quick_push (two);
- gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
- enum tree_code code0 = gimple_assign_rhs_code (stmt);
+ enum tree_code code0 = ERROR_MARK;
enum tree_code ocode = ERROR_MARK;
+ if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
+ code0 = gimple_assign_rhs_code (stmt);
stmt_vec_info ostmt_info;
unsigned j = 0;
FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
{
- gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
- if (gimple_assign_rhs_code (ostmt) != code0)
+ int op = 0;
+ if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
{
- SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
- ocode = gimple_assign_rhs_code (ostmt);
- j = i;
+ if (gimple_assign_rhs_code (ostmt) != code0)
+ {
+ ocode = gimple_assign_rhs_code (ostmt);
+ op = 1;
+ j = i;
+ }
}
else
- SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
+ {
+ if (gimple_call_combined_fn (stmts[0]->stmt)
+ != gimple_call_combined_fn (ostmt_info->stmt))
+ {
+ op = 1;
+ j = i;
+ }
+ }
+ SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
}
-
SLP_TREE_CODE (one) = code0;
SLP_TREE_CODE (two) = ocode;
SLP_TREE_LANES (one) = stmts.length ();
@@ -3076,6 +3185,43 @@ fail:
node = vect_create_new_slp_node (node, stmts, nops);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).splice (children);
+ SLP_TREE_GS_SCALE (node) = gs_scale;
+ SLP_TREE_GS_BASE (node) = gs_base;
+ if (reduc_idx != -1)
+ {
+ gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
+ SLP_TREE_REDUC_IDX (node) = reduc_idx;
+ node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
+ }
+ /* When reaching the reduction PHI, create a vect_reduc_info. */
+ else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
+ && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
+ {
+ loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
+ gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
+ node->cycle_info.id = loop_vinfo->reduc_infos.length ();
+ vect_reduc_info reduc_info = new vect_reduc_info_s ();
+ loop_vinfo->reduc_infos.safe_push (reduc_info);
+ stmt_vec_info reduc_phi = stmt_info;
+ /* ??? For double reductions vect_is_simple_reduction stores the
+ reduction type and code on the inner loop header PHI. */
+ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
+ {
+ use_operand_p use_p;
+ gimple *use_stmt;
+ bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
+ &use_p, &use_stmt);
+ gcc_assert (res);
+ reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
+ }
+ VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
+ VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
+ VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
+ VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
+ }
return node;
}
@@ -3104,10 +3250,15 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
SLP_TREE_REF_COUNT (node));
if (SLP_TREE_VECTYPE (node))
dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
+ dump_printf (metadata, "%s",
+ node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
+ if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
+ dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
+ node->cycle_info.reduc_idx);
dump_printf (metadata, "\n");
if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
{
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
else
dump_printf_loc (metadata, user_loc, "op template: %G",
@@ -3359,7 +3510,7 @@ vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
return;
- if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+ if (!SLP_TREE_PERMUTE_P (node))
{
stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
if (STMT_VINFO_DATA_REF (stmt_info)
@@ -3473,7 +3624,7 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
static inline bool
vect_is_slp_load_node (slp_tree root)
{
- return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
+ return (!SLP_TREE_PERMUTE_P (root)
&& SLP_TREE_DEF_TYPE (root) == vect_internal_def
&& STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
&& DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
@@ -3498,7 +3649,7 @@ optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
/* For now, we don't know anything about externals so do not do anything. */
if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
return NULL;
- else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
+ else if (SLP_TREE_PERMUTE_P (root))
{
/* First convert this node into a load node and add it to the leaves
list and flatten the permute from a lane to a load one. If it's
@@ -3903,8 +4054,6 @@ vect_build_slp_instance (vec_info *vinfo,
vec<tree> &remain,
unsigned max_tree_size, unsigned *limit,
scalar_stmts_to_slp_tree_map_t *bst_map,
- /* ??? We need stmt_info for group splitting. */
- stmt_vec_info stmt_info_,
bool force_single_lane)
{
/* If there's no budget left bail out early. */
@@ -3940,7 +4089,6 @@ vect_build_slp_instance (vec_info *vinfo,
bool *matches = XALLOCAVEC (bool, group_size);
poly_uint64 max_nunits = 1;
unsigned tree_size = 0;
- unsigned i;
slp_tree node = NULL;
if (group_size > 1 && force_single_lane)
@@ -4000,63 +4148,345 @@ vect_build_slp_instance (vec_info *vinfo,
"SLP size %u vs. limit %u.\n",
tree_size, max_tree_size);
- /* Fixup SLP reduction chains. */
- if (kind == slp_inst_kind_reduc_chain)
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
{
- /* If this is a reduction chain with a conversion in front
- amend the SLP tree with a node for that. */
- gimple *scalar_def
- = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
- if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+
+ return true;
+ }
+ }
+ /* Failed to SLP. */
+
+ /* While we can arrive here even with slp_inst_kind_store, that should
+ only happen for group_size == 1. The code to split store groups is
+ only in vect_analyze_slp_instance now. */
+ gcc_assert (kind != slp_inst_kind_store || group_size == 1);
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
+ /* Failed to SLP. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
+ return false;
+}
+
+/* Analyze an SLP instance starting from the start of a reduction chain.
+ Call vect_build_slp_tree to build a tree of packed stmts if possible.
+ Return FALSE if SLP build fails. */
+
+static bool
+vect_analyze_slp_reduc_chain (vec_info *vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map,
+ stmt_vec_info stmt_info,
+ unsigned max_tree_size, unsigned *limit)
+{
+ vec<stmt_vec_info> scalar_stmts;
+
+ /* Collect the reduction stmts and store them in scalar_stmts. */
+ scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
+ stmt_vec_info next_info = stmt_info;
+ while (next_info)
+ {
+ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+ next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
+ }
+ /* Mark the first element of the reduction chain as reduction to properly
+ transform the node. In the reduction analysis phase only the last
+ element of the chain is marked as reduction. */
+ STMT_VINFO_DEF_TYPE (stmt_info)
+ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
+ STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
+ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
+
+ /* Build the tree for the SLP instance. */
+ vec<stmt_vec_info> root_stmt_infos = vNULL;
+ vec<tree> remain = vNULL;
+
+ /* If there's no budget left bail out early. */
+ if (*limit == 0)
+ return false;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Starting SLP discovery for\n");
+ for (unsigned i = 0; i < scalar_stmts.length (); ++i)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " %G", scalar_stmts[i]->stmt);
+ }
+
+ /* Build the tree for the SLP instance. */
+ unsigned int group_size = scalar_stmts.length ();
+ bool *matches = XALLOCAVEC (bool, group_size);
+ poly_uint64 max_nunits = 1;
+ unsigned tree_size = 0;
+
+ slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+ &max_nunits, matches, limit,
+ &tree_size, bst_map);
+ if (node != NULL)
+ {
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor
+ = calculate_unrolling_factor (max_nunits, group_size);
+
+ if (maybe_ne (unrolling_factor, 1U)
+ && is_a <bb_vec_info> (vinfo))
+ {
+ unsigned HOST_WIDE_INT const_max_nunits;
+ if (!max_nunits.is_constant (&const_max_nunits)
+ || const_max_nunits > group_size)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Build SLP failed: store group "
+ "size not a multiple of the vector size "
+ "in basic block SLP\n");
+ vect_free_slp_tree (node);
+ return false;
+ }
+ /* Fatal mismatch. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP discovery succeeded but node needs "
+ "splitting\n");
+ memset (matches, true, group_size);
+ matches[group_size / const_max_nunits * const_max_nunits] = false;
+ vect_free_slp_tree (node);
+ }
+ else
+ {
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
+
+ /* Fixup SLP reduction chains. If this is a reduction chain with
+ a conversion in front amend the SLP tree with a node for that. */
+ gimple *scalar_def
+ = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
+ if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
+ {
+ /* Get at the conversion stmt - we know it's the single use
+ of the last stmt of the reduction chain. */
+ use_operand_p use_p;
+ bool r = single_imm_use (gimple_assign_lhs (scalar_def),
+ &use_p, &scalar_def);
+ gcc_assert (r);
+ stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
+ next_info = vect_stmt_to_vectorize (next_info);
+ scalar_stmts = vNULL;
+ scalar_stmts.create (group_size);
+ for (unsigned i = 0; i < group_size; ++i)
+ scalar_stmts.quick_push (next_info);
+ slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
+ SLP_TREE_VECTYPE (conv)
+ = get_vectype_for_scalar_type (vinfo,
+ TREE_TYPE
+ (gimple_assign_lhs (scalar_def)),
+ group_size);
+ SLP_TREE_REDUC_IDX (conv) = 0;
+ conv->cycle_info.id = node->cycle_info.id;
+ SLP_TREE_CHILDREN (conv).quick_push (node);
+ SLP_INSTANCE_TREE (new_instance) = conv;
+ /* We also have to fake this conversion stmt as SLP reduction
+ group so we don't have to mess with too much code
+ elsewhere. */
+ REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
+ REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
+ }
+ /* Fill the backedge child of the PHI SLP node. The
+ general matching code cannot find it because the
+ scalar code does not reflect how we vectorize the
+ reduction. */
+ use_operand_p use_p;
+ imm_use_iterator imm_iter;
+ class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
+ gimple_get_lhs (scalar_def))
+ /* There are exactly two non-debug uses, the reduction
+ PHI and the loop-closed PHI node. */
+ if (!is_gimple_debug (USE_STMT (use_p))
+ && gimple_bb (USE_STMT (use_p)) == loop->header)
{
- /* Get at the conversion stmt - we know it's the single use
- of the last stmt of the reduction chain. */
- use_operand_p use_p;
- bool r = single_imm_use (gimple_assign_lhs (scalar_def),
- &use_p, &scalar_def);
- gcc_assert (r);
- stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
- next_info = vect_stmt_to_vectorize (next_info);
- scalar_stmts = vNULL;
- scalar_stmts.create (group_size);
+ auto_vec<stmt_vec_info, 64> phis (group_size);
+ stmt_vec_info phi_info
+ = vinfo->lookup_stmt (USE_STMT (use_p));
for (unsigned i = 0; i < group_size; ++i)
- scalar_stmts.quick_push (next_info);
- slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
- SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
- SLP_TREE_CHILDREN (conv).quick_push (node);
- SLP_INSTANCE_TREE (new_instance) = conv;
- /* We also have to fake this conversion stmt as SLP reduction
- group so we don't have to mess with too much code
- elsewhere. */
- REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
- REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
+ phis.quick_push (phi_info);
+ slp_tree *phi_node = bst_map->get (phis);
+ unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
+ SLP_TREE_CHILDREN (*phi_node)[dest_idx]
+ = SLP_INSTANCE_TREE (new_instance);
+ SLP_INSTANCE_TREE (new_instance)->refcnt++;
}
- /* Fill the backedge child of the PHI SLP node. The
- general matching code cannot find it because the
- scalar code does not reflect how we vectorize the
- reduction. */
- use_operand_p use_p;
- imm_use_iterator imm_iter;
- class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
- FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
- gimple_get_lhs (scalar_def))
- /* There are exactly two non-debug uses, the reduction
- PHI and the loop-closed PHI node. */
- if (!is_gimple_debug (USE_STMT (use_p))
- && gimple_bb (USE_STMT (use_p)) == loop->header)
- {
- auto_vec<stmt_vec_info, 64> phis (group_size);
- stmt_vec_info phi_info
- = vinfo->lookup_stmt (USE_STMT (use_p));
- for (unsigned i = 0; i < group_size; ++i)
- phis.quick_push (phi_info);
- slp_tree *phi_node = bst_map->get (phis);
- unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
- SLP_TREE_CHILDREN (*phi_node)[dest_idx]
- = SLP_INSTANCE_TREE (new_instance);
- SLP_INSTANCE_TREE (new_instance)->refcnt++;
- }
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+ the number of scalar stmts in the root in a few places.
+ Verify that assumption holds. */
+ gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+ .length () == group_size);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+
+ return true;
+ }
+ }
+ /* Failed to SLP. */
+
+ /* Free the allocated memory. */
+ scalar_stmts.release ();
+
+ /* Failed to SLP. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
+ return false;
+}
+
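
A sketch of the scalar shape the function above is meant to handle (illustrative only; the reduction-chain detection itself happens earlier, during reduction analysis): several statements per iteration chained into the same accumulator, collected here via REDUC_GROUP_NEXT_ELEMENT.

int
sum4 (const short *x, int n)
{
  int s = 0;
  for (int i = 0; i < n; ++i)
    /* Four chained additions into S form a reduction chain of size 4.  */
    s = s + x[4 * i] + x[4 * i + 1] + x[4 * i + 2] + x[4 * i + 3];
  return s;
}
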
+/* Analyze an SLP instance starting from a group of grouped stores. Call
+ vect_build_slp_tree to build a tree of packed stmts if possible.
+ Return FALSE if it's impossible to SLP any stmt in the group. */
+
+static bool
+vect_analyze_slp_instance (vec_info *vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map,
+ stmt_vec_info stmt_info,
+ slp_instance_kind kind,
+ unsigned max_tree_size, unsigned *limit,
+ bool force_single_lane)
+{
+ vec<stmt_vec_info> scalar_stmts;
+
+ if (is_a <bb_vec_info> (vinfo))
+ vect_location = stmt_info->stmt;
+
+ gcc_assert (kind == slp_inst_kind_store);
+
+ /* Collect the stores and store them in scalar_stmts. */
+ scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
+ stmt_vec_info next_info = stmt_info;
+ while (next_info)
+ {
+ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
+ }
+
+ vec<stmt_vec_info> root_stmt_infos = vNULL;
+ vec<tree> remain = vNULL;
+
+ /* Build the tree for the SLP instance. */
+
+ /* If there's no budget left bail out early. */
+ if (*limit == 0)
+ return false;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Starting SLP discovery for\n");
+ for (unsigned i = 0; i < scalar_stmts.length (); ++i)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " %G", scalar_stmts[i]->stmt);
+ }
+
+ /* Build the tree for the SLP instance. */
+ unsigned int group_size = scalar_stmts.length ();
+ bool *matches = XALLOCAVEC (bool, group_size);
+ poly_uint64 max_nunits = 1;
+ unsigned tree_size = 0;
+ unsigned i;
+
+ slp_tree node = NULL;
+ if (group_size > 1 && force_single_lane)
+ {
+ matches[0] = true;
+ matches[1] = false;
+ }
+ else
+ node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+ &max_nunits, matches, limit,
+ &tree_size, bst_map);
+ if (node != NULL)
+ {
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor
+ = calculate_unrolling_factor (max_nunits, group_size);
+
+ if (maybe_ne (unrolling_factor, 1U)
+ && is_a <bb_vec_info> (vinfo))
+ {
+ unsigned HOST_WIDE_INT const_max_nunits;
+ if (!max_nunits.is_constant (&const_max_nunits)
+ || const_max_nunits > group_size)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Build SLP failed: store group "
+ "size not a multiple of the vector size "
+ "in basic block SLP\n");
+ vect_free_slp_tree (node);
+ return false;
}
+ /* Fatal mismatch. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP discovery succeeded but node needs "
+ "splitting\n");
+ memset (matches, true, group_size);
+ matches[group_size / const_max_nunits * const_max_nunits] = false;
+ vect_free_slp_tree (node);
+ }
+ else
+ {
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = node;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+ SLP_INSTANCE_KIND (new_instance) = kind;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP size %u vs. limit %u.\n",
+ tree_size, max_tree_size);
vinfo->slp_instances.safe_push (new_instance);
@@ -4080,7 +4510,6 @@ vect_build_slp_instance (vec_info *vinfo,
}
/* Failed to SLP. */
- stmt_vec_info stmt_info = stmt_info_;
/* Try to break the group up into pieces. */
if (*limit > 0 && kind == slp_inst_kind_store)
{
@@ -4341,70 +4770,6 @@ vect_build_slp_instance (vec_info *vinfo,
return false;
}
-
-/* Analyze an SLP instance starting from a group of grouped stores. Call
- vect_build_slp_tree to build a tree of packed stmts if possible.
- Return FALSE if it's impossible to SLP any stmt in the loop. */
-
-static bool
-vect_analyze_slp_instance (vec_info *vinfo,
- scalar_stmts_to_slp_tree_map_t *bst_map,
- stmt_vec_info stmt_info,
- slp_instance_kind kind,
- unsigned max_tree_size, unsigned *limit,
- bool force_single_lane)
-{
- vec<stmt_vec_info> scalar_stmts;
-
- if (is_a <bb_vec_info> (vinfo))
- vect_location = stmt_info->stmt;
-
- stmt_vec_info next_info = stmt_info;
- if (kind == slp_inst_kind_store)
- {
- /* Collect the stores and store them in scalar_stmts. */
- scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
- while (next_info)
- {
- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
- next_info = DR_GROUP_NEXT_ELEMENT (next_info);
- }
- }
- else if (kind == slp_inst_kind_reduc_chain)
- {
- /* Collect the reduction stmts and store them in scalar_stmts. */
- scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
- while (next_info)
- {
- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
- next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
- }
- /* Mark the first element of the reduction chain as reduction to properly
- transform the node. In the reduction analysis phase only the last
- element of the chain is marked as reduction. */
- STMT_VINFO_DEF_TYPE (stmt_info)
- = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
- STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
- = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
- }
- else
- gcc_unreachable ();
-
- vec<stmt_vec_info> roots = vNULL;
- vec<tree> remain = vNULL;
- /* Build the tree for the SLP instance. */
- bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
- roots, remain,
- max_tree_size, limit, bst_map,
- kind == slp_inst_kind_store
- ? stmt_info : NULL, force_single_lane);
-
- /* ??? If this is slp_inst_kind_store and the above succeeded here's
- where we should do store group splitting. */
-
- return res;
-}
-
/* qsort comparator ordering SLP load nodes. */
static int
@@ -4546,6 +4911,15 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (!SLP_TREE_CHILDREN (load).is_empty ())
continue;
+ /* For single-element interleaving spanning multiple vectors avoid
+ lowering, we want to use VMAT_ELEMENTWISE later. */
+ if (ld_lanes_lanes == 0
+ && SLP_TREE_LANES (load) == 1
+ && !DR_GROUP_NEXT_ELEMENT (first)
+ && maybe_gt (group_lanes,
+ TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
+ return;
+
/* We want to pattern-match special cases here and keep those
alone. Candidates are splats and load-lane. */
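
An illustrative scalar example of the single-element interleaving case the new early-out targets (assuming, say, two-lane vectors so the group of four spans multiple vectors): only one element of each group is actually read, and lowering the load permutation would be wasteful compared to using VMAT_ELEMENTWISE.

double
sum_strided (const double *a, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; ++i)
    s += a[4 * i];	/* group of 4 with gaps; only one lane is used */
  return s;
}
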
@@ -4815,9 +5189,11 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
/* Find SLP sequences starting from groups of grouped stores. */
FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
- vect_analyze_slp_instance (vinfo, bst_map, first_element,
- slp_inst_kind_store, max_tree_size, &limit,
- force_single_lane);
+ if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
+ slp_inst_kind_store, max_tree_size, &limit,
+ force_single_lane)
+ && loop_vinfo)
+ return opt_result::failure_at (vect_location, "SLP build failed.\n");
/* For loops also start SLP discovery from non-grouped stores. */
if (loop_vinfo)
@@ -4835,10 +5211,27 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
vec<tree> remain = vNULL;
stmts.create (1);
stmts.quick_push (stmt_info);
- vect_build_slp_instance (vinfo, slp_inst_kind_store,
- stmts, roots, remain, max_tree_size,
- &limit, bst_map, NULL, force_single_lane);
+ if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
+ stmts, roots, remain, max_tree_size,
+ &limit, bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
}
+
+ stmt_vec_info stmt_info;
+ FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
+ {
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (stmt_info);
+ if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
+ stmts, roots, remain, max_tree_size,
+ &limit, bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
+ }
}
if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
@@ -4854,8 +5247,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
bb_vinfo->roots[i].stmts,
bb_vinfo->roots[i].roots,
bb_vinfo->roots[i].remain,
- max_tree_size, &limit, bst_map, NULL,
- false))
+ max_tree_size, &limit, bst_map, false))
{
bb_vinfo->roots[i].stmts = vNULL;
bb_vinfo->roots[i].roots = vNULL;
@@ -4872,11 +5264,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
&& ! STMT_VINFO_LIVE_P (first_element))
;
else if (force_single_lane
- || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
- slp_inst_kind_reduc_chain,
- max_tree_size, &limit,
- force_single_lane))
+ || ! vect_analyze_slp_reduc_chain (vinfo, bst_map,
+ first_element,
+ max_tree_size, &limit))
{
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "SLP discovery of reduction chain failed\n");
/* Dissolve reduction chain group. */
stmt_vec_info vinfo = first_element;
stmt_vec_info last = NULL;
@@ -4889,6 +5283,11 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
vinfo = next;
}
STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
+ /* ??? When there's a conversion around the reduction
+ chain 'last' isn't the entry of the reduction. */
+ if (STMT_VINFO_DEF_TYPE (last) != vect_reduction_def)
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
/* It can be still vectorized as part of an SLP reduction. */
loop_vinfo->reductions.safe_push (last);
}
@@ -4926,12 +5325,14 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
vec<tree> remain = vNULL;
stmts.create (1);
stmts.quick_push (next_info);
- vect_build_slp_instance (vinfo,
- slp_inst_kind_reduc_group,
- stmts, roots, remain,
- max_tree_size, &limit,
- bst_map, NULL,
- force_single_lane);
+ if (! vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map,
+ force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
}
}
}
@@ -4944,7 +5345,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
slp_inst_kind_reduc_group,
scalar_stmts, roots, remain,
max_tree_size, &limit, bst_map,
- NULL, force_single_lane))
+ force_single_lane))
{
if (scalar_stmts.length () <= 1)
scalar_stmts.release ();
@@ -4956,11 +5357,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
vec<tree> remain = vNULL;
stmts.create (1);
stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
- vect_build_slp_instance (vinfo,
- slp_inst_kind_reduc_group,
- stmts, roots, remain,
- max_tree_size, &limit,
- bst_map, NULL, force_single_lane);
+ if (! vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
}
}
saved_stmts.release ();
@@ -4980,20 +5383,21 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
&& ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
&& STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
&& STMT_VINFO_LIVE_P (stmt_info)
- && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
- || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
- && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
+ && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
+ && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
{
vec<stmt_vec_info> stmts;
vec<stmt_vec_info> roots = vNULL;
vec<tree> remain = vNULL;
stmts.create (1);
stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
- vect_build_slp_instance (vinfo,
- slp_inst_kind_reduc_group,
- stmts, roots, remain,
- max_tree_size, &limit,
- bst_map, NULL, force_single_lane);
+ if (! vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
}
}
@@ -5009,9 +5413,16 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
tree args0 = gimple_cond_lhs (stmt);
tree args1 = gimple_cond_rhs (stmt);
- /* These should be enforced by cond lowering. */
- gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
- gcc_assert (zerop (args1));
+ /* These should be enforced by cond lowering, but if it failed
+ bail. */
+ if (gimple_cond_code (stmt) != NE_EXPR
+ || TREE_TYPE (args0) != boolean_type_node
+ || !integer_zerop (args1))
+ {
+ roots.release ();
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
+ }
/* An argument without a loop def will be codegened from vectorizing the
root gcond itself. As such we don't need to try to build an SLP tree
@@ -5027,8 +5438,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
stmts, roots, remain,
max_tree_size, &limit,
- bst_map, NULL, force_single_lane))
- roots.release ();
+ bst_map, force_single_lane))
+ {
+ roots.release ();
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
+ }
}
/* Find and create slp instances for inductions that have been forced
@@ -5040,14 +5455,19 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
vec<stmt_vec_info> roots = vNULL;
vec<tree> remain = vNULL;
gphi *phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
- stmts.create (1);
tree def = gimple_phi_arg_def_from_edge (phi, latch_e);
stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
- stmts.quick_push (vect_stmt_to_vectorize (lc_info));
- vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
- stmts, roots, remain,
- max_tree_size, &limit,
- bst_map, NULL, force_single_lane);
+ if (lc_info)
+ {
+ stmts.create (1);
+ stmts.quick_push (vect_stmt_to_vectorize (lc_info));
+ if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
+ }
/* When the latch def is from a different cycle this can only
be an induction. Build a simple instance for this.
??? We should be able to start discovery from the PHI
@@ -5057,14 +5477,14 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
tem.quick_push (stmt_info);
if (!bst_map->get (tem))
{
- gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
- == vect_induction_def);
stmts.create (1);
stmts.quick_push (stmt_info);
- vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
- stmts, roots, remain,
- max_tree_size, &limit,
- bst_map, NULL, force_single_lane);
+ if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, force_single_lane))
+ return opt_result::failure_at (vect_location,
+ "SLP build failed.\n");
}
}
}
@@ -5163,7 +5583,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
if (STMT_VINFO_STRIDED_P (stmt_vinfo)
|| compare_step_with_zero (vinfo, stmt_vinfo) <= 0
|| vect_load_lanes_supported
- (STMT_VINFO_VECTYPE (stmt_vinfo),
+ (SLP_TREE_VECTYPE (load_node),
DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
/* ??? During SLP re-discovery with a single lane
a masked grouped load will appear permuted and
@@ -5184,7 +5604,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
|| SLP_TREE_LANES (load_node) == group_size
|| (vect_slp_prefer_store_lanes_p
(vinfo, stmt_vinfo,
- STMT_VINFO_VECTYPE (stmt_vinfo), masked,
+ SLP_TREE_VECTYPE (load_node), masked,
group_size, SLP_TREE_LANES (load_node))));
}
@@ -5804,7 +6224,7 @@ vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
slp_tree use = m_vertices[ud->src].node;
slp_tree def = m_vertices[ud->dest].node;
if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
- || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
+ || SLP_TREE_PERMUTE_P (use))
|| SLP_TREE_DEF_TYPE (def) != vect_internal_def)
return false;
@@ -6137,7 +6557,7 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
{
const int fallback_cost = 1;
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
auto_lane_permutation_t tmp_perm;
tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
@@ -6272,7 +6692,7 @@ vect_optimize_slp_pass::start_choosing_layouts ()
imin = DR_GROUP_SIZE (dr_stmt) + 1;
tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
}
- else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
+ else if (SLP_TREE_PERMUTE_P (node)
&& SLP_TREE_CHILDREN (node).length () == 1
&& (child = SLP_TREE_CHILDREN (node)[0])
&& (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
@@ -6370,10 +6790,12 @@ vect_optimize_slp_pass::start_choosing_layouts ()
{
stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
- stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
+ vect_reduc_info reduc_info
+ = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
+ SLP_INSTANCE_TREE (instance));
if (needs_fold_left_reduction_p (TREE_TYPE
(gimple_get_lhs (stmt_info->stmt)),
- STMT_VINFO_REDUC_CODE (reduc_info)))
+ VECT_REDUC_INFO_CODE (reduc_info)))
{
unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
m_partitions[m_vertices[node_i].partition].layout = 0;
@@ -6590,7 +7012,7 @@ vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
auto &to_costs = partition_layout_costs (to_partition_i,
to_partition.layout);
if (ud->src == int (to_node_i)
- && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
+ && SLP_TREE_PERMUTE_P (to_vertex.node))
{
auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
auto old_layout = from_partition.layout;
@@ -6645,7 +7067,7 @@ vect_optimize_slp_pass::forward_pass ()
{
unsigned int node_i = m_partitioned_nodes[partition.node_begin];
single_node = m_vertices[node_i].node;
- if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (single_node))
in_cost = total_in_cost (node_i);
}
@@ -6742,8 +7164,7 @@ vect_optimize_slp_pass::forward_pass ()
if the VEC_PERM_EXPR can be changed to support output layout
LAYOUT_I while keeping all the provisional choices of input
layout. */
- if (single_node
- && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+ if (single_node && SLP_TREE_PERMUTE_P (single_node))
{
int factor = internal_node_cost (single_node, -1, layout_i);
if (factor >= 0)
@@ -6902,7 +7323,7 @@ vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
in TMP_PERM on success. */
auto_lane_permutation_t tmp_perm;
unsigned int num_inputs = 1;
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
if (from_layout_i != 0)
@@ -6996,7 +7417,7 @@ vect_optimize_slp_pass::materialize ()
SLP_TREE_SCALAR_STMTS (node), true);
/* Update load and lane permutations. */
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
/* First try to absorb the input vector layouts. If that fails,
force the inputs to have layout LAYOUT_I too. We checked that
@@ -7200,7 +7621,7 @@ vect_optimize_slp_pass::dump ()
" out weight: %f (degree %d)\n",
vertex.out_weight.to_double (),
vertex.out_degree);
- if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (vertex.node))
dump_printf_loc (MSG_NOTE, vect_location,
" op: VEC_PERM_EXPR\n");
else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
@@ -7279,7 +7700,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
{
slp_tree node = v.node;
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
- || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ || SLP_TREE_PERMUTE_P (node))
continue;
stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
@@ -7299,7 +7720,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
/* Uniform masks need to be suitably represented. */
slp_tree mask = SLP_TREE_CHILDREN (node)[0];
- if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
+ if (!SLP_TREE_PERMUTE_P (mask)
|| SLP_TREE_CHILDREN (mask).length () != 1)
continue;
bool match = true;
@@ -7318,7 +7739,7 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
{
slp_tree pred_node = m_vertices[pred->src].node;
/* All consumers should be a permute with a single outgoing lane. */
- if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
+ if (!SLP_TREE_PERMUTE_P (pred_node)
|| SLP_TREE_LANES (pred_node) != 1)
{
match = false;
@@ -7479,8 +7900,7 @@ vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
/* For permute nodes that are fed from externs or constants we have to
consider their number of lanes as well. Likewise for store-lanes. */
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
- || node->ldst_lanes)
+ if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
for (slp_tree child : SLP_TREE_CHILDREN (node))
if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
{
@@ -7510,20 +7930,26 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
hash_set<slp_tree> visited;
FOR_EACH_VEC_ELT (slp_instances, i, instance)
{
- /* FORNOW: SLP if you can. */
+ slp_tree root = SLP_INSTANCE_TREE (instance);
+
/* All unroll factors have the form:
GET_MODE_SIZE (vinfo->vector_mode) * X
for some rational X, so they must have a common multiple. */
- vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
- unrolling_factor, visited);
+ vect_update_slp_vf_for_node (root, unrolling_factor, visited);
/* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
loop-based vectorization. Such stmts will be marked as HYBRID. */
- vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
- decided_to_slp++;
+ vect_mark_slp_stmts (loop_vinfo, root);
+
+ /* If all instances ended up with vector(1) T roots make sure to
+ not vectorize. RVV for example relies on loop vectorization
+ when some instances are essentially kept scalar. See PR121048. */
+ if (SLP_TREE_VECTYPE (root)
+ && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
+ decided_to_slp++;
}
LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
@@ -7540,186 +7966,6 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
return (decided_to_slp > 0);
}
-/* Private data for vect_detect_hybrid_slp. */
-struct vdhs_data
-{
- loop_vec_info loop_vinfo;
- vec<stmt_vec_info> *worklist;
-};
-
-/* Walker for walk_gimple_op. */
-
-static tree
-vect_detect_hybrid_slp (tree *tp, int *, void *data)
-{
- walk_stmt_info *wi = (walk_stmt_info *)data;
- vdhs_data *dat = (vdhs_data *)wi->info;
-
- if (wi->is_lhs)
- return NULL_TREE;
-
- stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
- if (!def_stmt_info)
- return NULL_TREE;
- def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
- if (PURE_SLP_STMT (def_stmt_info))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
- def_stmt_info->stmt);
- STMT_SLP_TYPE (def_stmt_info) = hybrid;
- dat->worklist->safe_push (def_stmt_info);
- }
-
- return NULL_TREE;
-}
-
-/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
- if so, otherwise pushing it to WORKLIST. */
-
-static void
-maybe_push_to_hybrid_worklist (vec_info *vinfo,
- vec<stmt_vec_info> &worklist,
- stmt_vec_info stmt_info)
-{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Processing hybrid candidate : %G", stmt_info->stmt);
- stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
- imm_use_iterator iter2;
- ssa_op_iter iter1;
- use_operand_p use_p;
- def_operand_p def_p;
- bool any_def = false;
- FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
- {
- any_def = true;
- FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
- {
- if (is_gimple_debug (USE_STMT (use_p)))
- continue;
- stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
- /* An out-of loop use means this is a loop_vect sink. */
- if (!use_info)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Found loop_vect sink: %G", stmt_info->stmt);
- worklist.safe_push (stmt_info);
- return;
- }
- else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Found loop_vect use: %G", use_info->stmt);
- worklist.safe_push (stmt_info);
- return;
- }
- }
- }
- /* No def means this is a loop_vect sink. Gimple conditionals also don't have a
- def but shouldn't be considered sinks. */
- if (!any_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Found loop_vect sink: %G", stmt_info->stmt);
- worklist.safe_push (stmt_info);
- return;
- }
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
- STMT_SLP_TYPE (stmt_info) = pure_slp;
-}
-
-/* Find stmts that must be both vectorized and SLPed. */
-
-void
-vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
-{
- DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
-
- /* All stmts participating in SLP are marked pure_slp, all other
- stmts are loop_vect.
- First collect all loop_vect stmts into a worklist.
- SLP patterns cause not all original scalar stmts to appear in
- SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
- Rectify this here and do a backward walk over the IL only considering
- stmts as loop_vect when they are used by a loop_vect stmt and otherwise
- mark them as pure_slp. */
- auto_vec<stmt_vec_info> worklist;
- for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
- {
- basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
- for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
- gsi_next (&gsi))
- {
- gphi *phi = gsi.phi ();
- stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
- if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
- maybe_push_to_hybrid_worklist (loop_vinfo,
- worklist, stmt_info);
- }
- for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
- gsi_prev (&gsi))
- {
- gimple *stmt = gsi_stmt (gsi);
- if (is_gimple_debug (stmt))
- continue;
- stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
- if (STMT_VINFO_IN_PATTERN_P (stmt_info))
- {
- for (gimple_stmt_iterator gsi2
- = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
- !gsi_end_p (gsi2); gsi_next (&gsi2))
- {
- stmt_vec_info patt_info
- = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
- if (!STMT_SLP_TYPE (patt_info)
- && STMT_VINFO_RELEVANT (patt_info))
- maybe_push_to_hybrid_worklist (loop_vinfo,
- worklist, patt_info);
- }
- stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
- }
- if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
- maybe_push_to_hybrid_worklist (loop_vinfo,
- worklist, stmt_info);
- }
- }
-
- /* Now we have a worklist of non-SLP stmts, follow use->def chains and
- mark any SLP vectorized stmt as hybrid.
- ??? We're visiting def stmts N times (once for each non-SLP and
- once for each hybrid-SLP use). */
- walk_stmt_info wi;
- vdhs_data dat;
- dat.worklist = &worklist;
- dat.loop_vinfo = loop_vinfo;
- memset (&wi, 0, sizeof (wi));
- wi.info = (void *)&dat;
- while (!worklist.is_empty ())
- {
- stmt_vec_info stmt_info = worklist.pop ();
- /* Since SSA operands are not set up for pattern stmts we need
- to use walk_gimple_op. */
- wi.is_lhs = 0;
- walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
- /* For gather/scatter make sure to walk the offset operand, that
- can be a scaling and conversion away. */
- gather_scatter_info gs_info;
- if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
- && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
- {
- int dummy;
- vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
- }
- }
-}
-
-
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
@@ -7797,18 +8043,19 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
slp_instance node_instance,
stmt_vector_for_cost *cost_vec)
{
- stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
-
/* Calculate the number of vector statements to be created for the scalar
stmts in this node. It is the number of scalar elements in one scalar
iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
elements in a vector. For single-defuse-cycle, lane-reducing op, and
PHI statement that starts reduction comprised of only lane-reducing ops,
the number is more than effective vector statements actually required. */
- SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
+ if (SLP_TREE_VECTYPE (node))
+ SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
+ else
+ SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
/* Handle purely internal nodes. */
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
return false;
@@ -7827,9 +8074,15 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
return true;
}
- bool dummy;
- return vect_analyze_stmt (vinfo, stmt_info, &dummy,
- node, node_instance, cost_vec);
+ return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
+}
+
+static int
+sort_ints (const void *a_, const void *b_)
+{
+ int a = *(const int *)a_;
+ int b = *(const int *)b_;
+ return a - b;
}
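The comparator above compares by subtraction, which cannot wrap for the non-negative basic-block indexes it is applied to; for arbitrary ints an overflow-safe spelling would be needed. A standalone illustration of that alternative (compare_ints is not part of the patch):

/* Overflow-safe qsort comparator: returns -1, 0 or 1 without relying
   on the difference fitting in an int.  */
#include <cstdio>
#include <cstdlib>

static int
compare_ints (const void *a_, const void *b_)
{
  int a = *(const int *) a_;
  int b = *(const int *) b_;
  return (a > b) - (a < b);
}

int
main ()
{
  int v[] = { 5, 2, 9, 2 };
  qsort (v, 4, sizeof (int), compare_ints);
  for (int x : v)
    printf ("%d ", x);        /* 2 2 5 9  */
  printf ("\n");
  return 0;
}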
/* Verify if we can externalize a set of internal defs. */
@@ -7837,16 +8090,57 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
static bool
vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
{
+ /* Constant generation uses get_later_stmt which can only handle
+ defs from the same BB or a set of defs that can be ordered
+ with a dominance query. */
basic_block bb = NULL;
+ bool all_same = true;
+ auto_vec<int> bbs;
+ bbs.reserve_exact (stmts.length ());
for (stmt_vec_info stmt : stmts)
- if (!stmt)
- return false;
- /* Constant generation uses get_later_stmt which can only handle
- defs from the same BB. */
- else if (!bb)
- bb = gimple_bb (stmt->stmt);
- else if (gimple_bb (stmt->stmt) != bb)
+ {
+ if (!stmt)
+ return false;
+ else if (!bb)
+ bb = gimple_bb (stmt->stmt);
+ else if (gimple_bb (stmt->stmt) != bb)
+ all_same = false;
+ bbs.quick_push (gimple_bb (stmt->stmt)->index);
+ }
+ if (all_same)
+ return true;
+
+ /* Produce a vector of unique BB indexes for the defs. */
+ bbs.qsort (sort_ints);
+ unsigned i, j;
+ for (i = 1, j = 1; i < bbs.length (); ++i)
+ if (bbs[i] != bbs[j-1])
+ bbs[j++] = bbs[i];
+ gcc_assert (j >= 2);
+ bbs.truncate (j);
+
+ if (bbs.length () == 2)
+ return (dominated_by_p (CDI_DOMINATORS,
+ BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
+ BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
+ || dominated_by_p (CDI_DOMINATORS,
+ BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
+ BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
+
+ /* ??? For more than two BBs we can sort the vector and verify the
+ result is a total order. But we can't use vec::qsort with a
+ compare function using a dominance query since there's no way to
+ signal failure and any fallback for an unordered pair would
+ fail qsort_chk later.
+     For now simply hope that ordering by BB index provides the
+ best candidate total order. If required we can implement our
+ own mergesort or export an entry without checking. */
+ for (unsigned i = 1; i < bbs.length (); ++i)
+ if (!dominated_by_p (CDI_DOMINATORS,
+ BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
+ BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
return false;
+
return true;
}
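The new logic above sorts the defining basic-block indexes, removes duplicates, and, for more than two blocks, checks that consecutive entries form a dominance chain. A self-contained sketch of that shape, with a toy immediate-dominator array in place of CFG dominance queries; dominated_by and blocks_totally_ordered are illustrative names, not the GCC API:

/* Sketch of the sort + uniquify + "is this a total order" check, with
   idom[b] giving the immediate dominator of block b (the entry block
   dominates itself).  */
#include <algorithm>
#include <cstdio>
#include <vector>

static bool
dominated_by (const std::vector<int> &idom, int bb, int dom)
{
  for (;;)
    {
      if (bb == dom)
	return true;
      if (idom[bb] == bb)
	return false;       /* Reached the entry block.  */
      bb = idom[bb];
    }
}

static bool
blocks_totally_ordered (std::vector<int> bbs, const std::vector<int> &idom)
{
  std::sort (bbs.begin (), bbs.end ());
  bbs.erase (std::unique (bbs.begin (), bbs.end ()), bbs.end ());
  /* Hope that index order is the candidate total order and verify each
     block is dominated by its predecessor in that order.  */
  for (size_t i = 1; i < bbs.size (); ++i)
    if (!dominated_by (idom, bbs[i], bbs[i - 1]))
      return false;
  return true;
}

int
main ()
{
  /* Chain 0 -> 1 -> 2, plus a sibling block 3 dominated only by 0.  */
  std::vector<int> idom = { 0, 0, 1, 0 };
  printf ("%d\n", blocks_totally_ordered ({ 2, 0, 1, 0 }, idom)); /* 1 */
  printf ("%d\n", blocks_totally_ordered ({ 2, 3 }, idom));       /* 0 */
  return 0;
}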
@@ -7975,7 +8269,7 @@ vect_prologue_cost_for_slp (slp_tree node,
we are costing so avoid passing it down more than once. Pass
it to the first vec_construct or scalar_to_vec part since for those
the x86 backend tries to account for GPR to XMM register moves. */
- record_stmt_cost (cost_vec, 1, kind,
+ record_stmt_cost (cost_vec, 1, kind, nullptr,
(kind != vector_load && !passed) ? node : nullptr,
vectype, 0, vect_prologue);
if (kind != vector_load)
@@ -8085,8 +8379,7 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
/* Masked loads can have an undefined (default SSA definition)
else operand. We do not need to cost it. */
vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
- if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
- == load_vec_info_type)
+ if (SLP_TREE_TYPE (node) == load_vec_info_type
&& ((ops.length ()
&& TREE_CODE (ops[0]) == SSA_NAME
&& SSA_NAME_IS_DEFAULT_DEF (ops[0])
@@ -8097,8 +8390,7 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
/* For shifts with a scalar argument we don't need
to cost or code-generate anything.
		      ??? Represent this more explicitly. */
- gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
- == shift_vec_info_type)
+ gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
&& j == 1);
continue;
}
@@ -8474,9 +8766,9 @@ vect_slp_analyze_operations (vec_info *vinfo)
&& !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
/* Check we can vectorize the gcond. */
|| (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
- && !vectorizable_early_exit (vinfo,
+ && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
SLP_INSTANCE_ROOT_STMTS (instance)[0],
- NULL, NULL,
+ NULL,
SLP_INSTANCE_TREE (instance),
&cost_vec)))
{
@@ -8838,7 +9130,7 @@ next_lane:
{
/* Do not directly pass LIFE to the recursive call, copy it to
confine changes in the callee to the current child/subtree. */
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
for (unsigned j = 0;
@@ -9398,14 +9690,13 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
slp_instance instance;
int i;
- poly_uint64 min_vf = 2;
/* The first group of checks is independent of the vector size. */
fatal = true;
/* Analyze the data references. */
- if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
+ if (!vect_analyze_data_refs (bb_vinfo, NULL))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -11118,11 +11409,12 @@ vect_schedule_slp_node (vec_info *vinfo,
stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
- gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
- SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
+ gcc_assert (!SLP_TREE_VECTYPE (node)
+ || SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
+ if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0)
+ SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
- if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
- && STMT_VINFO_DATA_REF (stmt_info))
+ if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
{
/* Vectorized loads go before the first scalar load to make it
ready early, vectorized stores go before the last scalar
@@ -11134,10 +11426,10 @@ vect_schedule_slp_node (vec_info *vinfo,
last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
si = gsi_for_stmt (last_stmt_info->stmt);
}
- else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
- && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
- || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
+ else if (!SLP_TREE_PERMUTE_P (node)
+ && (SLP_TREE_TYPE (node) == cycle_phi_info_type
+ || SLP_TREE_TYPE (node) == induc_vec_info_type
+ || SLP_TREE_TYPE (node) == phi_info_type))
{
/* For PHI node vectorization we do not use the insertion iterator. */
si = gsi_none ();
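The insertion-point policy described in this hunk amounts to: vector loads are emitted before the group's earliest scalar load, vector stores before its latest scalar store, and PHI nodes use no iterator at all. A hedged sketch using plain integer positions instead of gimple statement iterators (insertion_point is an illustrative helper):

/* Pick the scalar position a vector statement is emitted before:
   the first scalar load for loads, the last scalar store for stores.  */
#include <algorithm>
#include <cstdio>
#include <vector>

static int
insertion_point (const std::vector<int> &scalar_positions, bool is_store)
{
  return is_store
	 ? *std::max_element (scalar_positions.begin (), scalar_positions.end ())
	 : *std::min_element (scalar_positions.begin (), scalar_positions.end ());
}

int
main ()
{
  std::vector<int> group = { 12, 7, 9 };
  printf ("load before %d, store before %d\n",
	  insertion_point (group, false),    /* 7  */
	  insertion_point (group, true));    /* 12 */
  return 0;
}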
@@ -11157,8 +11449,7 @@ vect_schedule_slp_node (vec_info *vinfo,
last scalar def here. */
if (SLP_TREE_VEC_DEFS (child).is_empty ())
{
- gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
- == cycle_phi_info_type);
+ gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
gphi *phi = as_a <gphi *>
(vect_find_last_scalar_stmt_in_slp (child)->stmt);
if (!last_stmt)
@@ -11199,7 +11490,11 @@ vect_schedule_slp_node (vec_info *vinfo,
&& !SSA_NAME_IS_DEFAULT_DEF (def))
{
gimple *stmt = SSA_NAME_DEF_STMT (def);
- if (!last_stmt)
+ if (gimple_uid (stmt) == -1u)
+ /* If the stmt is not inside the region do not
+		 use it as a possible insertion point. */
+ ;
+ else if (!last_stmt)
last_stmt = stmt;
else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
last_stmt = stmt;
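The new gimple_uid (stmt) == -1u test above skips definitions outside the vectorized region; among the remaining defs the walk keeps the latest one, replacing the running candidate whenever it dominates the next definition. A standalone sketch of that selection in which a linear program order stands in for the dominance query; def and latest_def are illustrative, not GCC types:

/* A definition at position -1 is outside the region (the analogue of
   gimple_uid () == -1u) and is never used as an insertion point.  */
#include <cstdio>
#include <vector>

struct def
{
  int position;
};

/* Keep the latest in-region definition; with real statements this is
   the "replace last_stmt when it dominates stmt" walk above.  */
static const def *
latest_def (const std::vector<def> &defs)
{
  const def *last = nullptr;
  for (const def &d : defs)
    {
      if (d.position < 0)
	continue;
      if (!last || last->position < d.position)
	last = &d;
    }
  return last;
}

int
main ()
{
  std::vector<def> defs = { { 4 }, { -1 }, { 9 }, { 2 } };
  printf ("insert after position %d\n", latest_def (defs)->position); /* 9 */
  return 0;
}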
@@ -11255,7 +11550,7 @@ vect_schedule_slp_node (vec_info *vinfo,
si = gsi_after_labels (vinfo->bbs[0]);
}
else if (is_a <bb_vec_info> (vinfo)
- && SLP_TREE_CODE (node) != VEC_PERM_EXPR
+ && !SLP_TREE_PERMUTE_P (node)
&& gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
&& gimple_could_trap_p (stmt_info->stmt))
{
@@ -11300,12 +11595,12 @@ vect_schedule_slp_node (vec_info *vinfo,
}
/* Handle purely internal nodes. */
- if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ if (SLP_TREE_PERMUTE_P (node))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"------>vectorizing SLP permutation node\n");
- /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
+ /* ??? the transform kind was stored to STMT_VINFO_TYPE which might
be shared with different SLP nodes (but usually it's the same
operation apart from the case the stmt is only there for denoting
the actual scalar lane defs ...). So do not call vect_transform_stmt
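What the permutation node computes is a lane selection over its children: SLP_TREE_LANE_PERMUTATION lists (operand, lane) pairs and each output lane picks that element, much like a VEC_PERM_EXPR over the concatenated inputs. A minimal sketch with plain arrays; apply_lane_permutation is an illustrative helper, not the vectorizer's code generator:

/* Illustrative lane permutation: each output lane selects
   (operand index, lane index) from the children's scalar lanes.  */
#include <cstdio>
#include <utility>
#include <vector>

static std::vector<int>
apply_lane_permutation (const std::vector<std::vector<int>> &operands,
			const std::vector<std::pair<int, int>> &perm)
{
  std::vector<int> result;
  for (const std::pair<int, int> &p : perm)
    result.push_back (operands[p.first][p.second]);
  return result;
}

int
main ()
{
  std::vector<std::vector<int>> ops = { { 10, 11, 12, 13 },
					{ 20, 21, 22, 23 } };
  /* Interleave the low halves of the two operands.  */
  std::vector<int> r
    = apply_lane_permutation (ops, { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } });
  for (int v : r)
    printf ("%d ", v);	/* 10 20 11 21  */
  printf ("\n");
  return 0;
}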
@@ -11361,12 +11656,10 @@ vect_remove_slp_scalar_calls (vec_info *vinfo,
{
if (!stmt_info)
continue;
+ stmt_info = vect_orig_stmt (stmt_info);
gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
if (!stmt || gimple_bb (stmt) == NULL)
continue;
- if (is_pattern_stmt_p (stmt_info)
- || !PURE_SLP_STMT (stmt_info))
- continue;
lhs = gimple_call_lhs (stmt);
if (lhs)
new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
@@ -11504,10 +11797,9 @@ vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance i
auto root_stmt_info = instance->root_stmts[0];
auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
- gimple *vec_stmt = NULL;
gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
- bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
- &vec_stmt, node, NULL);
+ bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
+ root_stmt_info, &rgsi, node, NULL);
gcc_assert (res);
return;
}
@@ -11582,7 +11874,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
stack.pop ();
info->on_stack = false;
vect_schedule_slp_node (vinfo, node, instance);
- if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+ if (!SLP_TREE_PERMUTE_P (node)
&& is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
phis_to_fixup.quick_push (node);
}
@@ -11605,7 +11897,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
slp_tree entry = stack[idx];
if (!entry)
continue;
- bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
+ bool phi = (!SLP_TREE_PERMUTE_P (entry)
&& is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
bool ready = !phi;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)