Diffstat (limited to 'gcc/tree-vect-slp.cc')
-rw-r--r-- | gcc/tree-vect-slp.cc | 172
1 file changed, 123 insertions, 49 deletions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7a828ca..ca14a2d 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -118,18 +118,19 @@ _slp_tree::_slp_tree ()
   SLP_TREE_CHILDREN (this) = vNULL;
   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
-  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
   this->ldst_lanes = false;
   this->avoid_stlf_fail = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
-  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
+  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_UNINITIALIZED;
   SLP_TREE_REF_COUNT (this) = 1;
   this->failed = NULL;
   this->max_nunits = 1;
   this->lanes = 0;
+  SLP_TREE_TYPE (this) = undef_vec_info_type;
+  this->data = NULL;
 }
 
 /* Tear down a SLP node.  */
@@ -148,9 +149,10 @@ _slp_tree::~_slp_tree ()
   SLP_TREE_VEC_DEFS (this).release ();
   SLP_TREE_LOAD_PERMUTATION (this).release ();
   SLP_TREE_LANE_PERMUTATION (this).release ();
-  SLP_TREE_SIMD_CLONE_INFO (this).release ();
   if (this->failed)
     free (failed);
+  if (this->data)
+    delete this->data;
 }
 
 /* Push the single SSA definition in DEF to the vector of vector defs.  */
@@ -507,19 +509,21 @@ vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 	      && (dtb == vect_external_def || dtb == vect_constant_def)));
 }
 
+#define GATHER_SCATTER_OFFSET (-3)
+
 static const int no_arg_map[] = { 0 };
 static const int arg0_map[] = { 1, 0 };
-static const int arg1_map[] = { 1, 1 };
+static const int arg2_map[] = { 1, 2 };
 static const int arg2_arg3_map[] = { 2, 2, 3 };
-static const int arg1_arg3_map[] = { 2, 1, 3 };
-static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
-static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
+static const int arg2_arg4_map[] = { 2, 2, 4 };
+static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
+static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
 static const int arg3_arg2_map[] = { 2, 3, 2 };
 static const int op1_op0_map[] = { 2, 1, 0 };
-static const int off_map[] = { 1, -3 };
-static const int off_op0_map[] = { 2, -3, 0 };
-static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
-static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
+static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
+static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
+static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
+static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
 static const int mask_call_maps[6][7] = {
   { 1, 1, },
   { 2, 1, 2, },
@@ -568,18 +572,18 @@ vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
 	return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
 
       case IFN_GATHER_LOAD:
-	return arg1_map;
+	return arg2_map;
 
       case IFN_MASK_GATHER_LOAD:
       case IFN_MASK_LEN_GATHER_LOAD:
-	return arg1_arg4_arg5_map;
+	return arg2_arg5_arg6_map;
 
       case IFN_SCATTER_STORE:
-	return arg1_arg3_map;
+	return arg2_arg4_map;
 
       case IFN_MASK_SCATTER_STORE:
       case IFN_MASK_LEN_SCATTER_STORE:
-	return arg1_arg3_arg4_map;
+	return arg2_arg4_arg5_map;
 
       case IFN_MASK_STORE:
 	return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
@@ -691,7 +695,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
     {
       oprnd_info = (*oprnds_info)[i];
       int opno = map ? map[i] : int (i);
-      if (opno == -3)
+      if (opno == GATHER_SCATTER_OFFSET)
 	{
 	  gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 	  if (!is_a <loop_vec_info> (vinfo)
@@ -1114,6 +1118,16 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	  matches[0] = false;
 	  return false;
 	}
+      if (is_a <bb_vec_info> (vinfo)
+	  && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "Build SLP failed: not using single lane "
+			     "vector type %T\n", vectype);
+	  matches[0] = false;
+	  return false;
+	}
       /* Record nunits required but continue analysis, producing
 	 matches[] as if nunits was not an issue.  This allows splitting
 	 of groups to happen.  */
@@ -4067,7 +4081,12 @@ vect_build_slp_instance (vec_info *vinfo,
 	  for (unsigned i = 0; i < group_size; ++i)
 	    scalar_stmts.quick_push (next_info);
 	  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
-	  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
+	  SLP_TREE_VECTYPE (conv)
+	    = get_vectype_for_scalar_type (vinfo,
+					   TREE_TYPE
+					     (gimple_assign_lhs
+					       (scalar_def)),
+					   group_size);
 	  SLP_TREE_CHILDREN (conv).quick_push (node);
 	  SLP_INSTANCE_TREE (new_instance) = conv;
 	  /* We also have to fake this conversion stmt as SLP reduction
@@ -4931,6 +4950,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
 					  max_tree_size, &limit,
 					  force_single_lane))
 	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "SLP discovery of reduction chain failed\n");
 	      /* Dissolve reduction chain group.  */
 	      stmt_vec_info vinfo = first_element;
 	      stmt_vec_info last = NULL;
@@ -5063,9 +5085,15 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
 	  tree args0 = gimple_cond_lhs (stmt);
 	  tree args1 = gimple_cond_rhs (stmt);
 
-	  /* These should be enforced by cond lowering.  */
-	  gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
-	  gcc_assert (zerop (args1));
+	  /* These should be enforced by cond lowering, but if it failed
+	     bail.  */
+	  if (gimple_cond_code (stmt) != NE_EXPR
+	      || TREE_TYPE (args0) != boolean_type_node
+	      || !integer_zerop (args1))
+	    {
+	      roots.release ();
+	      continue;
+	    }
 
 	  /* An argument without a loop def will be codegened from vectorizing
 	     the root gcond itself.  As such we don't need to try to build an SLP tree
@@ -5218,7 +5246,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
 		if (STMT_VINFO_STRIDED_P (stmt_vinfo)
 		    || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
 		    || vect_load_lanes_supported
-			 (STMT_VINFO_VECTYPE (stmt_vinfo),
+			 (SLP_TREE_VECTYPE (load_node),
 			  DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
 		    /* ??? During SLP re-discovery with a single lane
 		       a masked grouped load will appear permuted and
@@ -5239,7 +5267,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
 			|| SLP_TREE_LANES (load_node) == group_size
 			|| (vect_slp_prefer_store_lanes_p
 			      (vinfo, stmt_vinfo,
-			       STMT_VINFO_VECTYPE (stmt_vinfo), masked,
+			       SLP_TREE_VECTYPE (load_node), masked,
 			       group_size, SLP_TREE_LANES (load_node))));
 		}
 
@@ -7565,20 +7593,25 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
-      /* FORNOW: SLP if you can.  */
+      slp_tree root = SLP_INSTANCE_TREE (instance);
+
       /* All unroll factors have the form:
 
 	   GET_MODE_SIZE (vinfo->vector_mode) * X
 
 	 for some rational X, so they must have a common multiple.  */
-      vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
-				   unrolling_factor, visited);
+      vect_update_slp_vf_for_node (root, unrolling_factor, visited);
 
       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
-      vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
-      decided_to_slp++;
+      vect_mark_slp_stmts (loop_vinfo, root);
+
+      /* If all instances ended up with vector(1) T roots make sure to
+	 not vectorize.  RVV for example relies on loop vectorization
+	 when some instances are essentially kept scalar.  See PR121048.  */
+      if (known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
+	decided_to_slp++;
     }
 
   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
@@ -7691,7 +7724,7 @@ maybe_push_to_hybrid_worklist (vec_info *vinfo,
 
 /* Find stmts that must be both vectorized and SLPed.  */
 
-void
+bool
 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
 {
   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
@@ -7772,6 +7805,52 @@ vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
 	    vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
 	}
     }
+
+  /* Determine if all the stmts in the loop can be SLPed.  */
+  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; i++)
+    {
+      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
+      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+	   gsi_next (&si))
+	{
+	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
+	  if (!stmt_info)
+	    continue;
+	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
+	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+	      && !PURE_SLP_STMT (stmt_info))
+	    {
+	      /* STMT needs both SLP and loop-based vectorization.  */
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Loop contains SLP and non-SLP stmts\n");
+	      return false;
+	    }
+	}
+      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
+	   gsi_next (&si))
+	{
+	  if (is_gimple_debug (gsi_stmt (si)))
+	    continue;
+	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
+	  stmt_info = vect_stmt_to_vectorize (stmt_info);
+	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
+	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+	      && !PURE_SLP_STMT (stmt_info))
+	    {
+	      /* STMT needs both SLP and loop-based vectorization.  */
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Loop contains SLP and non-SLP stmts\n");
+	      return false;
+	    }
+	}
+    }
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "Loop contains only SLP stmts\n");
+  return true;
 }
@@ -7852,8 +7931,6 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 				    slp_instance node_instance,
 				    stmt_vector_for_cost *cost_vec)
 {
-  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
-
   /* Calculate the number of vector statements to be created for the
      scalar stmts in this node.  It is the number of scalar elements in one
     scalar iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
@@ -7882,9 +7959,7 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
       return true;
     }
 
-  bool dummy;
-  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
-			    node, node_instance, cost_vec);
+  return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
 }
 
 static int
@@ -8189,8 +8264,7 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
 	      /* Masked loads can have an undefined (default SSA definition)
		 else operand.  We do not need to cost it.  */
 	      vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
-	      if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
-		   == load_vec_info_type)
+	      if (SLP_TREE_TYPE (node) == load_vec_info_type
 		  && ((ops.length ()
 		       && TREE_CODE (ops[0]) == SSA_NAME
 		       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
@@ -8201,8 +8275,7 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
 	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ??? Represent this more explicitely.  */
-	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
-			   == shift_vec_info_type)
+	      gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
 			  && j == 1);
 	      continue;
 	    }
@@ -8580,7 +8653,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
 	      || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
		  && !vectorizable_early_exit (vinfo,
					       SLP_INSTANCE_ROOT_STMTS (instance)[0],
-					       NULL, NULL,
+					       NULL,
					       SLP_INSTANCE_TREE (instance),
					       &cost_vec)))
 	    {
@@ -9502,14 +9575,13 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   slp_instance instance;
   int i;
-  poly_uint64 min_vf = 2;
 
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
 
   /* Analyze the data references.  */
 
-  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
+  if (!vect_analyze_data_refs (bb_vinfo, NULL))
     {
       if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -11239,9 +11311,9 @@ vect_schedule_slp_node (vec_info *vinfo,
       si = gsi_for_stmt (last_stmt_info->stmt);
     }
   else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
-	   && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
-	       || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
-	       || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
+	   && (SLP_TREE_TYPE (node) == cycle_phi_info_type
+	       || SLP_TREE_TYPE (node) == induc_vec_info_type
+	       || SLP_TREE_TYPE (node) == phi_info_type))
     {
       /* For PHI node vectorization we do not use the insertion iterator.  */
       si = gsi_none ();
@@ -11261,8 +11333,7 @@ vect_schedule_slp_node (vec_info *vinfo,
		 last scalar def here.  */
 	      if (SLP_TREE_VEC_DEFS (child).is_empty ())
		{
-		  gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
-			      == cycle_phi_info_type);
+		  gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
		  gphi *phi = as_a <gphi *>
				(vect_find_last_scalar_stmt_in_slp (child)->stmt);
		  if (!last_stmt)
@@ -11303,7 +11374,11 @@ vect_schedule_slp_node (vec_info *vinfo,
		    && !SSA_NAME_IS_DEFAULT_DEF (def))
		  {
		    gimple *stmt = SSA_NAME_DEF_STMT (def);
-		    if (!last_stmt)
+		    if (gimple_uid (stmt) == -1u)
+		      /* If the stmt is not inside the region do not
+			 use it as possible insertion point.  */
+		      ;
+		    else if (!last_stmt)
		      last_stmt = stmt;
		    else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
		      last_stmt = stmt;
@@ -11409,7 +11484,7 @@ vect_schedule_slp_node (vec_info *vinfo,
       if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>vectorizing SLP permutation node\n");
-      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
+      /* ??? the transform kind was stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
@@ -11608,10 +11683,9 @@ vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance i
       auto root_stmt_info = instance->root_stmts[0];
       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
-      gimple *vec_stmt = NULL;
       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
       bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
-					  &vec_stmt, node, NULL);
+					  node, NULL);
       gcc_assert (res);
       return;
     }
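
Note on the renamed operand maps in the hunk at line 509: they follow the file's existing encoding, where element 0 is the operand count, the remaining elements are the call-argument indices SLP operands are taken from, and GATHER_SCATTER_OFFSET (-3) stands in for the implicit gather/scatter offset operand (compare the opno == GATHER_SCATTER_OFFSET check in vect_get_and_check_slp_defs). Below is a minimal standalone sketch of decoding such a map; walk_operand_map and the demo output are illustrative only, not GCC code.

/* Illustrative only: decode an operand map of the shape used by
   vect_get_operand_map, where map[0] is the operand count and
   map[1..map[0]] are call-argument indices.  */
#include <cstdio>

#define GATHER_SCATTER_OFFSET (-3)

/* After the patch, masked loads with gather/scatter use the offset
   plus call arguments 2 and 3.  */
static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };

static void
walk_operand_map (const int *map)
{
  int nops = map[0];
  for (int i = 1; i <= nops; ++i)
    {
      if (map[i] == GATHER_SCATTER_OFFSET)
	std::printf ("operand %d: gather/scatter offset\n", i - 1);
      else
	std::printf ("operand %d: call argument %d\n", i - 1, map[i]);
    }
}

int
main ()
{
  walk_operand_map (off_arg2_arg3_map);
  return 0;
}

Run against off_arg2_arg3_map this prints the offset operand followed by call arguments 2 and 3, which is why the patch can renumber the maps (arg1_* becoming arg2_* and so on) without touching the consuming loop in vect_get_and_check_slp_defs.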