Diffstat (limited to 'gcc/tree-vect-loop.cc')
-rw-r--r-- | gcc/tree-vect-loop.cc | 1665
1 file changed, 534 insertions(+), 1131 deletions(-)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index fe6f3cf..56f80db 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-eh.h" #include "case-cfn-macros.h" #include "langhooks.h" +#include "opts.h" /* Loop Vectorization Pass. @@ -167,9 +168,8 @@ static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, may already be set for general statements (not just data refs). */ static opt_result -vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, - bool vectype_maybe_set_p, - poly_uint64 *vf) +vect_determine_vectype_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, + bool vectype_maybe_set_p) { gimple *stmt = stmt_info->stmt; @@ -191,6 +191,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, if (stmt_vectype) { + if (known_le (TYPE_VECTOR_SUBPARTS (stmt_vectype), 1U)) + return opt_result::failure_at (STMT_VINFO_STMT (stmt_info), + "not vectorized: unsupported " + "data-type in %G", + STMT_VINFO_STMT (stmt_info)); + if (STMT_VINFO_VECTYPE (stmt_info)) /* The only case when a vectype had been already set is for stmts that contain a data ref, or for "pattern-stmts" (stmts generated @@ -202,9 +208,6 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; } - if (nunits_vectype) - vect_update_max_nunits (vf, nunits_vectype); - return opt_result::success (); } @@ -214,13 +217,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, or false if something prevented vectorization. */ static opt_result -vect_determine_vf_for_stmt (vec_info *vinfo, - stmt_vec_info stmt_info, poly_uint64 *vf) +vect_determine_vectype_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G", stmt_info->stmt); - opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf); + opt_result res = vect_determine_vectype_for_stmt_1 (vinfo, stmt_info, false); if (!res) return res; @@ -239,7 +241,7 @@ vect_determine_vf_for_stmt (vec_info *vinfo, dump_printf_loc (MSG_NOTE, vect_location, "==> examining pattern def stmt: %G", def_stmt_info->stmt); - res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf); + res = vect_determine_vectype_for_stmt_1 (vinfo, def_stmt_info, true); if (!res) return res; } @@ -248,7 +250,7 @@ vect_determine_vf_for_stmt (vec_info *vinfo, dump_printf_loc (MSG_NOTE, vect_location, "==> examining pattern statement: %G", stmt_info->stmt); - res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf); + res = vect_determine_vectype_for_stmt_1 (vinfo, stmt_info, true); if (!res) return res; } @@ -256,45 +258,23 @@ vect_determine_vf_for_stmt (vec_info *vinfo, return opt_result::success (); } -/* Function vect_determine_vectorization_factor - - Determine the vectorization factor (VF). VF is the number of data elements - that are operated upon in parallel in a single iteration of the vectorized - loop. For example, when vectorizing a loop that operates on 4byte elements, - on a target with vector size (VS) 16byte, the VF is set to 4, since 4 - elements can fit in a single vector register. - - We currently support vectorization of loops in which all types operated upon - are of the same size. Therefore this function currently sets VF according to - the size of the types operated upon, and fails if there are multiple sizes - in the loop. 
- - VF is also the factor by which the loop iterations are strip-mined, e.g.: - original loop: - for (i=0; i<N; i++){ - a[i] = b[i] + c[i]; - } +/* Function vect_set_stmts_vectype - vectorized loop: - for (i=0; i<N; i+=VF){ - a[i:VF] = b[i:VF] + c[i:VF]; - } -*/ + Set STMT_VINFO_VECTYPE of all stmts. */ static opt_result -vect_determine_vectorization_factor (loop_vec_info loop_vinfo) +vect_set_stmts_vectype (loop_vec_info loop_vinfo) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); unsigned nbbs = loop->num_nodes; - poly_uint64 vectorization_factor = 1; tree scalar_type = NULL_TREE; gphi *phi; tree vectype; stmt_vec_info stmt_info; unsigned i; - DUMP_VECT_SCOPE ("vect_determine_vectorization_factor"); + DUMP_VECT_SCOPE ("vect_set_stmts_vectype"); for (i = 0; i < nbbs; i++) { @@ -323,7 +303,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) scalar_type); vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); - if (!vectype) + if (!vectype + || known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U)) return opt_result::failure_at (phi, "not vectorized: unsupported " "data-type %T\n", @@ -333,15 +314,6 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype); - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); - dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype)); - dump_printf (MSG_NOTE, "\n"); - } - - vect_update_max_nunits (&vectorization_factor, vectype); } } @@ -352,25 +324,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) continue; stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); opt_result res - = vect_determine_vf_for_stmt (loop_vinfo, - stmt_info, &vectorization_factor); + = vect_determine_vectype_for_stmt (loop_vinfo, stmt_info); if (!res) return res; } } - /* TODO: Analyze cost. Decide if worth while to vectorize. */ - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); - dump_dec (MSG_NOTE, vectorization_factor); - dump_printf (MSG_NOTE, "\n"); - } - - if (known_le (vectorization_factor, 1U)) - return opt_result::failure_at (vect_location, - "not vectorized: unsupported data-type\n"); - LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; return opt_result::success (); } @@ -1069,10 +1028,12 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) using_decrementing_iv_p (false), using_select_vl_p (false), epil_using_partial_vectors_p (false), + allow_mutual_alignment (false), partial_load_store_bias (0), peeling_for_gaps (false), peeling_for_niter (false), early_breaks (false), + user_unroll (false), no_data_dependencies (false), has_mask_store (false), scalar_loop_scaling (profile_probability::uninitialized ()), @@ -1999,234 +1960,6 @@ vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, -/* Scan the loop stmts and dependent on whether there are any (non-)SLP - statements update the vectorization factor. 
*/ - -static void -vect_update_vf_for_slp (loop_vec_info loop_vinfo) -{ - class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); - int nbbs = loop->num_nodes; - poly_uint64 vectorization_factor; - int i; - - DUMP_VECT_SCOPE ("vect_update_vf_for_slp"); - - vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - gcc_assert (known_ne (vectorization_factor, 0U)); - - /* If all the stmts in the loop can be SLPed, we perform only SLP, and - vectorization factor of the loop is the unrolling factor required by - the SLP instances. If that unrolling factor is 1, we say, that we - perform pure SLP on loop - cross iteration parallelism is not - exploited. */ - bool only_slp_in_loop = true; - for (i = 0; i < nbbs; i++) - { - basic_block bb = bbs[i]; - for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); - gsi_next (&si)) - { - stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ()); - if (!stmt_info) - continue; - if ((STMT_VINFO_RELEVANT_P (stmt_info) - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) - && !PURE_SLP_STMT (stmt_info)) - /* STMT needs both SLP and loop-based vectorization. */ - only_slp_in_loop = false; - } - for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); - gsi_next (&si)) - { - if (is_gimple_debug (gsi_stmt (si))) - continue; - stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); - stmt_info = vect_stmt_to_vectorize (stmt_info); - if ((STMT_VINFO_RELEVANT_P (stmt_info) - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) - && !PURE_SLP_STMT (stmt_info)) - /* STMT needs both SLP and loop-based vectorization. */ - only_slp_in_loop = false; - } - } - - if (only_slp_in_loop) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Loop contains only SLP stmts\n"); - vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); - } - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Loop contains SLP and non-SLP stmts\n"); - /* Both the vectorization factor and unroll factor have the form - GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, - so they must have a common multiple. */ - vectorization_factor - = force_common_multiple (vectorization_factor, - LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); - } - - LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Updating vectorization factor to "); - dump_dec (MSG_NOTE, vectorization_factor); - dump_printf (MSG_NOTE, ".\n"); - } -} - -/* Return true if STMT_INFO describes a double reduction phi and if - the other phi in the reduction is also relevant for vectorization. - This rejects cases such as: - - outer1: - x_1 = PHI <x_3(outer2), ...>; - ... - - inner: - x_2 = ...; - ... - - outer2: - x_3 = PHI <x_2(inner)>; - - if nothing in x_2 or elsewhere makes x_1 relevant. */ - -static bool -vect_active_double_reduction_p (stmt_vec_info stmt_info) -{ - if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) - return false; - - return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)); -} - -/* Function vect_analyze_loop_operations. - - Scan the loop stmts and make sure they are all vectorizable. 
*/ - -static opt_result -vect_analyze_loop_operations (loop_vec_info loop_vinfo) -{ - class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); - int nbbs = loop->num_nodes; - int i; - stmt_vec_info stmt_info; - - DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); - - for (i = 0; i < nbbs; i++) - { - basic_block bb = bbs[i]; - - for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); - gsi_next (&si)) - { - gphi *phi = si.phi (); - - stmt_info = loop_vinfo->lookup_stmt (phi); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", - (gimple *) phi); - if (virtual_operand_p (gimple_phi_result (phi))) - continue; - - /* ??? All of the below unconditional FAILs should be in - done earlier after analyzing cycles, possibly when - determining stmt relevancy? */ - - /* Inner-loop loop-closed exit phi in outer-loop vectorization - (i.e., a phi in the tail of the outer-loop). */ - if (! is_loop_header_bb_p (bb)) - { - /* FORNOW: we currently don't support the case that these phis - are not used in the outerloop (unless it is double reduction, - i.e., this phi is vect_reduction_def), cause this case - requires to actually do something here. */ - if (STMT_VINFO_LIVE_P (stmt_info) - && !vect_active_double_reduction_p (stmt_info)) - return opt_result::failure_at (phi, - "Unsupported loop-closed phi" - " in outer-loop.\n"); - - /* If PHI is used in the outer loop, we check that its operand - is defined in the inner loop. */ - if (STMT_VINFO_RELEVANT_P (stmt_info)) - { - tree phi_op; - - if (gimple_phi_num_args (phi) != 1) - return opt_result::failure_at (phi, "unsupported phi"); - - phi_op = PHI_ARG_DEF (phi, 0); - stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); - if (!op_def_info) - return opt_result::failure_at (phi, "unsupported phi\n"); - - if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer - && (STMT_VINFO_RELEVANT (op_def_info) - != vect_used_in_outer_by_reduction)) - return opt_result::failure_at (phi, "unsupported phi\n"); - - if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def - || (STMT_VINFO_DEF_TYPE (stmt_info) - == vect_double_reduction_def)) - && ! PURE_SLP_STMT (stmt_info)) - return opt_result::failure_at (phi, "unsupported phi\n"); - } - - continue; - } - - gcc_assert (stmt_info); - - if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope - || STMT_VINFO_LIVE_P (stmt_info)) - && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def - && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) - /* A scalar-dependence cycle that we don't support. */ - return opt_result::failure_at (phi, - "not vectorized:" - " scalar dependence cycle.\n"); - - if (STMT_VINFO_RELEVANT_P (stmt_info) - && ! PURE_SLP_STMT (stmt_info)) - return opt_result::failure_at (phi, - "not vectorized: relevant phi not " - "supported: %G", - static_cast <gimple *> (phi)); - } - - for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); - gsi_next (&si)) - { - gimple *stmt = gsi_stmt (si); - if (!gimple_clobber_p (stmt) - && !is_gimple_debug (stmt)) - { - bool need_to_vectorize = false; - opt_result res - = vect_analyze_stmt (loop_vinfo, - loop_vinfo->lookup_stmt (stmt), - &need_to_vectorize, - NULL, NULL, NULL); - if (!res) - return res; - } - } - } /* bbs */ - - return opt_result::success (); -} - /* Return true if we know that the iteration count is smaller than the vectorization factor. Return false if it isn't, or if we can't be sure either way. 
*/ @@ -2527,78 +2260,6 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, return opt_result::success (); } -/* Look for SLP-only access groups and turn each individual access into its own - group. */ -static void -vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) -{ - unsigned int i; - struct data_reference *dr; - - DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); - - vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - gcc_assert (DR_REF (dr)); - stmt_vec_info stmt_info - = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr))); - - /* Check if the load is a part of an interleaving chain. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - { - stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); - dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element); - unsigned int group_size = DR_GROUP_SIZE (first_element); - - /* Check if SLP-only groups. */ - if (!STMT_SLP_TYPE (stmt_info) - && STMT_VINFO_SLP_VECT_ONLY (first_element)) - { - /* Dissolve the group. */ - STMT_VINFO_SLP_VECT_ONLY (first_element) = false; - - stmt_vec_info vinfo = first_element; - while (vinfo) - { - stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); - DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; - DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; - DR_GROUP_SIZE (vinfo) = 1; - if (STMT_VINFO_STRIDED_P (first_element) - /* We cannot handle stores with gaps. */ - || DR_IS_WRITE (dr_info->dr)) - { - STMT_VINFO_STRIDED_P (vinfo) = true; - DR_GROUP_GAP (vinfo) = 0; - } - else - DR_GROUP_GAP (vinfo) = group_size - 1; - /* Duplicate and adjust alignment info, it needs to - be present on each group leader, see dr_misalignment. */ - if (vinfo != first_element) - { - dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo); - dr_info2->target_alignment = dr_info->target_alignment; - int misalignment = dr_info->misalignment; - if (misalignment != DR_MISALIGNMENT_UNKNOWN) - { - HOST_WIDE_INT diff - = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr)) - - TREE_INT_CST_LOW (DR_INIT (dr_info->dr))); - unsigned HOST_WIDE_INT align_c - = dr_info->target_alignment.to_constant (); - misalignment = (misalignment + diff) % align_c; - } - dr_info2->misalignment = misalignment; - } - vinfo = next; - } - } - } - } -} - /* Determine if operating on full vectors for LOOP_VINFO might leave some scalar iterations still to do. If so, decide how we should handle those scalar iterations. The possibilities are: @@ -2836,19 +2497,18 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, return opt_result::failure_at (vect_location, "bad data dependence.\n"); LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; - ok = vect_determine_vectorization_factor (loop_vinfo); + ok = vect_set_stmts_vectype (loop_vinfo); if (!ok) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't determine vectorization factor.\n"); + "cannot determine vector types.\n"); return ok; } /* Compute the scalar iteration cost. */ vect_compute_single_scalar_iteration_cost (loop_vinfo); - poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); bool saved_can_use_partial_vectors_p = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo); @@ -2864,21 +2524,29 @@ start_over: return ok; /* If there are any SLP instances mark them as pure_slp. */ - if (vect_make_slp_decision (loop_vinfo)) - { - /* Find stmts that need to be both vectorized and SLPed. 
*/ - vect_detect_hybrid_slp (loop_vinfo); + if (!vect_make_slp_decision (loop_vinfo)) + return opt_result::failure_at (vect_location, "no stmts to vectorize.\n"); - /* Update the vectorization factor based on the SLP decision. */ - vect_update_vf_for_slp (loop_vinfo); + /* Find stmts that need to be both vectorized and SLPed. */ + if (!vect_detect_hybrid_slp (loop_vinfo)) + return opt_result::failure_at (vect_location, "needs non-SLP handling\n"); - /* Optimize the SLP graph with the vectorization factor fixed. */ - vect_optimize_slp (loop_vinfo); - - /* Gather the loads reachable from the SLP graph entries. */ - vect_gather_slp_loads (loop_vinfo); + /* Determine the vectorization factor from the SLP decision. */ + LOOP_VINFO_VECT_FACTOR (loop_vinfo) + = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); + dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + dump_printf (MSG_NOTE, "\n"); } + /* Optimize the SLP graph with the vectorization factor fixed. */ + vect_optimize_slp (loop_vinfo); + + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); + /* We don't expect to have to roll back to anything other than an empty set of rgroups. */ gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); @@ -2947,19 +2615,6 @@ start_over: goto again; } - /* Dissolve SLP-only groups. */ - vect_dissolve_slp_only_groups (loop_vinfo); - - /* Scan all the remaining operations in the loop that we did not catch - during SLP build and make sure we fail. */ - ok = vect_analyze_loop_operations (loop_vinfo); - if (!ok) - { - ok = opt_result::failure_at (vect_location, - "bad operation or unsupported loop bound\n"); - goto again; - } - /* For now, we don't expect to mix both masking and length approaches for one loop, disable it if both are recorded. */ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) @@ -3269,8 +2924,8 @@ again: dump_printf_loc (MSG_NOTE, vect_location, "re-trying with single-lane SLP\n"); - /* Restore vectorization factor as it were without SLP. */ - LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; + /* Reset the vectorization factor. */ + LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0; /* Free the SLP instances. */ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) vect_free_slp_instance (instance); @@ -3398,8 +3053,10 @@ vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, } /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is - not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance - MODE_I to the next mode useful to analyze. + not NULL. When MASKED_P is not -1 override the default + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it. + Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next + mode useful to analyze. Return the loop_vinfo on success and wrapped null on failure. 
*/ static opt_loop_vec_info @@ -3407,6 +3064,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, const vect_loop_form_info *loop_form_info, loop_vec_info orig_loop_vinfo, const vector_modes &vector_modes, unsigned &mode_i, + int masked_p, machine_mode &autodetected_vector_mode, bool &fatal) { @@ -3415,6 +3073,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, machine_mode vector_mode = vector_modes[mode_i]; loop_vinfo->vector_mode = vector_mode; + if (masked_p != -1) + loop_vinfo->can_use_partial_vectors_p = masked_p; unsigned int suggested_unroll_factor = 1; unsigned slp_done_for_suggested_uf = 0; @@ -3428,27 +3088,50 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, res ? "succeeded" : "failed", GET_MODE_NAME (loop_vinfo->vector_mode)); - if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1) + auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll; + if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + /* Check to see if the user wants to unroll or if the target wants to. */ + && (suggested_unroll_factor > 1 || user_unroll > 1)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, + if (suggested_unroll_factor == 1) + { + int assumed_vf = vect_vf_for_cost (loop_vinfo); + suggested_unroll_factor = user_unroll / assumed_vf; + if (suggested_unroll_factor > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "setting unroll factor to %d based on user requested " + "unroll factor %d and suggested vectorization " + "factor: %d\n", + suggested_unroll_factor, user_unroll, assumed_vf); + } + } + + if (suggested_unroll_factor > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "***** Re-trying analysis for unrolling" " with unroll factor %d and slp %s.\n", suggested_unroll_factor, slp_done_for_suggested_uf ? "on" : "off"); - loop_vec_info unroll_vinfo - = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); - unroll_vinfo->vector_mode = vector_mode; - unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; - opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, - slp_done_for_suggested_uf); - if (new_res) - { - delete loop_vinfo; - loop_vinfo = unroll_vinfo; - } - else - delete unroll_vinfo; + loop_vec_info unroll_vinfo + = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); + unroll_vinfo->vector_mode = vector_mode; + unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; + opt_result new_res + = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, + slp_done_for_suggested_uf); + if (new_res) + { + delete loop_vinfo; + loop_vinfo = unroll_vinfo; + LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1; + } + else + delete unroll_vinfo; + } } /* Remember the autodetected vector mode. 
*/ @@ -3469,13 +3152,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, mode_i += 1; } if (mode_i + 1 < vector_modes.length () - && VECTOR_MODE_P (autodetected_vector_mode) - && (related_vector_mode (vector_modes[mode_i + 1], - GET_MODE_INNER (autodetected_vector_mode)) - == autodetected_vector_mode) - && (related_vector_mode (autodetected_vector_mode, - GET_MODE_INNER (vector_modes[mode_i + 1])) - == vector_modes[mode_i + 1])) + && vect_chooses_same_modes_p (autodetected_vector_mode, + vector_modes[mode_i + 1])) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -3580,7 +3258,7 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, cached_vf_per_mode[last_mode_i] = -1; opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, - NULL, vector_modes, mode_i, + NULL, vector_modes, mode_i, -1, autodetected_vector_mode, fatal); if (fatal) break; @@ -3665,24 +3343,38 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, array may contain length-agnostic and length-specific modes. Their ordering is not guaranteed, so we could end up picking a mode for the main loop that is after the epilogue's optimal mode. */ + int masked_p = -1; if (!unlimited_cost_model (loop) - && first_loop_vinfo->vector_costs->suggested_epilogue_mode () != VOIDmode) + && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p) + != VOIDmode)) { vector_modes[0] - = first_loop_vinfo->vector_costs->suggested_epilogue_mode (); + = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p); cached_vf_per_mode[0] = 0; } else vector_modes[0] = autodetected_vector_mode; mode_i = 0; - bool supports_partial_vectors = - partial_vectors_supported_p () && param_vect_partial_vector_usage != 0; + bool supports_partial_vectors = (param_vect_partial_vector_usage != 0 + || masked_p == 1); + machine_mode mask_mode; + if (supports_partial_vectors + && !partial_vectors_supported_p () + && !(VECTOR_MODE_P (first_loop_vinfo->vector_mode) + && targetm.vectorize.get_mask_mode + (first_loop_vinfo->vector_mode).exists (&mask_mode) + && SCALAR_INT_MODE_P (mask_mode))) + supports_partial_vectors = false; poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo); loop_vec_info orig_loop_vinfo = first_loop_vinfo; do { + /* Let the user override what the target suggests. */ + if (OPTION_SET_P (param_vect_partial_vector_usage)) + masked_p = -1; + while (1) { /* If the target does not support partial vectors we can shorten the @@ -3697,6 +3389,22 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, break; continue; } + /* We would need an exhaustive search to find all modes we + skipped but that would lead to the same result as the + analysis it was skipped for and where we'd could check + cached_vf_per_mode against. + Check for the autodetected mode, which is the common + situation on x86 which does not perform cost comparison. 
*/ + if (!supports_partial_vectors + && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf) + && vect_chooses_same_modes_p (autodetected_vector_mode, + vector_modes[mode_i])) + { + mode_i++; + if (mode_i == vector_modes.length ()) + break; + continue; + } if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -3707,7 +3415,7 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, orig_loop_vinfo, - vector_modes, mode_i, + vector_modes, mode_i, masked_p, autodetected_vector_mode, fatal); if (fatal) break; @@ -3738,6 +3446,9 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, break; } + /* Revert back to the default from the suggested prefered + epilogue vectorization mode. */ + masked_p = -1; if (mode_i == vector_modes.length ()) break; } @@ -3748,12 +3459,14 @@ vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call, /* When we selected a first vectorized epilogue, see if the target suggests to have another one. */ + masked_p = -1; if (!unlimited_cost_model (loop) - && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode () + && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo) + && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p) != VOIDmode)) { vector_modes[0] - = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (); + = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p); cached_vf_per_mode[0] = 0; mode_i = 0; } @@ -4101,6 +3814,10 @@ pop: if (op.ops[2] == op.ops[opi]) neg = ! neg; } + /* For an FMA the reduction code is the PLUS if the addition chain + is the reduction. */ + else if (op.code == IFN_FMA && opi == 2) + op.code = PLUS_EXPR; if (CONVERT_EXPR_CODE_P (op.code) && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) ; @@ -4646,7 +4363,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, TODO: Consider assigning different costs to different scalar statements. */ - scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost (); + scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost () + * param_vect_scalar_cost_multiplier) / 100; /* Add additional cost for the peeled instructions in prologue and epilogue loop. (For fully-masked loops there will be no peeling.) @@ -5283,7 +5001,7 @@ vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info) static void vect_model_reduction_cost (loop_vec_info loop_vinfo, - stmt_vec_info stmt_info, internal_fn reduc_fn, + slp_tree node, internal_fn reduc_fn, vect_reduction_type reduction_type, int ncopies, stmt_vector_for_cost *cost_vec) { @@ -5299,9 +5017,10 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (reduction_type == COND_REDUCTION) ncopies *= 2; - vectype = STMT_VINFO_VECTYPE (stmt_info); + vectype = SLP_TREE_VECTYPE (node); mode = TYPE_MODE (vectype); - stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); + stmt_vec_info orig_stmt_info + = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node)); gimple_match_op op; if (!gimple_extract_op (orig_stmt_info->stmt, &op)) @@ -5319,16 +5038,16 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (reduc_fn != IFN_LAST) /* Count one reduction-like operation per vector. */ inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar, - stmt_info, 0, vect_body); + node, 0, vect_body); else { /* Use NELEMENTS extracts and NELEMENTS scalar ops. 
*/ unsigned int nelements = ncopies * vect_nunits_for_cost (vectype); inside_cost = record_stmt_cost (cost_vec, nelements, - vec_to_scalar, stmt_info, 0, + vec_to_scalar, node, 0, vect_body); inside_cost += record_stmt_cost (cost_vec, nelements, - scalar_stmt, stmt_info, 0, + scalar_stmt, node, 0, vect_body); } } @@ -5345,7 +5064,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, - scalar_to_vec, stmt_info, 0, + scalar_to_vec, node, 0, vect_prologue); } @@ -5362,24 +5081,24 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, { /* An EQ stmt and an COND_EXPR stmt. */ epilogue_cost += record_stmt_cost (cost_vec, 2, - vector_stmt, stmt_info, 0, + vector_stmt, node, 0, vect_epilogue); /* Reduction of the max index and a reduction of the found values. */ epilogue_cost += record_stmt_cost (cost_vec, 2, - vec_to_scalar, stmt_info, 0, + vec_to_scalar, node, 0, vect_epilogue); /* A broadcast of the max value. */ epilogue_cost += record_stmt_cost (cost_vec, 1, - scalar_to_vec, stmt_info, 0, + scalar_to_vec, node, 0, vect_epilogue); } else { epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt, - stmt_info, 0, vect_epilogue); + node, 0, vect_epilogue); epilogue_cost += record_stmt_cost (cost_vec, 1, - vec_to_scalar, stmt_info, 0, + vec_to_scalar, node, 0, vect_epilogue); } } @@ -5389,12 +5108,12 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, /* Extraction of scalar elements. */ epilogue_cost += record_stmt_cost (cost_vec, 2 * estimated_nunits, - vec_to_scalar, stmt_info, 0, + vec_to_scalar, node, 0, vect_epilogue); /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ epilogue_cost += record_stmt_cost (cost_vec, 2 * estimated_nunits - 3, - scalar_stmt, stmt_info, 0, + scalar_stmt, node, 0, vect_epilogue); } else if (reduction_type == EXTRACT_LAST_REDUCTION @@ -5420,10 +5139,10 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, Also requires scalar extract. */ epilogue_cost += record_stmt_cost (cost_vec, exact_log2 (nelements) * 2, - vector_stmt, stmt_info, 0, + vector_stmt, node, 0, vect_epilogue); epilogue_cost += record_stmt_cost (cost_vec, 1, - vec_to_scalar, stmt_info, 0, + vec_to_scalar, node, 0, vect_epilogue); } else @@ -5431,7 +5150,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, elements, we have N extracts and N-1 reduction ops. */ epilogue_cost += record_stmt_cost (cost_vec, nelements + nelements - 1, - vector_stmt, stmt_info, 0, + vector_stmt, node, 0, vect_epilogue); } } @@ -6016,7 +5735,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Create an induction variable. */ gimple_stmt_iterator incr_gsi; bool insert_after; - vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after); + vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo), + &incr_gsi, &insert_after); create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); @@ -7658,23 +7378,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (lane_reducing_op_p (op.code)) { - enum vect_def_type dt; - tree vectype_op; - /* The last operand of lane-reducing operation is for reduction. 
*/ gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1); - if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op)) - return false; - + slp_tree op_node = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; + tree vectype_op = SLP_TREE_VECTYPE (op_node); tree type_op = TREE_TYPE (op.ops[0]); - if (!vectype_op) { vectype_op = get_vectype_for_scalar_type (loop_vinfo, type_op); - if (!vectype_op) + if (!vectype_op + || !vect_maybe_update_slp_op_vectype (op_node, + vectype_op)) return false; } @@ -7755,7 +7472,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, inside the loop body. The last operand is the reduction variable, which is defined by the loop-header-phi. */ - tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info); STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; @@ -8043,6 +7760,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "in-order reduction chain without SLP.\n"); return false; } + /* Code generation doesn't support function calls other + than .COND_*. */ + if (!op.code.is_tree_code () + && !(op.code.is_internal_fn () + && conditional_internal_fn_code (internal_fn (op.code)) + != ERROR_MARK)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order reduction chain operation not " + "supported.\n"); + return false; + } STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type = FOLD_LEFT_REDUCTION; } @@ -8345,7 +8075,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, return false; } - vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, + vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via vect_transform_reduction for non-lane-reducing operation. Otherwise @@ -9698,7 +9428,7 @@ vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree vectype = SLP_TREE_VECTYPE (slp_node); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); enum vect_induction_op_type induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); @@ -9723,7 +9453,7 @@ vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate vector iv update for each iv and a permutation to generate wanted vector iv. 
*/ - if (slp_node && SLP_TREE_LANES (slp_node) > 1) + if (SLP_TREE_LANES (slp_node) > 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -9934,13 +9664,7 @@ vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), UNKNOWN_LOCATION); - if (slp_node) - slp_node->push_vec_def (induction_phi); - else - { - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); - *vec_stmt = induction_phi; - } + slp_node->push_vec_def (induction_phi); /* In case that vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate @@ -9970,10 +9694,7 @@ vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, induction_type); gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); new_stmt = SSA_NAME_DEF_STMT (vec_def); - if (slp_node) - slp_node->push_vec_def (new_stmt); - else - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + slp_node->push_vec_def (new_stmt); } } @@ -9999,15 +9720,13 @@ vectorizable_induction (loop_vec_info loop_vinfo, stmt_vector_for_cost *cost_vec) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - unsigned ncopies; bool nested_in_vect_loop = false; class loop *iv_loop; tree vec_def; edge pe = loop_preheader_edge (loop); basic_block new_bb; - tree new_vec, vec_init = NULL_TREE, vec_step, t; + tree vec_init = NULL_TREE, vec_step, t; tree new_name; - gimple *new_stmt; gphi *induction_phi; tree induc_def, vec_dest; poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); @@ -10034,15 +9753,9 @@ vectorizable_induction (loop_vec_info loop_vinfo, return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, vec_stmt, slp_node, cost_vec); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree vectype = SLP_TREE_VECTYPE (slp_node); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); - if (slp_node) - ncopies = 1; - else - ncopies = vect_get_num_copies (loop_vinfo, vectype); - gcc_assert (ncopies >= 1); - /* FORNOW. These restrictions should be relaxed. */ if (nested_in_vect_loop_p (loop, stmt_info)) { @@ -10052,14 +9765,6 @@ vectorizable_induction (loop_vec_info loop_vinfo, edge latch_e; tree loop_arg; - if (ncopies > 1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multiple types in nested loop.\n"); - return false; - } - exit_phi = NULL; latch_e = loop_latch_edge (loop->inner); loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); @@ -10096,7 +9801,7 @@ vectorizable_induction (loop_vec_info loop_vinfo, iv_loop = loop; gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); - if (slp_node && (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)) + if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1) { /* The current SLP code creates the step value element-by-element. */ if (dump_enabled_p ()) @@ -10152,41 +9857,28 @@ vectorizable_induction (loop_vec_info loop_vinfo, if (!vec_stmt) /* transformation not required. */ { unsigned inside_cost = 0, prologue_cost = 0; - if (slp_node) - { - /* We eventually need to set a vector type on invariant - arguments. */ - unsigned j; - slp_tree child; - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) - if (!vect_maybe_update_slp_op_vectype - (child, SLP_TREE_VECTYPE (slp_node))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "incompatible vector types for " - "invariants\n"); - return false; - } - /* loop cost for vec_loop. 
*/ - inside_cost - = record_stmt_cost (cost_vec, - SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), - vector_stmt, stmt_info, 0, vect_body); - /* prologue cost for vec_init (if not nested) and step. */ - prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop, - scalar_to_vec, - stmt_info, 0, vect_prologue); - } - else /* if (!slp_node) */ - { - /* loop cost for vec_loop. */ - inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, - stmt_info, 0, vect_body); - /* prologue cost for vec_init and vec_step. */ - prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, - stmt_info, 0, vect_prologue); - } + /* We eventually need to set a vector type on invariant + arguments. */ + unsigned j; + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) + if (!vect_maybe_update_slp_op_vectype + (child, SLP_TREE_VECTYPE (slp_node))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for " + "invariants\n"); + return false; + } + /* loop cost for vec_loop. */ + inside_cost = record_stmt_cost (cost_vec, + SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), + vector_stmt, stmt_info, 0, vect_body); + /* prologue cost for vec_init (if not nested) and step. */ + prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop, + scalar_to_vec, + stmt_info, 0, vect_prologue); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vect_model_induction_cost: inside_cost = %d, " @@ -10217,670 +9909,374 @@ vectorizable_induction (loop_vec_info loop_vinfo, with group size 3 we need [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1] [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */ - if (slp_node) + gimple_stmt_iterator incr_si; + bool insert_after; + standard_iv_increment_position (iv_loop, &incr_si, &insert_after); + + /* The initial values are vectorized, but any lanes > group_size + need adjustment. */ + slp_tree init_node + = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; + + /* Gather steps. Since we do not vectorize inductions as + cycles we have to reconstruct the step from SCEV data. */ + unsigned group_size = SLP_TREE_LANES (slp_node); + tree *steps = XALLOCAVEC (tree, group_size); + tree *inits = XALLOCAVEC (tree, group_size); + stmt_vec_info phi_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) + { + steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + if (!init_node) + inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt), + pe->dest_idx); + } + + /* Now generate the IVs. */ + unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + gcc_assert (multiple_p (nunits * nvects, group_size)); + unsigned nivs; + unsigned HOST_WIDE_INT const_nunits; + if (nested_in_vect_loop) + nivs = nvects; + else if (nunits.is_constant (&const_nunits)) { - gimple_stmt_iterator incr_si; - bool insert_after; - standard_iv_increment_position (iv_loop, &incr_si, &insert_after); - - /* The initial values are vectorized, but any lanes > group_size - need adjustment. */ - slp_tree init_node - = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; - - /* Gather steps. Since we do not vectorize inductions as - cycles we have to reconstruct the step from SCEV data. 
*/ - unsigned group_size = SLP_TREE_LANES (slp_node); - tree *steps = XALLOCAVEC (tree, group_size); - tree *inits = XALLOCAVEC (tree, group_size); - stmt_vec_info phi_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) - { - steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); - if (!init_node) - inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt), - pe->dest_idx); - } - - /* Now generate the IVs. */ - unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - gcc_assert (multiple_p (nunits * nvects, group_size)); - unsigned nivs; - unsigned HOST_WIDE_INT const_nunits; - if (nested_in_vect_loop) - nivs = nvects; - else if (nunits.is_constant (&const_nunits)) - { - /* Compute the number of distinct IVs we need. First reduce - group_size if it is a multiple of const_nunits so we get - one IV for a group_size of 4 but const_nunits 2. */ - unsigned group_sizep = group_size; - if (group_sizep % const_nunits == 0) - group_sizep = group_sizep / const_nunits; - nivs = least_common_multiple (group_sizep, - const_nunits) / const_nunits; - } - else - { - gcc_assert (SLP_TREE_LANES (slp_node) == 1); - nivs = 1; - } - gimple_seq init_stmts = NULL; - tree lupdate_mul = NULL_TREE; - if (!nested_in_vect_loop) + /* Compute the number of distinct IVs we need. First reduce + group_size if it is a multiple of const_nunits so we get + one IV for a group_size of 4 but const_nunits 2. */ + unsigned group_sizep = group_size; + if (group_sizep % const_nunits == 0) + group_sizep = group_sizep / const_nunits; + nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits; + } + else + { + gcc_assert (SLP_TREE_LANES (slp_node) == 1); + nivs = 1; + } + gimple_seq init_stmts = NULL; + tree lupdate_mul = NULL_TREE; + if (!nested_in_vect_loop) + { + if (nunits.is_constant (&const_nunits)) { - if (nunits.is_constant (&const_nunits)) - { - /* The number of iterations covered in one vector iteration. */ - unsigned lup_mul = (nvects * const_nunits) / group_size; - lupdate_mul - = build_vector_from_val (step_vectype, - SCALAR_FLOAT_TYPE_P (stept) - ? build_real_from_wide (stept, lup_mul, - UNSIGNED) - : build_int_cstu (stept, lup_mul)); - } - else - { - if (SCALAR_FLOAT_TYPE_P (stept)) - { - tree tem = build_int_cst (integer_type_node, vf); - lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, - stept, tem); - } - else - lupdate_mul = build_int_cst (stept, vf); - lupdate_mul = gimple_build_vector_from_val (&init_stmts, - step_vectype, - lupdate_mul); - } + /* The number of iterations covered in one vector iteration. */ + unsigned lup_mul = (nvects * const_nunits) / group_size; + lupdate_mul + = build_vector_from_val (step_vectype, + SCALAR_FLOAT_TYPE_P (stept) + ? build_real_from_wide (stept, lup_mul, + UNSIGNED) + : build_int_cstu (stept, lup_mul)); } - tree peel_mul = NULL_TREE; - if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + else { if (SCALAR_FLOAT_TYPE_P (stept)) - peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, - LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); - else - peel_mul = gimple_convert (&init_stmts, stept, - LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); - peel_mul = gimple_build_vector_from_val (&init_stmts, - step_vectype, peel_mul); - - /* If early break then we have to create a new PHI which we can use as - an offset to adjust the induction reduction in early exits. - - This is because when peeling for alignment using masking, the first - few elements of the vector can be inactive. 
As such if we find the - entry in the first iteration we have adjust the starting point of - the scalar code. - - We do this by creating a new scalar PHI that keeps track of whether - we are the first iteration of the loop (with the additional masking) - or whether we have taken a loop iteration already. - - The generated sequence: - - pre-header: - bb1: - i_1 = <number of leading inactive elements> - - header: - bb2: - i_2 = PHI <i_1(bb1), 0(latch)> - … - - early-exit: - bb3: - i_3 = iv_step * i_2 + PHI<vector-iv> - - The first part of the adjustment to create i_1 and i_2 are done here - and the last part creating i_3 is done in - vectorizable_live_operations when the induction extraction is - materialized. */ - if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) - && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)) { - auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); - tree ty_skip_niters = TREE_TYPE (skip_niters); - tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters, - vect_scalar_var, - "pfa_iv_offset"); - gphi *nphi = create_phi_node (break_lhs_phi, bb); - add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION); - add_phi_arg (nphi, build_zero_cst (ty_skip_niters), - loop_latch_edge (iv_loop), UNKNOWN_LOCATION); - - LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) - = PHI_RESULT (nphi); + tree tem = build_int_cst (integer_type_node, vf); + lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem); } + else + lupdate_mul = build_int_cst (stept, vf); + lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype, + lupdate_mul); } - tree step_mul = NULL_TREE; - unsigned ivn; - auto_vec<tree> vec_steps; - for (ivn = 0; ivn < nivs; ++ivn) + } + tree peel_mul = NULL_TREE; + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + { + if (SCALAR_FLOAT_TYPE_P (stept)) + peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, + LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + else + peel_mul = gimple_convert (&init_stmts, stept, + LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + peel_mul = gimple_build_vector_from_val (&init_stmts, + step_vectype, peel_mul); + + /* If early break then we have to create a new PHI which we can use as + an offset to adjust the induction reduction in early exits. + + This is because when peeling for alignment using masking, the first + few elements of the vector can be inactive. As such if we find the + entry in the first iteration we have adjust the starting point of + the scalar code. + + We do this by creating a new scalar PHI that keeps track of whether + we are the first iteration of the loop (with the additional masking) + or whether we have taken a loop iteration already. + + The generated sequence: + + pre-header: + bb1: + i_1 = <number of leading inactive elements> + + header: + bb2: + i_2 = PHI <i_1(bb1), 0(latch)> + … + + early-exit: + bb3: + i_3 = iv_step * i_2 + PHI<vector-iv> + + The first part of the adjustment to create i_1 and i_2 are done here + and the last part creating i_3 is done in + vectorizable_live_operations when the induction extraction is + materialized. 
*/ + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) + && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)) + { + auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + tree ty_skip_niters = TREE_TYPE (skip_niters); + tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters, + vect_scalar_var, + "pfa_iv_offset"); + gphi *nphi = create_phi_node (break_lhs_phi, bb); + add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION); + add_phi_arg (nphi, build_zero_cst (ty_skip_niters), + loop_latch_edge (iv_loop), UNKNOWN_LOCATION); + + LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi); + } + } + tree step_mul = NULL_TREE; + unsigned ivn; + auto_vec<tree> vec_steps; + for (ivn = 0; ivn < nivs; ++ivn) + { + gimple_seq stmts = NULL; + bool invariant = true; + if (nunits.is_constant (&const_nunits)) { - gimple_seq stmts = NULL; - bool invariant = true; - if (nunits.is_constant (&const_nunits)) + tree_vector_builder step_elts (step_vectype, const_nunits, 1); + tree_vector_builder init_elts (vectype, const_nunits, 1); + tree_vector_builder mul_elts (step_vectype, const_nunits, 1); + for (unsigned eltn = 0; eltn < const_nunits; ++eltn) { - tree_vector_builder step_elts (step_vectype, const_nunits, 1); - tree_vector_builder init_elts (vectype, const_nunits, 1); - tree_vector_builder mul_elts (step_vectype, const_nunits, 1); - for (unsigned eltn = 0; eltn < const_nunits; ++eltn) - { - /* The scalar steps of the IVs. */ - tree elt = steps[(ivn*const_nunits + eltn) % group_size]; - elt = gimple_convert (&init_stmts, - TREE_TYPE (step_vectype), elt); - step_elts.quick_push (elt); - if (!init_node) - { - /* The scalar inits of the IVs if not vectorized. */ - elt = inits[(ivn*const_nunits + eltn) % group_size]; - if (!useless_type_conversion_p (TREE_TYPE (vectype), - TREE_TYPE (elt))) - elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, - TREE_TYPE (vectype), elt); - init_elts.quick_push (elt); - } - /* The number of steps to add to the initial values. */ - unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; - mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) - ? build_real_from_wide (stept, mul_elt, - UNSIGNED) - : build_int_cstu (stept, mul_elt)); - } - vec_step = gimple_build_vector (&init_stmts, &step_elts); - step_mul = gimple_build_vector (&init_stmts, &mul_elts); + /* The scalar steps of the IVs. */ + tree elt = steps[(ivn*const_nunits + eltn) % group_size]; + elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt); + step_elts.quick_push (elt); if (!init_node) - vec_init = gimple_build_vector (&init_stmts, &init_elts); - } - else - { - if (init_node) - ; - else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0]))) - { - new_name = gimple_convert (&init_stmts, stept, inits[0]); - /* Build the initial value directly as a VEC_SERIES_EXPR. */ - vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR, - step_vectype, new_name, steps[0]); - if (!useless_type_conversion_p (vectype, step_vectype)) - vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, - vectype, vec_init); - } - else - { - /* Build: - [base, base, base, ...] - + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. 
*/ - gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0]))); - gcc_assert (flag_associative_math); - gcc_assert (index_vectype != NULL_TREE); - - tree index = build_index_vector (index_vectype, 0, 1); - new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]), - inits[0]); - tree base_vec = gimple_build_vector_from_val (&init_stmts, - step_vectype, - new_name); - tree step_vec = gimple_build_vector_from_val (&init_stmts, - step_vectype, - steps[0]); - vec_init = gimple_build (&init_stmts, FLOAT_EXPR, - step_vectype, index); - vec_init = gimple_build (&init_stmts, MULT_EXPR, - step_vectype, vec_init, step_vec); - vec_init = gimple_build (&init_stmts, PLUS_EXPR, - step_vectype, vec_init, base_vec); - if (!useless_type_conversion_p (vectype, step_vectype)) - vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, - vectype, vec_init); - } - /* iv_loop is nested in the loop to be vectorized. Generate: - vec_step = [S, S, S, S] */ - t = unshare_expr (steps[0]); - gcc_assert (CONSTANT_CLASS_P (t) - || TREE_CODE (t) == SSA_NAME); - vec_step = gimple_build_vector_from_val (&init_stmts, - step_vectype, t); - } - vec_steps.safe_push (vec_step); - if (peel_mul) - { - if (!step_mul) - step_mul = peel_mul; - else - step_mul = gimple_build (&init_stmts, - MINUS_EXPR, step_vectype, - step_mul, peel_mul); - } - - /* Create the induction-phi that defines the induction-operand. */ - vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, - "vec_iv_"); - induction_phi = create_phi_node (vec_dest, iv_loop->header); - induc_def = PHI_RESULT (induction_phi); - - /* Create the iv update inside the loop */ - tree up = vec_step; - if (lupdate_mul) - { - if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)) { - /* When we're using loop_len produced by SELEC_VL, the - non-final iterations are not always processing VF - elements. So vectorize induction variable instead of - - _21 = vect_vec_iv_.6_22 + { VF, ... }; - - We should generate: - - _35 = .SELECT_VL (ivtmp_33, VF); - vect_cst__22 = [vec_duplicate_expr] _35; - _21 = vect_vec_iv_.6_22 + vect_cst__22; */ - vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); - tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, - vectype, 0, 0); - if (SCALAR_FLOAT_TYPE_P (stept)) - expr = gimple_build (&stmts, FLOAT_EXPR, stept, len); - else - expr = gimple_convert (&stmts, stept, len); - lupdate_mul = gimple_build_vector_from_val (&stmts, - step_vectype, - expr); - up = gimple_build (&stmts, MULT_EXPR, - step_vectype, vec_step, lupdate_mul); + /* The scalar inits of the IVs if not vectorized. 
*/ + elt = inits[(ivn*const_nunits + eltn) % group_size]; + if (!useless_type_conversion_p (TREE_TYPE (vectype), + TREE_TYPE (elt))) + elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, + TREE_TYPE (vectype), elt); + init_elts.quick_push (elt); } - else - up = gimple_build (&init_stmts, - MULT_EXPR, step_vectype, - vec_step, lupdate_mul); - } - vec_def = gimple_convert (&stmts, step_vectype, induc_def); - vec_def = gimple_build (&stmts, - PLUS_EXPR, step_vectype, vec_def, up); - vec_def = gimple_convert (&stmts, vectype, vec_def); - insert_iv_increment (&incr_si, insert_after, stmts); - add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), - UNKNOWN_LOCATION); - - if (init_node) - vec_init = vect_get_slp_vect_def (init_node, ivn); - if (!nested_in_vect_loop - && step_mul - && !integer_zerop (step_mul)) - { - gcc_assert (invariant); - vec_def = gimple_convert (&init_stmts, step_vectype, vec_init); - up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, - vec_step, step_mul); - vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, - vec_def, up); - vec_init = gimple_convert (&init_stmts, vectype, vec_def); - } - - /* Set the arguments of the phi node: */ - add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); - - slp_node->push_vec_def (induction_phi); - } - if (!nested_in_vect_loop) - { - /* Fill up to the number of vectors we need for the whole group. */ - if (nunits.is_constant (&const_nunits)) - nivs = least_common_multiple (group_size, - const_nunits) / const_nunits; - else - nivs = 1; - vec_steps.reserve (nivs-ivn); - for (; ivn < nivs; ++ivn) - { - slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]); - vec_steps.quick_push (vec_steps[0]); + /* The number of steps to add to the initial values. */ + unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; + mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) + ? build_real_from_wide (stept, mul_elt, + UNSIGNED) + : build_int_cstu (stept, mul_elt)); } + vec_step = gimple_build_vector (&init_stmts, &step_elts); + step_mul = gimple_build_vector (&init_stmts, &mul_elts); + if (!init_node) + vec_init = gimple_build_vector (&init_stmts, &init_elts); } - - /* Re-use IVs when we can. We are generating further vector - stmts by adding VF' * stride to the IVs generated above. */ - if (ivn < nvects) + else { - if (nunits.is_constant (&const_nunits)) + if (init_node) + ; + else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0]))) { - unsigned vfp = (least_common_multiple (group_size, const_nunits) - / group_size); - lupdate_mul - = build_vector_from_val (step_vectype, - SCALAR_FLOAT_TYPE_P (stept) - ? build_real_from_wide (stept, - vfp, UNSIGNED) - : build_int_cstu (stept, vfp)); + new_name = gimple_convert (&init_stmts, stept, inits[0]); + /* Build the initial value directly as a VEC_SERIES_EXPR. 
*/ + vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR, + step_vectype, new_name, steps[0]); + if (!useless_type_conversion_p (vectype, step_vectype)) + vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, + vectype, vec_init); } else { - if (SCALAR_FLOAT_TYPE_P (stept)) - { - tree tem = build_int_cst (integer_type_node, nunits); - lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, - stept, tem); - } - else - lupdate_mul = build_int_cst (stept, nunits); - lupdate_mul = gimple_build_vector_from_val (&init_stmts, - step_vectype, - lupdate_mul); - } - for (; ivn < nvects; ++ivn) - { - gimple *iv - = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]); - tree def = gimple_get_lhs (iv); - if (ivn < 2*nivs) - vec_steps[ivn - nivs] - = gimple_build (&init_stmts, MULT_EXPR, step_vectype, - vec_steps[ivn - nivs], lupdate_mul); - gimple_seq stmts = NULL; - def = gimple_convert (&stmts, step_vectype, def); - def = gimple_build (&stmts, PLUS_EXPR, step_vectype, - def, vec_steps[ivn % nivs]); - def = gimple_convert (&stmts, vectype, def); - if (gimple_code (iv) == GIMPLE_PHI) - gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); - else - { - gimple_stmt_iterator tgsi = gsi_for_stmt (iv); - gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); - } - slp_node->push_vec_def (def); + /* Build: + [base, base, base, ...] + + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ + gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0]))); + gcc_assert (flag_associative_math); + gcc_assert (index_vectype != NULL_TREE); + + tree index = build_index_vector (index_vectype, 0, 1); + new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]), + inits[0]); + tree base_vec = gimple_build_vector_from_val (&init_stmts, + step_vectype, + new_name); + tree step_vec = gimple_build_vector_from_val (&init_stmts, + step_vectype, + steps[0]); + vec_init = gimple_build (&init_stmts, FLOAT_EXPR, + step_vectype, index); + vec_init = gimple_build (&init_stmts, MULT_EXPR, + step_vectype, vec_init, step_vec); + vec_init = gimple_build (&init_stmts, PLUS_EXPR, + step_vectype, vec_init, base_vec); + if (!useless_type_conversion_p (vectype, step_vectype)) + vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, + vectype, vec_init); } + /* iv_loop is nested in the loop to be vectorized. Generate: + vec_step = [S, S, S, S] */ + t = unshare_expr (steps[0]); + gcc_assert (CONSTANT_CLASS_P (t) + || TREE_CODE (t) == SSA_NAME); + vec_step = gimple_build_vector_from_val (&init_stmts, + step_vectype, t); + } + vec_steps.safe_push (vec_step); + if (peel_mul) + { + if (!step_mul) + step_mul = peel_mul; + else + step_mul = gimple_build (&init_stmts, + MINUS_EXPR, step_vectype, + step_mul, peel_mul); } - new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts); - gcc_assert (!new_bb); - - return true; - } - - tree init_expr = vect_phi_initial_value (phi); - - gimple_seq stmts = NULL; - if (!nested_in_vect_loop) - { - /* Convert the initial value to the IV update type. */ - tree new_type = TREE_TYPE (step_expr); - init_expr = gimple_convert (&stmts, new_type, init_expr); + /* Create the induction-phi that defines the induction-operand. */ + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, + "vec_iv_"); + induction_phi = create_phi_node (vec_dest, iv_loop->header); + induc_def = PHI_RESULT (induction_phi); - /* If we are using the loop mask to "peel" for alignment then we need - to adjust the start value here. 
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
+         /* Create the iv update inside the loop */
+         tree up = vec_step;
+         if (lupdate_mul)
           {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
+           if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+             {
+               /* When we're using loop_len produced by SELEC_VL, the
+                  non-final iterations are not always processing VF
+                  elements. So vectorize induction variable instead of
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
+                    _21 = vect_vec_iv_.6_22 + { VF, ... };
-  /* Create the vector that holds the initial_value of the induction. */
-  if (nested_in_vect_loop)
-    {
-      /* iv_loop is nested in the loop to be vectorized. init_expr had already
-         been created during vectorization of previous stmts. We obtain it
-         from the STMT_VINFO_VEC_STMT of the defining stmt. */
-      auto_vec<tree> vec_inits;
-      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-                                     init_expr, &vec_inits);
-      vec_init = vec_inits[0];
-      /* If the initial value is not of proper type, convert it. */
-      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
-       {
-         new_stmt
-           = gimple_build_assign (vect_get_new_ssa_name (vectype,
-                                                         vect_simple_var,
-                                                         "vec_iv_"),
-                                  VIEW_CONVERT_EXPR,
-                                  build1 (VIEW_CONVERT_EXPR, vectype,
-                                          vec_init));
-         vec_init = gimple_assign_lhs (new_stmt);
-         new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
-                                                new_stmt);
-         gcc_assert (!new_bb);
-       }
-    }
-  else
-    {
-      /* iv_loop is the loop to be vectorized. Create:
-           vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
-      stmts = NULL;
-      new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
+                  We should generate:
-      unsigned HOST_WIDE_INT const_nunits;
-      if (nunits.is_constant (&const_nunits))
-       {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         elts.quick_push (new_name);
-         for (i = 1; i < const_nunits; i++)
-           {
-             /* Create: new_name_i = new_name + step_expr */
-             new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
-                                      new_name, step_expr);
-             elts.quick_push (new_name);
+                    _35 = .SELECT_VL (ivtmp_33, VF);
+                    vect_cst__22 = [vec_duplicate_expr] _35;
+                    _21 = vect_vec_iv_.6_22 + vect_cst__22; */
+               vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+               tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
+                                             vectype, 0, 0);
+               if (SCALAR_FLOAT_TYPE_P (stept))
+                 expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
+               else
+                 expr = gimple_convert (&stmts, stept, len);
+               lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
+                                                           expr);
+               up = gimple_build (&stmts, MULT_EXPR,
+                                  step_vectype, vec_step, lupdate_mul);
             }
-         /* Create a vector from [new_name_0, new_name_1, ...,
-            new_name_nunits-1] */
-         vec_init = gimple_build_vector (&stmts, &elts);
-       }
-      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
-       /* Build the initial value directly from a VEC_SERIES_EXPR. */
-       vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
-                                new_name, step_expr);
-      else
-       {
-         /* Build:
-              [base, base, base, ...]
-              + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
-         gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
-         gcc_assert (flag_associative_math);
-         gcc_assert (index_vectype != NULL_TREE);
-
-         tree index = build_index_vector (index_vectype, 0, 1);
-         tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
-                                                       new_name);
-         tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
-                                                       step_expr);
-         vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
-         vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
-                                  vec_init, step_vec);
-         vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
-                                  vec_init, base_vec);
-       }
-      vec_init = gimple_convert (&stmts, vectype, vec_init);
+           else
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, lupdate_mul);
+         }
+       vec_def = gimple_convert (&stmts, step_vectype, induc_def);
+       vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
+       vec_def = gimple_convert (&stmts, vectype, vec_def);
+       insert_iv_increment (&incr_si, insert_after, stmts);
+       add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                    UNKNOWN_LOCATION);
-  if (stmts)
+       if (init_node)
+         vec_init = vect_get_slp_vect_def (init_node, ivn);
+       if (!nested_in_vect_loop
+           && step_mul
+           && !integer_zerop (step_mul))
         {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
+           gcc_assert (invariant);
+           vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, step_mul);
+           vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                   vec_def, up);
+           vec_init = gimple_convert (&init_stmts, vectype, vec_def);
         }
-    }
+       /* Set the arguments of the phi node: */
+       add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-  /* Create the vector that holds the step of the induction. */
-  gimple_stmt_iterator *step_iv_si = NULL;
-  if (nested_in_vect_loop)
-    /* iv_loop is nested in the loop to be vectorized. Generate:
-       vec_step = [S, S, S, S] */
-    new_name = step_expr;
-  else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
-    {
-      /* When we're using loop_len produced by SELEC_VL, the non-final
-         iterations are not always processing VF elements. So vectorize
-         induction variable instead of
-
-           _21 = vect_vec_iv_.6_22 + { VF, ... };
-
-         We should generate:
-
-           _35 = .SELECT_VL (ivtmp_33, VF);
-           vect_cst__22 = [vec_duplicate_expr] _35;
-           _21 = vect_vec_iv_.6_22 + vect_cst__22; */
-      gcc_assert (!slp_node);
-      gimple_seq seq = NULL;
-      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
-      tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
-      expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
-                                                 unshare_expr (len)),
-                                   &seq, true, NULL_TREE);
-      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
-                               step_expr);
-      gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
-      step_iv_si = &si;
+       slp_node->push_vec_def (induction_phi);
     }
-  else
+  if (!nested_in_vect_loop)
     {
-      /* iv_loop is the loop to be vectorized. Generate:
-           vec_step = [VF*S, VF*S, VF*S, VF*S] */
-      gimple_seq seq = NULL;
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-       {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
-       }
+      /* Fill up to the number of vectors we need for the whole group. */
+      if (nunits.is_constant (&const_nunits))
+       nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
       else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
-                               expr, step_expr);
-      if (seq)
+       nivs = 1;
+      vec_steps.reserve (nivs-ivn);
+      for (; ivn < nivs; ++ivn)
        {
-         new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
-         gcc_assert (!new_bb);
+         slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
+         vec_steps.quick_push (vec_steps[0]);
        }
     }
-  t = unshare_expr (new_name);
-  gcc_assert (CONSTANT_CLASS_P (new_name)
-              || TREE_CODE (new_name) == SSA_NAME);
-  new_vec = build_vector_from_val (step_vectype, t);
-  vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                               new_vec, step_vectype, step_iv_si);
-
-
-  /* Create the following def-use cycle:
-     loop prolog:
-         vec_init = ...
-         vec_step = ...
-     loop:
-         vec_iv = PHI <vec_init, vec_loop>
-         ...
-         STMT
-         ...
-         vec_loop = vec_iv + vec_step; */
-
-  /* Create the induction-phi that defines the induction-operand. */
-  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
-  induction_phi = create_phi_node (vec_dest, iv_loop->header);
-  induc_def = PHI_RESULT (induction_phi);
-
-  /* Create the iv update inside the loop */
-  stmts = NULL;
-  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
-  vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
-  vec_def = gimple_convert (&stmts, vectype, vec_def);
-  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
-  new_stmt = SSA_NAME_DEF_STMT (vec_def);
-
-  /* Set the arguments of the phi node: */
-  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-               UNKNOWN_LOCATION);
-
-  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
-  *vec_stmt = induction_phi;
-
-  /* In case that vectorization factor (VF) is bigger than the number
-     of elements that we can fit in a vectype (nunits), we have to generate
-     more than one vector stmt - i.e - we need to "unroll" the
-     vector stmt by a factor VF/nunits. For more details see documentation
-     in vectorizable_operation. */
-
-  if (ncopies > 1)
+  /* Re-use IVs when we can. We are generating further vector
+     stmts by adding VF' * stride to the IVs generated above. */
+  if (ivn < nvects)
     {
-      gimple_seq seq = NULL;
-      /* FORNOW. This restriction should be relaxed. */
-      gcc_assert (!nested_in_vect_loop);
-      /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
-      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
-
-      /* Create the vector that holds the step of the induction. */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      if (nunits.is_constant (&const_nunits))
        {
-         expr = build_int_cst (integer_type_node, nunits);
-         expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
+         unsigned vfp = (least_common_multiple (group_size, const_nunits)
+                         / group_size);
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
        }
       else
-       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
-      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
-                               expr, step_expr);
-      if (seq)
        {
-         new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
-         gcc_assert (!new_bb);
-       }
-
-      t = unshare_expr (new_name);
-      gcc_assert (CONSTANT_CLASS_P (new_name)
-                  || TREE_CODE (new_name) == SSA_NAME);
-      new_vec = build_vector_from_val (step_vectype, t);
-      vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                                   new_vec, step_vectype, NULL);
-
-      vec_def = induc_def;
-      for (i = 1; i < ncopies + 1; i++)
-       {
-         /* vec_i = vec_prev + vec_step */
-         gimple_seq stmts = NULL;
-         vec_def = gimple_convert (&stmts, step_vectype, vec_def);
-         vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
-         vec_def = gimple_convert (&stmts, vectype, vec_def);
-
-         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
-         if (i < ncopies)
+         if (SCALAR_FLOAT_TYPE_P (stept))
           {
-             new_stmt = SSA_NAME_DEF_STMT (vec_def);
-             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+             tree tem = build_int_cst (integer_type_node, nunits);
+             lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
           }
         else
+           lupdate_mul = build_int_cst (stept, nunits);
+         lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
+                                                     lupdate_mul);
+       }
+      for (; ivn < nvects; ++ivn)
+       {
+         gimple *iv
+           = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
+         tree def = gimple_get_lhs (iv);
+         if (ivn < 2*nivs)
+           vec_steps[ivn - nivs]
+             = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                             vec_steps[ivn - nivs], lupdate_mul);
+         gimple_seq stmts = NULL;
+         def = gimple_convert (&stmts, step_vectype, def);
+         def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                             def, vec_steps[ivn % nivs]);
+         def = gimple_convert (&stmts, vectype, def);
+         if (gimple_code (iv) == GIMPLE_PHI)
+           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         else
           {
-             /* vec_1 = vec_iv + (VF/n * S)
-                vec_2 = vec_1 + (VF/n * S)
-                ...
-                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
-
-                vec_n is used as vec_loop to save the large step register and
-                related operations. */
-             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                          UNKNOWN_LOCATION);
+             gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
+             gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
           }
+         slp_node->push_vec_def (def);
        }
     }
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "transform induction: created def-use cycle: %G%G",
-                     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
+  new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+  gcc_assert (!new_bb);

   return true;
 }
@@ -11683,7 +11079,7 @@ vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
   factor = exact_div (nunits1, nunits2).to_constant ();
   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   gimple_seq seq = NULL;
-  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
                            build_int_cst (iv_type, factor));
   if (seq)
     gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
@@ -11743,7 +11139,7 @@ scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool fl
   profile_count entry_count = loop_preheader_edge (loop)->count ();
   /* If we have unreliable loop profile avoid dropping entry
-     count bellow header count. This can happen since loops
+     count below header count. This can happen since loops
      has unrealistically low trip counts. */
   while (vf > 1
         && loop->header->count > entry_count
         && loop->header->count < entry_count * vf)
@@ -12373,6 +11769,13 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
     }
+
+  /* When we have unrolled the loop due to a user requested value we should
+     leave it up to the RTL unroll heuristics to determine if it's still worth
+     while to unroll more. */
+  if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
+    loop->unroll = 0;
+
   /* Free SLP instances here because otherwise stmt reference counting
      won't work. */
   slp_instance instance;
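
Editor's note, not part of the patch above: the SELECT_VL comment in the induction hunk is easiest to follow with a concrete model. The self-contained C++ sketch below is illustrative only; the names N, base, step and remaining are invented for the example, and it simply assumes .SELECT_VL returns min (remaining, N), which is one valid behaviour but not the only one a target may choose. It prints the vector IV the loop would see: first built as [base, base+step, ..., base+(N-1)*step] (the vec_init the code constructs), then advanced each iteration by len * step (the 'up' value, vec_step scaled by the selected length) rather than by a fixed VF * step.

// Illustrative sketch only -- not GCC code.
#include <algorithm>
#include <cstdio>

int main ()
{
  const long N = 4;        // nunits of the chosen vector type (assumed)
  const long base = 7;     // induction initial value (assumed)
  const long step = 3;     // induction step (assumed)
  long remaining = 10;     // scalar iterations left (assumed)

  // vec_init = [base, base+step, base+2*step, ...]
  long iv[N];
  for (long i = 0; i < N; ++i)
    iv[i] = base + i * step;

  while (remaining > 0)
    {
      // Stand-in for len = .SELECT_VL (remaining, N); inactive lanes
      // are printed in parentheses.
      long len = std::min (remaining, N);
      for (long i = 0; i < N; ++i)
        std::printf (i < len ? "%ld " : "(%ld) ", iv[i]);
      std::printf ("\n");
      // The IV advances by the number of elements actually processed,
      // i.e. vec_iv += [len*step, len*step, ...], not by a fixed VF*step.
      for (long i = 0; i < N; ++i)
        iv[i] += len * step;
      remaining -= len;
    }
  return 0;
}

Because the final iteration may process fewer than N elements, scaling the step by the selected length is what keeps the vector IV consistent with the scalar IV; that is the reason the hunk multiplies vec_step by the loop length when LOOP_VINFO_USING_SELECT_VL_P is set.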