Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 1201 |
1 file changed, 678 insertions, 523 deletions
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 3e973e7..c9dcc64 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_ALGORITHM #include "config.h" #include "system.h" #include "coretypes.h" @@ -813,7 +814,7 @@ bb_in_loop_p (const_basic_block bb, const void *data) stmt_vec_info structs for all the stmts in LOOP_IN. */ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) - : vec_info (vec_info::loop, init_cost (loop_in), shared), + : vec_info (vec_info::loop, init_cost (loop_in, false), shared), loop (loop_in), bbs (XCNEWVEC (basic_block, loop->num_nodes)), num_itersm1 (NULL_TREE), @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) th (0), versioning_threshold (0), vectorization_factor (0), + main_loop_edge (nullptr), + skip_main_loop_edge (nullptr), + skip_this_loop_edge (nullptr), + reusable_accumulators (), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -836,6 +841,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) single_scalar_iteration_cost (0), vec_outside_cost (0), vec_inside_cost (0), + inner_loop_cost_factor (param_vect_inner_loop_cost_factor), vectorizable (false), can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), using_partial_vectors_p (false), @@ -1237,7 +1243,7 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) /* FORNOW. */ innerloop_iters = 1; if (loop->inner) - innerloop_iters = 50; /* FIXME */ + innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); for (i = 0; i < nbbs; i++) { @@ -1278,24 +1284,28 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) else kind = scalar_stmt; + /* We are using vect_prologue here to avoid scaling twice + by the inner loop factor. */ record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), factor, kind, stmt_info, 0, vect_prologue); } } /* Now accumulate cost. */ - void *target_cost_data = init_cost (loop); + void *target_cost_data = init_cost (loop, true); stmt_info_for_cost *si; int j; FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count, si->kind, si->stmt_info, si->vectype, - si->misalign, vect_body); - unsigned dummy, body_cost = 0; - finish_cost (target_cost_data, &dummy, &body_cost, &dummy); + si->misalign, si->where); + unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0; + finish_cost (target_cost_data, &prologue_cost, &body_cost, + &epilogue_cost); destroy_cost_data (target_cost_data); - LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost; + LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) + = prologue_cost + body_cost + epilogue_cost; } @@ -1509,6 +1519,13 @@ vect_analyze_loop_form (class loop *loop, vec_info_shared *shared) stmt_vec_info inner_loop_cond_info = loop_vinfo->lookup_stmt (inner_loop_cond); STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; + /* If we have an estimate on the number of iterations of the inner + loop use that to limit the scale for costing, otherwise use + --param vect-inner-loop-cost-factor literally. 
*/ + widest_int nit; + if (estimated_stmt_executions (loop->inner, &nit)) + LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo) + = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi (); } gcc_assert (!loop->aux); @@ -2723,7 +2740,7 @@ again: /* Reset target cost data. */ destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) - = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); + = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false); /* Reset accumulated rgroup information. */ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo)); release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); @@ -2766,7 +2783,15 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, /* Limit the VFs to what is likely to be the maximum number of iterations, to handle cases in which at least one loop_vinfo is fully-masked. */ - HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_max_niter; + loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo); + unsigned HOST_WIDE_INT main_vf; + if (main_loop + && LOOP_VINFO_NITERS_KNOWN_P (main_loop) + && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf)) + estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf; + else + estimated_max_niter = likely_max_stmt_executions_int (loop); if (estimated_max_niter != -1) { if (known_le (estimated_max_niter, new_vf)) @@ -3058,7 +3083,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) = opt_loop_vec_info::success (main_loop_vinfo); } else - delete main_loop_vinfo; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** No longer preferring vector" + " mode %s after reanalyzing the loop" + " as a main loop\n", + GET_MODE_NAME + (main_loop_vinfo->vector_mode)); + delete main_loop_vinfo; + } } } @@ -3208,7 +3242,7 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn) Return FALSE if CODE currently cannot be vectorized as reduction. */ -static bool +bool reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) { switch (code) @@ -3247,23 +3281,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) } } -/* If there is a neutral value X such that SLP reduction NODE would not - be affected by the introduction of additional X elements, return that X, - otherwise return null. CODE is the code of the reduction and VECTOR_TYPE - is the vector type that would hold element X. REDUC_CHAIN is true if - the SLP statements perform a single reduction, false if each statement - performs an independent reduction. */ +/* If there is a neutral value X such that a reduction would not be affected + by the introduction of additional X elements, return that X, otherwise + return null. CODE is the code of the reduction and SCALAR_TYPE is type + of the scalar elements. If the reduction has just a single initial value + then INITIAL_VALUE is that value, otherwise it is null. 
*/ static tree -neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, - tree_code code, bool reduc_chain) +neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; - tree scalar_type = TREE_TYPE (vector_type); - class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; - gcc_assert (loop); - switch (code) { case WIDEN_SUM_EXPR: @@ -3283,13 +3309,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, case MAX_EXPR: case MIN_EXPR: - /* For MIN/MAX the initial values are neutral. A reduction chain - has only a single initial value, so that value is neutral for - all statements. */ - if (reduc_chain) - return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, - loop_preheader_edge (loop)); - return NULL_TREE; + return initial_value; default: return NULL_TREE; @@ -4465,7 +4485,7 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, vect_reduction_type reduction_type, int ncopies, stmt_vector_for_cost *cost_vec) { - int prologue_cost = 0, epilogue_cost = 0, inside_cost; + int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0; enum tree_code code; optab optab; tree vectype; @@ -4620,64 +4640,58 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, prologue_cost, epilogue_cost); } +/* SEQ is a sequence of instructions that initialize the reduction + described by REDUC_INFO. Emit them in the appropriate place. */ +static void +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, gimple *seq) +{ + if (reduc_info->reused_accumulator) + { + /* When reusing an accumulator from the main loop, we only need + initialization instructions if the main loop can be skipped. + In that case, emit the initialization instructions at the end + of the guard block that does the skip. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + gcc_assert (skip_edge); + gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src); + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); + } + else + { + /* The normal case: emit the initialization instructions on the + preheader edge. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); + } +} /* Function get_initial_def_for_reduction Input: - STMT_VINFO - a stmt that performs a reduction operation in the loop. + REDUC_INFO - the info_for_reduction INIT_VAL - the initial value of the reduction variable + NEUTRAL_OP - a value that has no effect on the reduction, as per + neutral_op_for_reduction Output: - ADJUSTMENT_DEF - a tree that holds a value to be added to the final result - of the reduction (used for adjusting the epilog - see below). Return a vector variable, initialized according to the operation that STMT_VINFO performs. This vector will be used as the initial value of the vector of partial results. - Option1 (adjust in epilog): Initialize the vector as follows: - add/bit or/xor: [0,0,...,0,0] - mult/bit and: [1,1,...,1,1] - min/max/cond_expr: [init_val,init_val,..,init_val,init_val] - and when necessary (e.g. add/mult case) let the caller know - that it needs to adjust the result by init_val. - - Option2: Initialize the vector as follows: - add/bit or/xor: [init_val,0,0,...,0] - mult/bit and: [init_val,1,1,...,1] - min/max/cond_expr: [init_val,init_val,...,init_val] - and no adjustments are needed. 
- - For example, for the following code: - - s = init_val; - for (i=0;i<n;i++) - s = s + a[i]; - - STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'. - For a vector of 4 units, we want to return either [0,0,0,init_val], - or [0,0,0,0] and let the caller know that it needs to adjust - the result at the end by 'init_val'. - - FORNOW, we are using the 'adjust in epilog' scheme, because this way the - initialization vector is simpler (same element in all entries), if - ADJUSTMENT_DEF is not NULL, and Option2 otherwise. - - A cost model should help decide between these two schemes. */ + The value we need is a vector in which element 0 has value INIT_VAL + and every other element has value NEUTRAL_OP. */ static tree get_initial_def_for_reduction (loop_vec_info loop_vinfo, - stmt_vec_info stmt_vinfo, - enum tree_code code, tree init_val, - tree *adjustment_def) + stmt_vec_info reduc_info, + tree init_val, tree neutral_op) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree scalar_type = TREE_TYPE (init_val); tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); - tree def_for_init; tree init_def; - REAL_VALUE_TYPE real_init_val = dconst0; - int int_init_val = 0; gimple_seq stmts = NULL; gcc_assert (vectype); @@ -4685,115 +4699,64 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo, gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) || SCALAR_FLOAT_TYPE_P (scalar_type)); - gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) - || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); + gcc_assert (nested_in_vect_loop_p (loop, reduc_info) + || loop == (gimple_bb (reduc_info->stmt))->loop_father); - /* ADJUSTMENT_DEF is NULL when called from - vect_create_epilog_for_reduction to vectorize double reduction. */ - if (adjustment_def) - *adjustment_def = NULL; - - switch (code) + if (operand_equal_p (init_val, neutral_op)) { - case WIDEN_SUM_EXPR: - case DOT_PROD_EXPR: - case SAD_EXPR: - case PLUS_EXPR: - case MINUS_EXPR: - case BIT_IOR_EXPR: - case BIT_XOR_EXPR: - case MULT_EXPR: - case BIT_AND_EXPR: - { - if (code == MULT_EXPR) - { - real_init_val = dconst1; - int_init_val = 1; - } - - if (code == BIT_AND_EXPR) - int_init_val = -1; - - if (SCALAR_FLOAT_TYPE_P (scalar_type)) - def_for_init = build_real (scalar_type, real_init_val); - else - def_for_init = build_int_cst (scalar_type, int_init_val); - - if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) - { - /* Option1: the first element is '0' or '1' as well. */ - if (!operand_equal_p (def_for_init, init_val, 0)) - *adjustment_def = init_val; - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - } - else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) - { - /* Option2 (variable length): the first element is INIT_VAL. */ - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, - vectype, init_def, init_val); - } - else - { - /* Option2: the first element is INIT_VAL. */ - tree_vector_builder elts (vectype, 1, 2); - elts.quick_push (init_val); - elts.quick_push (def_for_init); - init_def = gimple_build_vector (&stmts, &elts); - } - } - break; - - case MIN_EXPR: - case MAX_EXPR: - case COND_EXPR: - { - init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); - init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); - } - break; - - default: - gcc_unreachable (); + /* If both elements are equal then the vector described above is + just a splat. 
*/ + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op); + } + else + { + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) + { + /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into + element 0. */ + init_def = gimple_build_vector_from_val (&stmts, vectype, + neutral_op); + init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, + vectype, init_def, init_val); + } + else + { + /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */ + tree_vector_builder elts (vectype, 1, 2); + elts.quick_push (init_val); + elts.quick_push (neutral_op); + init_def = gimple_build_vector (&stmts, &elts); + } } if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts); return init_def; } -/* Get at the initial defs for the reduction PHIs in SLP_NODE. - NUMBER_OF_VECTORS is the number of vector defs to create. - If NEUTRAL_OP is nonnull, introducing extra elements of that - value will not change the result. */ +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, + which performs a reduction involving GROUP_SIZE scalar statements. + NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP + is nonnull, introducing extra elements of that value will not change the + result. */ static void -get_initial_defs_for_reduction (vec_info *vinfo, - slp_tree slp_node, +get_initial_defs_for_reduction (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, vec<tree> *vec_oprnds, unsigned int number_of_vectors, - bool reduc_chain, tree neutral_op) + unsigned int group_size, tree neutral_op) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; + vec<tree> &initial_values = reduc_info->reduc_initial_values; unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; - tree vector_type; - unsigned int group_size = stmts.length (); + tree vector_type = STMT_VINFO_VECTYPE (reduc_info); unsigned int i; - class loop *loop; - - vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); - - gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); - loop = (gimple_bb (stmt_vinfo->stmt))->loop_father; - gcc_assert (loop); - edge pe = loop_preheader_edge (loop); - - gcc_assert (!reduc_chain || neutral_op); + gcc_assert (group_size == initial_values.length () || neutral_op); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. @@ -4823,18 +4786,13 @@ get_initial_defs_for_reduction (vec_info *vinfo, { tree op; i = j % group_size; - stmt_vinfo = stmts[i]; /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ - if (reduc_chain) - op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); - else if (((vec_oprnds->length () + 1) * nunits - - number_of_places_left_in_vector >= group_size) - && neutral_op) + if (i >= initial_values.length () || (j > i && neutral_op)) op = neutral_op; else - op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); + op = initial_values[i]; /* Create 'vect_ = {op0,op1,...,opn}'. 
*/ number_of_places_left_in_vector--; @@ -4870,8 +4828,8 @@ get_initial_defs_for_reduction (vec_info *vinfo, { /* First time round, duplicate ELTS to fill the required number of vectors. */ - duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, - number_of_vectors, *vec_oprnds); + duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, + elts, number_of_vectors, *vec_oprnds); break; } vec_oprnds->quick_push (init); @@ -4883,7 +4841,7 @@ get_initial_defs_for_reduction (vec_info *vinfo, } } if (ctor_seq != NULL) - gsi_insert_seq_on_edge_immediate (pe, ctor_seq); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq); } /* For a statement STMT_INFO taking part in a reduction operation return @@ -4905,15 +4863,200 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) } else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) { - edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); - stmt_vec_info info - = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); + stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) stmt_info = info; } return stmt_info; } +/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that + REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise + return false. */ + +static bool +vect_find_reusable_accumulator (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info) +{ + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + if (!main_loop_vinfo) + return false; + + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) + return false; + + unsigned int num_phis = reduc_info->reduc_initial_values.length (); + auto_vec<tree, 16> main_loop_results (num_phis); + auto_vec<tree, 16> initial_values (num_phis); + if (edge main_loop_edge = loop_vinfo->main_loop_edge) + { + /* The epilogue loop can be entered either from the main loop or + from an earlier guard block. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + for (tree incoming_value : reduc_info->reduc_initial_values) + { + /* Look for: + + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), + INITIAL_VALUE(guard block)>. */ + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); + + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); + gcc_assert (gimple_bb (phi) == main_loop_edge->dest); + + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); + + main_loop_results.quick_push (from_main_loop); + initial_values.quick_push (from_skip); + } + } + else + /* The main loop dominates the epilogue loop. */ + main_loop_results.splice (reduc_info->reduc_initial_values); + + /* See if the main loop has the kind of accumulator we need. */ + vect_reusable_accumulator *accumulator + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); + if (!accumulator + || num_phis != accumulator->reduc_info->reduc_scalar_results.length () + || !std::equal (main_loop_results.begin (), main_loop_results.end (), + accumulator->reduc_info->reduc_scalar_results.begin ())) + return false; + + /* Handle the case where we can reduce wider vectors to narrower ones. 
*/ + tree vectype = STMT_VINFO_VECTYPE (reduc_info); + tree old_vectype = TREE_TYPE (accumulator->reduc_input); + if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype), + TYPE_VECTOR_SUBPARTS (vectype))) + return false; + + /* Non-SLP reductions might apply an adjustment after the reduction + operation, in order to simplify the initialization of the accumulator. + If the epilogue loop carries on from where the main loop left off, + it should apply the same adjustment to the final reduction result. + + If the epilogue loop can also be entered directly (rather than via + the main loop), we need to be able to handle that case in the same way, + with the same adjustment. (In principle we could add a PHI node + to select the correct adjustment, but in practice that shouldn't be + necessary.) */ + tree main_adjustment + = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); + if (loop_vinfo->main_loop_edge && main_adjustment) + { + gcc_assert (num_phis == 1); + tree initial_value = initial_values[0]; + /* Check that we can use INITIAL_VALUE as the adjustment and + initialize the accumulator with a neutral value instead. */ + if (!operand_equal_p (initial_value, main_adjustment)) + return false; + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), + code, initial_value); + } + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; + reduc_info->reduc_initial_values.truncate (0); + reduc_info->reduc_initial_values.splice (initial_values); + reduc_info->reused_accumulator = accumulator; + return true; +} + +/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation + CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */ + +static tree +vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code, + gimple_seq *seq) +{ + unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant (); + unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + tree stype = TREE_TYPE (vectype); + tree new_temp = vec_def; + while (nunits > nunits1) + { + nunits /= 2; + tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), + stype, nunits); + unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); + + /* The target has to make sure we support lowpart/highpart + extraction, either via direct vector extract or through + an integer mode punning. */ + tree dst1, dst2; + gimple *epilog_stmt; + if (convert_optab_handler (vec_extract_optab, + TYPE_MODE (TREE_TYPE (new_temp)), + TYPE_MODE (vectype1)) + != CODE_FOR_nothing) + { + /* Extract sub-vectors directly once vec_extract becomes + a conversion optab. */ + dst1 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst1, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (0))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst2 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst2, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (bitsize))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + else + { + /* Extract via punning to appropriately sized integer mode + vector. 
*/ + tree eltype = build_nonstandard_integer_type (bitsize, 1); + tree etype = build_vector_type (eltype, 2); + gcc_assert (convert_optab_handler (vec_extract_optab, + TYPE_MODE (etype), + TYPE_MODE (eltype)) + != CODE_FOR_nothing); + tree tem = make_ssa_name (etype); + epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + etype, new_temp)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + new_temp = tem; + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (0))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst1 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (bitsize))); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + dst2 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + + new_temp = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); + gimple_seq_add_stmt_without_update (seq, epilog_stmt); + } + + return new_temp; +} + /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -5004,15 +5147,18 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; gimple *use_stmt; - bool nested_in_vect_loop = false; - auto_vec<gimple *> new_phis; + auto_vec<tree> reduc_inputs; int j, i; - auto_vec<tree> scalar_results; + vec<tree> &scalar_results = reduc_info->reduc_scalar_results; unsigned int group_size = 1, k; auto_vec<gimple *> phis; - bool slp_reduc = false; + /* SLP reduction without reduction chain, e.g., + # a1 = phi <a2, a0> + # b1 = phi <b2, b0> + a2 = operation (a1) + b2 = operation (b1) */ + bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); bool direct_slp_reduc; - tree new_phi_result; tree induction_index = NULL_TREE; if (slp_node) @@ -5022,38 +5168,39 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { outer_loop = loop; loop = loop->inner; - nested_in_vect_loop = true; - gcc_assert (!slp_node); + gcc_assert (!slp_node && double_reduc); } - gcc_assert (!nested_in_vect_loop || double_reduc); vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); gcc_assert (vectype); mode = TYPE_MODE (vectype); - tree initial_def = NULL; tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) ; else { - /* Get at the scalar def before the loop, that defines the initial value - of the reduction variable. */ - initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, - loop_preheader_edge (loop)); /* Optimize: for induction condition reduction, if we can't use zero for induc_val, use initial_def. 
*/ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); else if (double_reduc) ; - else if (nested_in_vect_loop) - ; else adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); } + stmt_vec_info single_live_out_stmt[] = { stmt_info }; + array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; + if (slp_reduc) + /* All statements produce live-out values. */ + live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); + else if (slp_node) + /* The last statement in the reduction chain produces the live-out + value. */ + single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; + unsigned vec_num; int ncopies; if (slp_node) @@ -5204,31 +5351,28 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (double_reduc) loop = outer_loop; exit_bb = single_exit (loop)->dest; - new_phis.create (slp_node ? vec_num : ncopies); + exit_gsi = gsi_after_labels (exit_bb); + reduc_inputs.create (slp_node ? vec_num : ncopies); for (unsigned i = 0; i < vec_num; i++) { + gimple_seq stmts = NULL; if (slp_node) def = vect_get_slp_vect_def (slp_node, i); else def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); for (j = 0; j < ncopies; j++) - { + { tree new_def = copy_ssa_name (def); - phi = create_phi_node (new_def, exit_bb); - if (j == 0) - new_phis.quick_push (phi); - else - { - def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); - new_phis.quick_push (phi); - } - - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - } + phi = create_phi_node (new_def, exit_bb); + if (j) + def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + new_def = gimple_convert (&stmts, vectype, new_def); + reduc_inputs.quick_push (new_def); + } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); } - exit_gsi = gsi_after_labels (exit_bb); - /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 (i.e. when reduc_fn is not available) and in the final adjustment code (if needed). Also get the original scalar reduction variable as @@ -5246,19 +5390,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); } - scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); + scalar_dest = gimple_get_lhs (orig_stmt_info->stmt); scalar_type = TREE_TYPE (scalar_dest); scalar_results.create (group_size); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* SLP reduction without reduction chain, e.g., - # a1 = phi <a2, a0> - # b1 = phi <b2, b0> - a2 = operation (a1) - b2 = operation (b1) */ - slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); - /* True if we should implement SLP_REDUC using native reduction operations instead of scalar operations. */ direct_slp_reduc = (reduc_fn != IFN_LAST @@ -5270,52 +5407,60 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, a2 = operation (a1) a3 = operation (a2), - we may end up with more than one vector result. Here we reduce them to - one vector. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) + we may end up with more than one vector result. Here we reduce them + to one vector. + + The same is true if we couldn't use a single defuse cycle. 
*/ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) + || direct_slp_reduc + || ncopies > 1) { gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (k = 1; k < new_phis.length (); k++) - { - gimple *next_phi = new_phis[k]; - tree second_vect = PHI_RESULT (next_phi); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } + tree single_input = reduc_inputs[0]; + for (k = 1; k < reduc_inputs.length (); k++) + single_input = gimple_build (&stmts, code, vectype, + single_input, reduc_inputs[k]); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + reduc_inputs.truncate (0); + reduc_inputs.safe_push (single_input); } - /* Likewise if we couldn't use a single defuse cycle. */ - else if (ncopies > 1) + + tree orig_reduc_input = reduc_inputs[0]; + + /* If this loop is an epilogue loop that can be skipped after the + main loop, we can only share a reduction operation between the + main loop and the epilogue if we put it at the target of the + skip edge. + + We can still reuse accumulators if this check fails. Doing so has + the minor(?) benefit of making the epilogue loop's scalar result + independent of the main loop's scalar result. */ + bool unify_with_main_loop_p = false; + if (reduc_info->reused_accumulator + && loop_vinfo->skip_this_loop_edge + && single_succ_p (exit_bb) + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) { - gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (int k = 1; k < ncopies; ++k) - { - tree second_vect = PHI_RESULT (new_phis[k]); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + unify_with_main_loop_p = true; + + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; + reduc_inputs[0] = make_ssa_name (vectype); + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), + UNKNOWN_LOCATION); + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); + exit_gsi = gsi_after_labels (reduc_block); } - else - new_phi_result = PHI_RESULT (new_phis[0]); + + /* Shouldn't be used beyond this point. */ + exit_bb = nullptr; if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { - /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing + /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing various data values where the condition matched and another vector (INDUCTION_INDEX) containing all the indexes of those matches. We need to extract the last matching index (which will be the index with @@ -5345,10 +5490,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Vector of {0, 0, 0,...}. 
*/ tree zero_vec = build_zero_cst (vectype); - gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - /* Find maximum value from the vector of found indexes. */ tree max_index = make_ssa_name (index_scalar_type); gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, @@ -5366,7 +5507,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes with the vector (INDUCTION_INDEX) of found indexes, choosing values - from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) + from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) otherwise. Only one value should match, resulting in a vector (VEC_COND) with one data value and the rest zeros. In the case where the loop never made any matches, every index will @@ -5385,7 +5526,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, zero. */ tree vec_cond = make_ssa_name (vectype); gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, - vec_compare, new_phi_result, + vec_compare, + reduc_inputs[0], zero_vec); gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); @@ -5415,7 +5557,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Convert the reduced value back to the result type and set as the result. */ - stmts = NULL; + gimple_seq stmts = NULL; new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, data_reduc); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5433,7 +5575,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, val = data_reduc[i], idx_val = induction_index[i]; return val; */ - tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); + tree data_eltype = TREE_TYPE (vectype); tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); @@ -5457,7 +5599,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, build3 (BIT_FIELD_REF, data_eltype, - new_phi_result, + reduc_inputs[0], bitsize_int (el_size), bitsize_int (off))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); @@ -5509,10 +5651,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, "Reduce using direct vector reduction.\n"); gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); + vec_elem_type = TREE_TYPE (vectype); new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), - vec_elem_type, new_phi_result); + vec_elem_type, reduc_inputs[0]); new_temp = gimple_convert (&stmts, scalar_type, new_temp); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5525,6 +5666,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5542,12 +5684,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, neutral value. We can then do a normal reduction on each vector. */ /* Enforced by vectorizable_reduction. 
*/ - gcc_assert (new_phis.length () == 1); + gcc_assert (reduc_inputs.length () == 1); gcc_assert (pow2p_hwi (group_size)); - slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; - vec<stmt_vec_info> orig_phis - = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); gimple_seq seq = NULL; /* Build a vector {0, 1, 2, ...}, with the same number of elements @@ -5570,10 +5709,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree neutral_op = NULL_TREE; if (slp_node) { - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); - neutral_op - = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, - vectype, code, first != NULL); + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + initial_value = reduc_info->reduc_initial_values[0]; + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, + initial_value); } if (neutral_op) vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5585,9 +5725,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for MIN and MAX reduction, for example. */ if (!neutral_op) { - tree scalar_value - = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, - loop_preheader_edge (loop)); + tree scalar_value = reduc_info->reduc_initial_values[i]; scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), scalar_value); vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5598,7 +5736,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, sel[j] = (index[j] == i); - which selects the elements of NEW_PHI_RESULT that should + which selects the elements of REDUC_INPUTS[0] that should be included in the result. */ tree compare_val = build_int_cst (index_elt_type, i); compare_val = build_vector_from_val (index_type, compare_val); @@ -5607,11 +5745,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Calculate the equivalent of: - vec = seq ? new_phi_result : vector_identity; + vec = seq ? reduc_inputs[0] : vector_identity; VEC is now suitable for a full vector reduction. */ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, - sel, new_phi_result, vector_identity); + sel, reduc_inputs[0], vector_identity); /* Do the reduction and convert it to the appropriate type. */ tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), @@ -5626,7 +5764,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, bool reduce_with_shift; tree vec_temp; - gcc_assert (slp_reduc || new_phis.length () == 1); + gcc_assert (slp_reduc || reduc_inputs.length () == 1); /* See if the target wants to do the final (shift) reduction in a vector mode of smaller size and first reduce upper/lower @@ -5636,7 +5774,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); unsigned nunits1 = nunits; if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode - && new_phis.length () == 1) + && reduc_inputs.length () == 1) { nunits1 = GET_MODE_NUNITS (mode1).to_constant (); /* For SLP reductions we have to make sure lanes match up, but @@ -5668,87 +5806,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* First reduce the vector to the desired vector size we should do shift reduction on by combining upper and lower halves. 
*/ - new_temp = new_phi_result; - while (nunits > nunits1) - { - nunits /= 2; - vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), - stype, nunits); - unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); - - /* The target has to make sure we support lowpart/highpart - extraction, either via direct vector extract or through - an integer mode punning. */ - tree dst1, dst2; - if (convert_optab_handler (vec_extract_optab, - TYPE_MODE (TREE_TYPE (new_temp)), - TYPE_MODE (vectype1)) - != CODE_FOR_nothing) - { - /* Extract sub-vectors directly once vec_extract becomes - a conversion optab. */ - dst1 = make_ssa_name (vectype1); - epilog_stmt - = gimple_build_assign (dst1, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, vectype1, - new_temp, TYPE_SIZE (vectype1), - bitsize_int (0))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst2 = make_ssa_name (vectype1); - epilog_stmt - = gimple_build_assign (dst2, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, vectype1, - new_temp, TYPE_SIZE (vectype1), - bitsize_int (bitsize))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } - else - { - /* Extract via punning to appropriately sized integer mode - vector. */ - tree eltype = build_nonstandard_integer_type (bitsize, 1); - tree etype = build_vector_type (eltype, 2); - gcc_assert (convert_optab_handler (vec_extract_optab, - TYPE_MODE (etype), - TYPE_MODE (eltype)) - != CODE_FOR_nothing); - tree tem = make_ssa_name (etype); - epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - etype, new_temp)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_temp = tem; - tem = make_ssa_name (eltype); - epilog_stmt - = gimple_build_assign (tem, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, eltype, - new_temp, TYPE_SIZE (eltype), - bitsize_int (0))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst1 = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - vectype1, tem)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - tem = make_ssa_name (eltype); - epilog_stmt - = gimple_build_assign (tem, BIT_FIELD_REF, - build3 (BIT_FIELD_REF, eltype, - new_temp, TYPE_SIZE (eltype), - bitsize_int (bitsize))); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - dst2 = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, - build1 (VIEW_CONVERT_EXPR, - vectype1, tem)); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } - - new_temp = make_ssa_name (vectype1); - epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_phis[0] = epilog_stmt; - } + gimple_seq stmts = NULL; + new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1, + code, &stmts); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + reduc_inputs[0] = new_temp; if (reduce_with_shift && !slp_reduc) { @@ -5828,13 +5890,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, int element_bitsize = tree_to_uhwi (bitsize); tree compute_type = TREE_TYPE (vectype); gimple_seq stmts = NULL; - FOR_EACH_VEC_ELT (new_phis, i, new_phi) + FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) { int bit_offset; - if (gimple_code (new_phi) == GIMPLE_PHI) - vec_temp = PHI_RESULT (new_phi); - else - vec_temp = gimple_assign_lhs (new_phi); new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, vec_temp, bitsize, bitsize_zero_node); @@ -5881,6 +5939,7 @@ 
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, first_res, res); scalar_results[j % group_size] = new_res; } + scalar_results.truncate (group_size); for (k = 0; k < group_size; k++) scalar_results[k] = gimple_convert (&stmts, scalar_type, scalar_results[k]); @@ -5904,6 +5963,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tree tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5922,13 +5982,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { gcc_assert (!slp_reduc); gimple_seq stmts = NULL; - if (nested_in_vect_loop) + if (double_reduc) { - new_phi = new_phis[0]; gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); new_temp = gimple_build (&stmts, code, vectype, - PHI_RESULT (new_phi), adjustment_def); + reduc_inputs[0], adjustment_def); } else { @@ -5941,21 +6000,16 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_seq_last_stmt (stmts); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - if (nested_in_vect_loop) - { - if (!double_reduc) - scalar_results.quick_push (new_temp); - else - scalar_results[0] = new_temp; - } - else - scalar_results[0] = new_temp; - - new_phis[0] = epilog_stmt; + scalar_results[0] = new_temp; } + /* Record this operation if it could be reused by the epilogue loop. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION) + loop_vinfo->reusable_accumulators.put (scalar_results[0], + { orig_reduc_input, reduc_info }); + if (double_reduc) - loop = loop->inner; + loop = outer_loop; /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit phis with new adjusted scalar results, i.e., replace use <s_out0> @@ -5982,47 +6036,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, use <s_out4> use <s_out4> */ - - /* In SLP reduction chain we reduce vector results into one vector if - necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the - LHS of the last stmt in the reduction chain, since we are looking for - the loop exit phi node. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - { - stmt_vec_info dest_stmt_info - = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]); - scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt); - group_size = 1; - } - - /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in - case that REDUC_GROUP_SIZE is greater than vectorization factor). - Therefore, we need to match SCALAR_RESULTS with corresponding statements. - The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results - correspond to the first vector stmt, etc. - (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ - if (group_size > new_phis.length ()) - gcc_assert (!(group_size % new_phis.length ())); - - for (k = 0; k < group_size; k++) + gcc_assert (live_out_stmts.size () == scalar_results.length ()); + for (k = 0; k < live_out_stmts.size (); k++) { - if (slp_reduc) - { - stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; - - orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); - /* SLP statements can't participate in patterns. 
*/ - gcc_assert (!orig_stmt_info); - scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); - } - - if (nested_in_vect_loop) - { - if (double_reduc) - loop = outer_loop; - else - gcc_unreachable (); - } + stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]); + scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); phis.create (3); /* Find the loop-closed-use at the loop exit of the original scalar @@ -6057,6 +6075,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); + + /* Look for a single use at the target of the skip edge. */ + if (unify_with_main_loop_p) + { + use_operand_p use_p; + gimple *user; + if (!single_imm_use (orig_name, &use_p, &user)) + gcc_unreachable (); + orig_name = gimple_get_lhs (user); + } + scalar_result = scalar_results[k]; FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { @@ -6515,33 +6544,31 @@ vectorizable_reduction (loop_vec_info loop_vinfo, stmt_vec_info orig_stmt_of_analysis = stmt_info; stmt_vec_info phi_info = stmt_info; - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def - || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + if (!is_a <gphi *> (stmt_info->stmt)) { - if (!is_a <gphi *> (stmt_info->stmt)) - { - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; - return true; - } - if (slp_node) - { - slp_node_instance->reduc_phis = slp_node; - /* ??? We're leaving slp_node to point to the PHIs, we only - need it to get at the number of vector stmts which wasn't - yet initialized for the instance root. */ - } - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) - stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); - else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ - { - use_operand_p use_p; - gimple *use_stmt; - bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), - &use_p, &use_stmt); - gcc_assert (res); - phi_info = loop_vinfo->lookup_stmt (use_stmt); - stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); - } + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; + } + if (slp_node) + { + slp_node_instance->reduc_phis = slp_node; + /* ??? We're leaving slp_node to point to the PHIs, we only + need it to get at the number of vector stmts which wasn't + yet initialized for the instance root. */ + } + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); + else + { + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) + == vect_double_reduction_def); + use_operand_p use_p; + gimple *use_stmt; + bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), + &use_p, &use_stmt); + gcc_assert (res); + phi_info = loop_vinfo->lookup_stmt (use_stmt); + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); } /* PHIs should not participate in patterns. 
*/ @@ -6662,6 +6689,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool lane_reduc_code_p = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); int op_type = TREE_CODE_LENGTH (code); + enum optab_subtype optab_query_kind = optab_vector; + if (code == DOT_PROD_EXPR + && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt))) + != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt)))) + optab_query_kind = optab_vector_mixed_sign; + scalar_dest = gimple_assign_lhs (stmt); scalar_type = TREE_TYPE (scalar_dest); @@ -6831,10 +6864,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, else if (cond_reduc_dt == vect_constant_def) { enum vect_def_type cond_initial_dt; - tree cond_initial_val - = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); - - gcc_assert (cond_reduc_val != NULL_TREE); + tree cond_initial_val = vect_phi_initial_value (reduc_def_phi); vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); if (cond_initial_dt == vect_constant_def && types_compatible_p (TREE_TYPE (cond_initial_val), @@ -7027,9 +7057,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, /* For SLP reductions, see if there is a neutral value we can use. */ tree neutral_op = NULL_TREE; if (slp_node) - neutral_op = neutral_op_for_slp_reduction - (slp_node_instance->reduc_phis, vectype_out, orig_code, - REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); + { + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) + initial_value = vect_phi_initial_value (reduc_def_phi); + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), + orig_code, initial_value); + } if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) { @@ -7189,7 +7223,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool ok = true; /* 4.1. check support for the operation in the loop */ - optab optab = optab_for_tree_code (code, vectype_in, optab_vector); + optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind); if (!optab) { if (dump_enabled_p ()) @@ -7204,23 +7238,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) - || !vect_worthwhile_without_simd_p (loop_vinfo, code)) + || !vect_can_vectorize_without_simd_p (code)) ok = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); } - /* Worthwhile without SIMD support? */ - if (ok - && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) - && !vect_worthwhile_without_simd_p (loop_vinfo, code)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not worthwhile without SIMD support.\n"); - ok = false; - } + if (vect_emulated_vector_p (vectype_in) + && !vect_can_vectorize_without_simd_p (code)) + { + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using word mode not possible.\n"); + return false; + } /* lane-reducing operations have to go through vect_transform_reduction. For the other cases try without the single cycle optimization. */ @@ -7579,7 +7610,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, vectype_out); /* Get the loop-entry arguments. 
*/ - tree vec_initial_def; + tree vec_initial_def = NULL_TREE; auto_vec<tree> vec_initial_defs; if (slp_node) { @@ -7593,22 +7624,40 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, else { gcc_assert (slp_node == slp_node_instance->reduc_phis); - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); - tree neutral_op - = neutral_op_for_slp_reduction (slp_node, vectype_out, - STMT_VINFO_REDUC_CODE (reduc_info), - first != NULL); - get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - first != NULL, neutral_op); + vec<tree> &initial_values = reduc_info->reduc_initial_values; + vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); + + unsigned int num_phis = stmts.length (); + if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) + num_phis = 1; + initial_values.reserve (num_phis); + for (unsigned int i = 0; i < num_phis; ++i) + { + gphi *this_phi = as_a<gphi *> (stmts[i]->stmt); + initial_values.quick_push (vect_phi_initial_value (this_phi)); + } + if (vec_num == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!initial_values.is_empty ()) + { + tree initial_value + = (num_phis == 1 ? initial_values[0] : NULL_TREE); + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (vectype_out), + code, initial_value); + get_initial_defs_for_reduction (loop_vinfo, reduc_info, + &vec_initial_defs, vec_num, + stmts.length (), neutral_op); + } } } else { /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ - tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, - loop_preheader_edge (loop)); + tree initial_def = vect_phi_initial_value (phi); + reduc_info->reduc_initial_values.safe_push (initial_def); /* Optimize: if initial_def is for REDUC_MAX smaller than the base and we can't use zero for induc_val, use initial_def. Similarly for REDUC_MIN and initial_def larger than the base. */ @@ -7628,9 +7677,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; } vec_initial_def = build_vector_from_val (vectype_out, induc_val); - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); } else if (nested_cycle) { @@ -7640,23 +7686,106 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, ncopies, initial_def, &vec_initial_defs); } + else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION + || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + /* Fill the initial vector with the initial scalar value. 
*/ + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, + initial_def, initial_def); else { - tree adjustment_def = NULL_TREE; - tree *adjustment_defp = &adjustment_def; - enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) - adjustment_defp = NULL; - vec_initial_def - = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code, - initial_def, adjustment_defp); - STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); + if (ncopies == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!reduc_info->reduc_initial_values.is_empty ()) + { + initial_def = reduc_info->reduc_initial_values[0]; + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (initial_def), + code, initial_def); + gcc_assert (neutral_op); + /* Try to simplify the vector initialization by applying an + adjustment after the reduction has been performed. */ + if (!reduc_info->reused_accumulator + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + && !operand_equal_p (neutral_op, initial_def)) + { + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) + = initial_def; + initial_def = neutral_op; + } + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_info, + initial_def, neutral_op); + } } } + if (vec_initial_def) + { + vec_initial_defs.create (ncopies); + for (i = 0; i < ncopies; ++i) + vec_initial_defs.quick_push (vec_initial_def); + } + + if (auto *accumulator = reduc_info->reused_accumulator) + { + tree def = accumulator->reduc_input; + if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) + { + unsigned int nreduc; + bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS + (TREE_TYPE (def)), + TYPE_VECTOR_SUBPARTS (vectype_out), + &nreduc); + gcc_assert (res); + gimple_seq stmts = NULL; + /* Reduce the single vector to a smaller one. */ + if (nreduc != 1) + { + /* Perform the reduction in the appropriate type. */ + tree rvectype = vectype_out; + if (!useless_type_conversion_p (TREE_TYPE (vectype_out), + TREE_TYPE (TREE_TYPE (def)))) + rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)), + TYPE_VECTOR_SUBPARTS + (vectype_out)); + def = vect_create_partial_epilog (def, rvectype, + STMT_VINFO_REDUC_CODE + (reduc_info), + &stmts); + } + if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) + def = gimple_convert (&stmts, vectype_out, def); + /* Adjust the input so we pick up the partially reduced value + for the skip edge in vect_create_epilog_for_reduction. */ + accumulator->reduc_input = def; + if (loop_vinfo->main_loop_edge) + { + /* While we'd like to insert on the edge this will split + blocks and disturb bookkeeping, we also will eventually + need this on the skip edge. Rely on sinking to + fixup optimal placement and insert in the pred. */ + gimple_stmt_iterator gsi + = gsi_last_bb (loop_vinfo->main_loop_edge->src); + /* Insert before a cond that eventually skips the + epilogue. 
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      tree def = accumulator->reduc_input;
+      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
+        {
+          unsigned int nreduc;
+          bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
+                                            (TREE_TYPE (def)),
+                                          TYPE_VECTOR_SUBPARTS (vectype_out),
+                                          &nreduc);
+          gcc_assert (res);
+          gimple_seq stmts = NULL;
+          /* Reduce the single vector to a smaller one.  */
+          if (nreduc != 1)
+            {
+              /* Perform the reduction in the appropriate type.  */
+              tree rvectype = vectype_out;
+              if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
+                                              TREE_TYPE (TREE_TYPE (def))))
+                rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (vectype_out));
+              def = vect_create_partial_epilog (def, rvectype,
+                                                STMT_VINFO_REDUC_CODE
+                                                  (reduc_info),
+                                                &stmts);
+            }
+          if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
+            def = gimple_convert (&stmts, vectype_out, def);
+          /* Adjust the input so we pick up the partially reduced value
+             for the skip edge in vect_create_epilog_for_reduction.  */
+          accumulator->reduc_input = def;
+          if (loop_vinfo->main_loop_edge)
+            {
+              /* While we'd like to insert on the edge itself, doing so
+                 would split blocks and disturb bookkeeping, and we will
+                 eventually need the sequence on the skip edge as well.
+                 Rely on sinking to fix up the optimal placement and
+                 insert in the predecessor instead.  */
+              gimple_stmt_iterator gsi
+                = gsi_last_bb (loop_vinfo->main_loop_edge->src);
+              /* Insert before a cond that eventually skips the
+                 epilogue.  */
+              if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
+                gsi_prev (&gsi);
+              gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
+            }
+          else
+            gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
+                                              stmts);
+        }
+      if (loop_vinfo->main_loop_edge)
+        vec_initial_defs[0]
+          = vect_get_main_loop_result (loop_vinfo, def,
+                                       vec_initial_defs[0]);
+      else
+        vec_initial_defs.safe_push (def);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
@@ -7826,47 +7955,39 @@ vectorizable_phi (vec_info *,
   return true;
 }
 
+/* Return true if VECTYPE represents a vector that requires lowering
+   by the vector lowering pass.  */
 
-/* Function vect_min_worthwhile_factor.
+bool
+vect_emulated_vector_p (tree vectype)
+{
+  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
+          && (!VECTOR_BOOLEAN_TYPE_P (vectype)
+              || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
+}
 
-   For a loop where we could vectorize the operation indicated by CODE,
-   return the minimum vectorization factor that makes it worthwhile
-   to use generic vectors.  */
-static unsigned int
-vect_min_worthwhile_factor (enum tree_code code)
+/* Return true if we can emulate CODE on an integer mode representation
+   of a vector.  */
+
+bool
+vect_can_vectorize_without_simd_p (tree_code code)
 {
   switch (code)
     {
     case PLUS_EXPR:
     case MINUS_EXPR:
     case NEGATE_EXPR:
-      return 4;
-
     case BIT_AND_EXPR:
     case BIT_IOR_EXPR:
     case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
-      return 2;
+      return true;
     default:
-      return INT_MAX;
+      return false;
    }
 }
 
-/* Return true if VINFO indicates we are doing loop vectorization and if
-   it is worth decomposing CODE operations into scalar operations for
-   that loop's vectorization factor.  */
-
-bool
-vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
-{
-  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
-  unsigned HOST_WIDE_INT value;
-  return (loop_vinfo
-          && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
-          && value >= vect_min_worthwhile_factor (code));
-}
-
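
The two new predicates split what vect_worthwhile_without_simd_p conflated:
whether a vector type is merely emulated (its mode is not a vector mode), and
whether a given operation is safe to perform on such an integer-mode
representation.  The bitwise cases are lane-local by construction; PLUS_EXPR,
MINUS_EXPR and NEGATE_EXPR work too once the inter-lane carries are
suppressed.  A self-contained C illustration of both points, using standard
SWAR folklore rather than code from the patch:

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    /* Eight 8-bit lanes packed into one 64-bit integer.  */
    uint64_t x = 0x0102030405060708ull;
    uint64_t y = 0x00ff00ff00ff00ffull;

    /* BIT_AND_EXPR / BIT_IOR_EXPR / BIT_XOR_EXPR are lane-local as-is:
       one scalar instruction processes all eight lanes.  */
    uint64_t lane_and = x & y;
    uint64_t lane_xor = x ^ y;

    /* PLUS_EXPR needs the carry out of each lane suppressed: add the
       low 7 bits of every lane, then fix up the top bits modulo 2.  */
    uint64_t h = 0x8080808080808080ull;   /* per-lane sign bits  */
    uint64_t lane_add = ((x & ~h) + (y & ~h)) ^ ((x ^ y) & h);

    printf ("%016llx\n%016llx\n%016llx\n",
            (unsigned long long) lane_and,
            (unsigned long long) lane_xor,
            (unsigned long long) lane_add); /* 0101030305050707  */
    return 0;
  }
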
 /* Function vectorizable_induction
 
    Check if STMT_INFO performs an induction computation that can be
    vectorized.
@@ -8202,11 +8323,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
           /* Fill up to the number of vectors we need for the whole group.  */
           nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
+          vec_steps.reserve (nivs - ivn);
           for (; ivn < nivs; ++ivn)
             {
               SLP_TREE_VEC_STMTS (slp_node)
                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
-              vec_steps.safe_push (vec_steps[0]);
+              vec_steps.quick_push (vec_steps[0]);
             }
         }
@@ -8253,8 +8375,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                     loop_preheader_edge (iv_loop));
+  init_expr = vect_phi_initial_value (phi);
 
   gimple_seq stmts = NULL;
   if (!nested_in_vect_loop)
@@ -9148,6 +9269,7 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
          && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && STMT_VINFO_RELEVANT_P (phi_info)
          && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
          && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
          && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
@@ -9674,7 +9796,10 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            !gsi_end_p (gsi); gsi_next (&gsi))
         {
           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
-          if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
+          if (!call || !gimple_call_internal_p (call))
+            continue;
+          internal_fn ifn = gimple_call_internal_fn (call);
+          if (ifn == IFN_MASK_LOAD)
             {
               tree lhs = gimple_get_lhs (call);
               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
@@ -9684,6 +9809,17 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                   gsi_replace (&gsi, new_stmt, true);
                 }
             }
+          else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
+            {
+              tree lhs = gimple_get_lhs (call);
+              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+                {
+                  tree else_arg
+                    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
+                  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
+                  gsi_replace (&gsi, new_stmt, true);
+                }
+            }
         }
     }                           /* BBs in loop */
@@ -9723,12 +9859,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   /* In these calculations the "- 1" converts loop iteration counts
      back to latch counts.  */
   if (loop->any_upper_bound)
-    loop->nb_iterations_upper_bound
-      = (final_iter_may_be_partial
-         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
-                          lowest_vf) - 1
-         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
-                           lowest_vf) - 1);
+    {
+      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      loop->nb_iterations_upper_bound
+        = (final_iter_may_be_partial
+           ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
+                            lowest_vf) - 1
+           : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
+                             lowest_vf) - 1);
+      if (main_vinfo)
+        {
+          unsigned int bound;
+          poly_uint64 main_iters
+            = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
+                           LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
+          main_iters
+            = upper_bound (main_iters,
+                           LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
+          if (can_div_away_from_zero_p (main_iters,
+                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                        &bound))
+            loop->nb_iterations_upper_bound
+              = wi::umin ((widest_int) (bound - 1),
+                          loop->nb_iterations_upper_bound);
+        }
+    }
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
       = (final_iter_may_be_partial
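
The new bound above can be read in scalar terms: whenever the epilogue loop
executes, the number of scalar iterations reaching it is bounded by the
larger of the main loop's vectorization factor and its cost/versioning
thresholds, so the epilogue's vector latch count is at most
ceil (main_iters / epilogue_vf) - 1, which is then min'ed with the bound
already computed.  A tiny C sketch of that arithmetic, with illustrative
names that are not GCC internals:

  #include <stdio.h>

  /* Illustrative only: upper bound on the epilogue loop's latch count,
     given that at most MAIN_ITERS scalar iterations can reach it.  */
  static unsigned
  epilogue_latch_bound (unsigned main_iters, unsigned epilogue_vf)
  {
    unsigned iters = (main_iters + epilogue_vf - 1) / epilogue_vf; /* ceil  */
    return iters - 1;           /* iteration count -> latch count  */
  }

  int main (void)
  {
    /* E.g. a main loop with VF 16 and cost threshold 20: an epilogue
       with VF 8 runs at most ceil (20 / 8) - 1 = 2 latch iterations.  */
    printf ("%u\n", epilogue_latch_bound (20, 8));
    return 0;
  }
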