Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 836
1 file changed, 601 insertions(+), 235 deletions(-)
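Note: the largest hunk below reworks SLP induction generation in vectorizable_induction so that every lane of the SLP group keeps its own initial value and step, as described by the new comment for group size 3. The following standalone sketch is not part of the patch; it only prints the lane layout that the patch's indexing (`steps[(ivn*const_nunits + eltn) % group_size]`, `mul_elt = (ivn*const_nunits + eltn) / group_size`) produces, for an assumed group size of 3 and 4-lane vectors.

```c
/* Standalone illustration (not GCC code) of the per-lane SLP induction
   layout from the vectorizable_induction hunk: lane k of the concatenated
   vectors holds i(k % group_size) + (k / group_size) * S(k % group_size).  */

#include <stdio.h>

int
main (void)
{
  const unsigned group_size = 3;   /* inductions i0..i2 with steps S0..S2 */
  const unsigned nunits = 4;       /* lanes per vector */
  const unsigned nvects = 3;       /* vector copies needed */

  for (unsigned ivn = 0; ivn < nvects; ++ivn)
    {
      printf ("[");
      for (unsigned eltn = 0; eltn < nunits; ++eltn)
	{
	  unsigned k = ivn * nunits + eltn;  /* global lane index */
	  unsigned iv = k % group_size;      /* which induction this lane uses */
	  unsigned mul = k / group_size;     /* how many steps to add to it */
	  printf ("%si%u + %u*S%u", eltn ? ", " : "", iv, mul, iv);
	}
      printf ("]\n");
    }
  return 0;
}
```

Running it prints [i0 + 0*S0, i1 + 0*S1, i2 + 0*S2, i0 + 1*S0] and so on, matching the example vectors in the patch's comment.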
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index e42f327..d6f1ffc 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -666,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) unsigned i; FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) - if (STMT_VINFO_IN_PATTERN_P (first)) - { - stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); - while (next) - { - if (! STMT_VINFO_IN_PATTERN_P (next) - || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1) - break; - next = REDUC_GROUP_NEXT_ELEMENT (next); - } - /* If not all stmt in the chain are patterns or if we failed - to update STMT_VINFO_REDUC_IDX try to handle the chain - without patterns. */ - if (! next - && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1) - { - vect_fixup_reduc_chain (first); - LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] - = STMT_VINFO_RELATED_STMT (first); - } - } + { + stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); + while (next) + { + if ((STMT_VINFO_IN_PATTERN_P (next) + != STMT_VINFO_IN_PATTERN_P (first)) + || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1) + break; + next = REDUC_GROUP_NEXT_ELEMENT (next); + } + /* If all reduction chain members are well-formed patterns adjust + the group to group the pattern stmts instead. */ + if (! next + && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1) + { + if (STMT_VINFO_IN_PATTERN_P (first)) + { + vect_fixup_reduc_chain (first); + LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] + = STMT_VINFO_RELATED_STMT (first); + } + } + /* If not all stmt in the chain are patterns or if we failed + to update STMT_VINFO_REDUC_IDX dissolve the chain and handle + it as regular reduction instead. */ + else + { + stmt_vec_info vinfo = first; + stmt_vec_info last = NULL; + while (vinfo) + { + next = REDUC_GROUP_NEXT_ELEMENT (vinfo); + REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; + REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; + last = vinfo; + vinfo = next; + } + STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first)) + = vect_internal_def; + loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last)); + LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i); + --i; + } + } } /* Function vect_get_loop_niters. @@ -1804,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) } } + /* If using the "very cheap" model. reject cases in which we'd keep + a copy of the scalar code (even if we might be able to vectorize it). */ + if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP + && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "some scalar iterations would need to be peeled\n"); + return 0; + } + int min_profitable_iters, min_profitable_estimate; vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, &min_profitable_estimate); @@ -1862,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) min_profitable_estimate = min_profitable_iters; } + /* If the vector loop needs multiple iterations to be beneficial then + things are probably too close to call, and the conservative thing + would be to stick with the scalar code. 
*/ + if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP + && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "one iteration of the vector loop would be" + " more expensive than the equivalent number of" + " iterations of the scalar loop\n"); + return 0; + } + HOST_WIDE_INT estimated_niter; /* If we are vectorizing an epilogue then we know the maximum number of @@ -2275,6 +2325,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Optimize the SLP graph with the vectorization factor fixed. */ vect_optimize_slp (loop_vinfo); + + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); } bool saved_can_use_partial_vectors_p @@ -2342,6 +2395,79 @@ start_over: "unsupported SLP instances\n"); goto again; } + + /* Check whether any load in ALL SLP instances is possibly permuted. */ + slp_tree load_node, slp_root; + unsigned i, x; + slp_instance instance; + bool can_use_lanes = true; + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) + { + slp_root = SLP_INSTANCE_TREE (instance); + int group_size = SLP_TREE_LANES (slp_root); + tree vectype = SLP_TREE_VECTYPE (slp_root); + bool loads_permuted = false; + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) + { + if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) + continue; + unsigned j; + stmt_vec_info load_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) + if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) + { + loads_permuted = true; + break; + } + } + + /* If the loads and stores can be handled with load/store-lane + instructions record it and move on to the next instance. */ + if (loads_permuted + && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store + && vect_store_lanes_supported (vectype, group_size, false)) + { + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) + { + stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_SCALAR_STMTS (load_node)[0]); + /* Use SLP for strided accesses (or if we can't + load-lanes). */ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || ! vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + DR_GROUP_SIZE (stmt_vinfo), false)) + break; + } + + can_use_lanes + = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); + + if (can_use_lanes && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP instance %p can use load/store-lanes\n", + instance); + } + else + { + can_use_lanes = false; + break; + } + } + + /* If all SLP instances can use load/store-lanes abort SLP and try again + with SLP disabled. */ + if (can_use_lanes) + { + ok = opt_result::failure_at (vect_location, + "Built SLP cancelled: can use " + "load/store-lanes\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Built SLP cancelled: all SLP instances support " + "load/store-lanes\n"); + goto again; + } } /* Dissolve SLP-only groups. 
*/ @@ -2572,9 +2698,13 @@ again: STMT_SLP_TYPE (stmt_info) = loop_vect; if (STMT_VINFO_IN_PATTERN_P (stmt_info)) { + stmt_vec_info pattern_stmt_info + = STMT_VINFO_RELATED_STMT (stmt_info); + if (STMT_VINFO_SLP_VECT_ONLY (pattern_stmt_info)) + STMT_VINFO_IN_PATTERN_P (stmt_info) = false; + gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); - stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); - STMT_SLP_TYPE (stmt_info) = loop_vect; + STMT_SLP_TYPE (pattern_stmt_info) = loop_vect; for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq); !gsi_end_p (pi); gsi_next (&pi)) STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) @@ -2643,43 +2773,56 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, /* Check whether the (fractional) cost per scalar iteration is lower or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ - poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost - * poly_widest_int (old_vf)); - poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost - * poly_widest_int (new_vf)); - if (maybe_lt (rel_old, rel_new)) - { - /* When old_loop_vinfo uses a variable vectorization factor, - we know that it has a lower cost for at least one runtime VF. - However, we don't know how likely that VF is. - - One option would be to compare the costs for the estimated VFs. - The problem is that that can put too much pressure on the cost - model. E.g. if the estimated VF is also the lowest possible VF, - and if old_loop_vinfo is 1 unit worse than new_loop_vinfo - for the estimated VF, we'd then choose new_loop_vinfo even - though (a) new_loop_vinfo might not actually be better than - old_loop_vinfo for that VF and (b) it would be significantly - worse at larger VFs. - - Here we go for a hacky compromise: pick new_loop_vinfo if it is - no more expensive than old_loop_vinfo even after doubling the - estimated old_loop_vinfo VF. For all but trivial loops, this - ensures that we only pick new_loop_vinfo if it is significantly - better than old_loop_vinfo at the estimated VF. */ - if (rel_new.is_constant ()) - return false; + poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf; + poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf; + + HOST_WIDE_INT est_rel_new_min + = estimated_poly_value (rel_new, POLY_VALUE_MIN); + HOST_WIDE_INT est_rel_new_max + = estimated_poly_value (rel_new, POLY_VALUE_MAX); + + HOST_WIDE_INT est_rel_old_min + = estimated_poly_value (rel_old, POLY_VALUE_MIN); + HOST_WIDE_INT est_rel_old_max + = estimated_poly_value (rel_old, POLY_VALUE_MAX); + + /* Check first if we can make out an unambigous total order from the minimum + and maximum estimates. */ + if (est_rel_new_min < est_rel_old_min + && est_rel_new_max < est_rel_old_max) + return true; + else if (est_rel_old_min < est_rel_new_min + && est_rel_old_max < est_rel_new_max) + return false; + /* When old_loop_vinfo uses a variable vectorization factor, + we know that it has a lower cost for at least one runtime VF. + However, we don't know how likely that VF is. + + One option would be to compare the costs for the estimated VFs. + The problem is that that can put too much pressure on the cost + model. E.g. if the estimated VF is also the lowest possible VF, + and if old_loop_vinfo is 1 unit worse than new_loop_vinfo + for the estimated VF, we'd then choose new_loop_vinfo even + though (a) new_loop_vinfo might not actually be better than + old_loop_vinfo for that VF and (b) it would be significantly + worse at larger VFs. 
+ + Here we go for a hacky compromise: pick new_loop_vinfo if it is + no more expensive than old_loop_vinfo even after doubling the + estimated old_loop_vinfo VF. For all but trivial loops, this + ensures that we only pick new_loop_vinfo if it is significantly + better than old_loop_vinfo at the estimated VF. */ - HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf); - HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf); - widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost - * widest_int (old_estimated_vf)); - widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost - * widest_int (new_estimated_vf)); - return estimated_rel_new * 2 <= estimated_rel_old; + if (est_rel_old_min != est_rel_new_min + || est_rel_old_max != est_rel_new_max) + { + HOST_WIDE_INT est_rel_new_likely + = estimated_poly_value (rel_new, POLY_VALUE_LIKELY); + HOST_WIDE_INT est_rel_old_likely + = estimated_poly_value (rel_old, POLY_VALUE_LIKELY); + + return est_rel_new_likely * 2 <= est_rel_old_likely; } - if (known_lt (rel_new, rel_old)) - return true; /* If there's nothing to choose between the loop bodies, see whether there's a difference in the prologue and epilogue costs. */ @@ -3230,14 +3373,17 @@ pop: fail = true; break; } - /* Check there's only a single stmt the op is used on inside - of the loop. */ + /* Check there's only a single stmt the op is used on. For the + not value-changing tail and the last stmt allow out-of-loop uses. + ??? We could relax this and handle arbitrary live stmts by + forcing a scalar epilogue for example. */ imm_use_iterator imm_iter; gimple *op_use_stmt; unsigned cnt = 0; FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) if (!is_gimple_debug (op_use_stmt) - && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) + && (*code != ERROR_MARK + || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))) { /* We want to allow x + x but not x < 1 ? x : 2. */ if (is_gimple_assign (op_use_stmt) @@ -4420,34 +4566,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, } -/* Function vect_model_induction_cost. - - Models cost for induction operations. */ - -static void -vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies, - stmt_vector_for_cost *cost_vec) -{ - unsigned inside_cost, prologue_cost; - - if (PURE_SLP_STMT (stmt_info)) - return; - - /* loop cost for vec_loop. */ - inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, - stmt_info, 0, vect_body); - - /* prologue cost for vec_init and vec_step. 
*/ - prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, - stmt_info, 0, vect_prologue); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vect_model_induction_cost: inside_cost = %d, " - "prologue_cost = %d .\n", inside_cost, prologue_cost); -} - - /* Function get_initial_def_for_reduction @@ -5160,8 +5278,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, int scalar_precision = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); tree scalar_type_unsigned = make_unsigned_type (scalar_precision); - tree vectype_unsigned = build_vector_type - (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); + tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned, + vectype); /* First we need to create a vector (ZERO_VEC) of zeros and another vector (MAX_INDEX_VEC) filled with the last matching index, which we @@ -6313,9 +6431,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) { if (is_a <gphi *> (stmt_info->stmt)) - /* Analysis for double-reduction is done on the outer - loop PHI, nested cycles have no further restrictions. */ - STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; + { + if (slp_node) + { + /* We eventually need to set a vector type on invariant + arguments. */ + unsigned j; + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) + if (!vect_maybe_update_slp_op_vectype + (child, SLP_TREE_VECTYPE (slp_node))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for " + "invariants\n"); + return false; + } + } + /* Analysis for double-reduction is done on the outer + loop PHI, nested cycles have no further restrictions. */ + STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; + } else STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; return true; @@ -6805,8 +6942,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, int scalar_precision = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); cr_index_scalar_type = make_unsigned_type (scalar_precision); - cr_index_vector_type = build_vector_type (cr_index_scalar_type, - nunits_out); + cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type, + vectype_out); if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, OPTIMIZE_FOR_SPEED)) @@ -7377,15 +7514,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, if (slp_node) { vec_initial_defs.reserve (vec_num); - gcc_assert (slp_node == slp_node_instance->reduc_phis); - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); - tree neutral_op - = neutral_op_for_slp_reduction (slp_node, vectype_out, - STMT_VINFO_REDUC_CODE (reduc_info), - first != NULL); - get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - first != NULL, neutral_op); + if (nested_cycle) + { + unsigned phi_idx = loop_preheader_edge (loop)->dest_idx; + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx], + &vec_initial_defs); + } + else + { + gcc_assert (slp_node == slp_node_instance->reduc_phis); + stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); + tree neutral_op + = neutral_op_for_slp_reduction (slp_node, vectype_out, + STMT_VINFO_REDUC_CODE (reduc_info), + first != NULL); + get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, + &vec_initial_defs, vec_num, + first != NULL, neutral_op); + } } else { @@ -7491,6 +7637,17 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo, if 
(!vec_stmt) /* transformation not required. */ { + /* Deal with copies from externs or constants that disguise as + loop-closed PHI nodes (PR97886). */ + if (slp_node + && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0], + SLP_TREE_VECTYPE (slp_node))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; return true; } @@ -7520,6 +7677,81 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo, return true; } +/* Vectorizes PHIs. */ + +bool +vectorizable_phi (vec_info *, + stmt_vec_info stmt_info, gimple **vec_stmt, + slp_tree slp_node, stmt_vector_for_cost *cost_vec) +{ + if (!is_a <gphi *> (stmt_info->stmt) || !slp_node) + return false; + + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) + return false; + + tree vectype = SLP_TREE_VECTYPE (slp_node); + + if (!vec_stmt) /* transformation not required. */ + { + slp_tree child; + unsigned i; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child) + if (!child) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "PHI node with unvectorized backedge def\n"); + return false; + } + else if (!vect_maybe_update_slp_op_vectype (child, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } + record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), + vector_stmt, stmt_info, vectype, 0, vect_body); + STMT_VINFO_TYPE (stmt_info) = phi_info_type; + return true; + } + + tree scalar_dest = gimple_phi_result (stmt_info->stmt); + basic_block bb = gimple_bb (stmt_info->stmt); + tree vec_dest = vect_create_destination_var (scalar_dest, vectype); + auto_vec<gphi *> new_phis; + for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i) + { + slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; + + /* Skip not yet vectorized defs. */ + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def + && SLP_TREE_VEC_STMTS (child).is_empty ()) + continue; + + auto_vec<tree> vec_oprnds; + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds); + if (!new_phis.exists ()) + { + new_phis.create (vec_oprnds.length ()); + for (unsigned j = 0; j < vec_oprnds.length (); j++) + { + /* Create the vectorized LC PHI node. */ + new_phis.quick_push (create_phi_node (vec_dest, bb)); + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]); + } + } + edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i); + for (unsigned j = 0; j < vec_oprnds.length (); j++) + add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION); + } + /* We should have at least one already vectorized child. */ + gcc_assert (new_phis.exists ()); + + return true; +} + /* Function vect_min_worthwhile_factor. @@ -7590,7 +7822,6 @@ vectorizable_induction (loop_vec_info loop_vinfo, poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); unsigned i; tree expr; - gimple_seq stmts; gimple_stmt_iterator si; gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); @@ -7630,10 +7861,6 @@ vectorizable_induction (loop_vec_info loop_vinfo, return false; } - /* FORNOW: outer loop induction with SLP not supported. 
*/ - if (STMT_SLP_TYPE (stmt_info)) - return false; - exit_phi = NULL; latch_e = loop_latch_edge (loop->inner); loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); @@ -7672,7 +7899,7 @@ vectorizable_induction (loop_vec_info loop_vinfo, if (slp_node && !nunits.is_constant ()) { - /* The current SLP code creates the initial value element-by-element. */ + /* The current SLP code creates the step value element-by-element. */ if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "SLP induction not supported for variable-length" @@ -7682,9 +7909,50 @@ vectorizable_induction (loop_vec_info loop_vinfo, if (!vec_stmt) /* transformation not required. */ { + unsigned inside_cost = 0, prologue_cost = 0; + if (slp_node) + { + /* We eventually need to set a vector type on invariant + arguments. */ + unsigned j; + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) + if (!vect_maybe_update_slp_op_vectype + (child, SLP_TREE_VECTYPE (slp_node))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for " + "invariants\n"); + return false; + } + /* loop cost for vec_loop. */ + inside_cost + = record_stmt_cost (cost_vec, + SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), + vector_stmt, stmt_info, 0, vect_body); + /* prologue cost for vec_init (if not nested) and step. */ + prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop, + scalar_to_vec, + stmt_info, 0, vect_prologue); + } + else /* if (!slp_node) */ + { + /* loop cost for vec_loop. */ + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, + stmt_info, 0, vect_body); + /* prologue cost for vec_init and vec_step. */ + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, + stmt_info, 0, vect_prologue); + } + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_induction_cost: inside_cost = %d, " + "prologue_cost = %d .\n", inside_cost, + prologue_cost); + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; DUMP_VECT_SCOPE ("vectorizable_induction"); - vect_model_induction_cost (stmt_info, ncopies, cost_vec); return true; } @@ -7703,164 +7971,195 @@ vectorizable_induction (loop_vec_info loop_vinfo, tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); pe = loop_preheader_edge (iv_loop); - init_expr = PHI_ARG_DEF_FROM_EDGE (phi, - loop_preheader_edge (iv_loop)); - - stmts = NULL; - if (!nested_in_vect_loop) - { - /* Convert the initial value to the IV update type. */ - tree new_type = TREE_TYPE (step_expr); - init_expr = gimple_convert (&stmts, new_type, init_expr); - - /* If we are using the loop mask to "peel" for alignment then we need - to adjust the start value here. */ - tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); - if (skip_niters != NULL_TREE) - { - if (FLOAT_TYPE_P (vectype)) - skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, - skip_niters); - else - skip_niters = gimple_convert (&stmts, new_type, skip_niters); - tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, - skip_niters, step_expr); - init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, - init_expr, skip_step); - } - } - - if (stmts) - { - new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); - gcc_assert (!new_bb); - } - /* Find the first insertion point in the BB. 
*/ basic_block bb = gimple_bb (phi); si = gsi_after_labels (bb); /* For SLP induction we have to generate several IVs as for example - with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S] - [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform - [VF*S, VF*S, VF*S, VF*S] for all. */ + with group size 3 we need + [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1] + [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */ if (slp_node) { /* Enforced above. */ unsigned int const_nunits = nunits.to_constant (); - /* Generate [VF*S, VF*S, ... ]. */ - if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) + /* The initial values are vectorized, but any lanes > group_size + need adjustment. */ + slp_tree init_node + = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; + + /* Gather steps. Since we do not vectorize inductions as + cycles we have to reconstruct the step from SCEV data. */ + unsigned group_size = SLP_TREE_LANES (slp_node); + tree *steps = XALLOCAVEC (tree, group_size); + tree *inits = XALLOCAVEC (tree, group_size); + stmt_vec_info phi_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) { - expr = build_int_cst (integer_type_node, vf); - expr = fold_convert (TREE_TYPE (step_expr), expr); + steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); + if (!init_node) + inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt), + pe->dest_idx); } - else - expr = build_int_cst (TREE_TYPE (step_expr), vf); - new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), - expr, step_expr); - if (! CONSTANT_CLASS_P (new_name)) - new_name = vect_init_vector (loop_vinfo, stmt_info, new_name, - TREE_TYPE (step_expr), NULL); - new_vec = build_vector_from_val (step_vectype, new_name); - vec_step = vect_init_vector (loop_vinfo, stmt_info, - new_vec, step_vectype, NULL); /* Now generate the IVs. */ - unsigned group_size = SLP_TREE_LANES (slp_node); unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - unsigned elts = const_nunits * nvects; - /* Compute the number of distinct IVs we need. First reduce - group_size if it is a multiple of const_nunits so we get - one IV for a group_size of 4 but const_nunits 2. */ - unsigned group_sizep = group_size; - if (group_sizep % const_nunits == 0) - group_sizep = group_sizep / const_nunits; - unsigned nivs = least_common_multiple (group_sizep, - const_nunits) / const_nunits; - gcc_assert (elts % group_size == 0); - tree elt = init_expr; + gcc_assert ((const_nunits * nvects) % group_size == 0); + unsigned nivs; + if (nested_in_vect_loop) + nivs = nvects; + else + { + /* Compute the number of distinct IVs we need. First reduce + group_size if it is a multiple of const_nunits so we get + one IV for a group_size of 4 but const_nunits 2. */ + unsigned group_sizep = group_size; + if (group_sizep % const_nunits == 0) + group_sizep = group_sizep / const_nunits; + nivs = least_common_multiple (group_sizep, + const_nunits) / const_nunits; + } + tree stept = TREE_TYPE (step_vectype); + tree lupdate_mul = NULL_TREE; + if (!nested_in_vect_loop) + { + /* The number of iterations covered in one vector iteration. */ + unsigned lup_mul = (nvects * const_nunits) / group_size; + lupdate_mul + = build_vector_from_val (step_vectype, + SCALAR_FLOAT_TYPE_P (stept) + ? 
build_real_from_wide (stept, lup_mul, + UNSIGNED) + : build_int_cstu (stept, lup_mul)); + } + tree peel_mul = NULL_TREE; + gimple_seq init_stmts = NULL; + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + { + if (SCALAR_FLOAT_TYPE_P (stept)) + peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, + LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + else + peel_mul = gimple_convert (&init_stmts, stept, + LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + peel_mul = gimple_build_vector_from_val (&init_stmts, + step_vectype, peel_mul); + } unsigned ivn; + auto_vec<tree> vec_steps; for (ivn = 0; ivn < nivs; ++ivn) { - tree_vector_builder elts (step_vectype, const_nunits, 1); - stmts = NULL; + tree_vector_builder step_elts (step_vectype, const_nunits, 1); + tree_vector_builder init_elts (vectype, const_nunits, 1); + tree_vector_builder mul_elts (step_vectype, const_nunits, 1); for (unsigned eltn = 0; eltn < const_nunits; ++eltn) { - if (ivn*const_nunits + eltn >= group_size - && (ivn * const_nunits + eltn) % group_size == 0) - elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), - elt, step_expr); - elts.quick_push (elt); - } - vec_init = gimple_build_vector (&stmts, &elts); - vec_init = gimple_convert (&stmts, vectype, vec_init); - if (stmts) - { - new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); - gcc_assert (!new_bb); + /* The scalar steps of the IVs. */ + tree elt = steps[(ivn*const_nunits + eltn) % group_size]; + elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt); + step_elts.quick_push (elt); + if (!init_node) + { + /* The scalar inits of the IVs if not vectorized. */ + elt = inits[(ivn*const_nunits + eltn) % group_size]; + if (!useless_type_conversion_p (TREE_TYPE (vectype), + TREE_TYPE (elt))) + elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, + TREE_TYPE (vectype), elt); + init_elts.quick_push (elt); + } + /* The number of steps to add to the initial values. */ + unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; + mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) + ? build_real_from_wide (stept, + mul_elt, UNSIGNED) + : build_int_cstu (stept, mul_elt)); } + vec_step = gimple_build_vector (&init_stmts, &step_elts); + vec_steps.safe_push (vec_step); + tree step_mul = gimple_build_vector (&init_stmts, &mul_elts); + if (peel_mul) + step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, + step_mul, peel_mul); + if (!init_node) + vec_init = gimple_build_vector (&init_stmts, &init_elts); /* Create the induction-phi that defines the induction-operand. 
*/ - vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, + "vec_iv_"); induction_phi = create_phi_node (vec_dest, iv_loop->header); induc_def = PHI_RESULT (induction_phi); /* Create the iv update inside the loop */ + tree up = vec_step; + if (lupdate_mul) + up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, + vec_step, lupdate_mul); gimple_seq stmts = NULL; vec_def = gimple_convert (&stmts, step_vectype, induc_def); vec_def = gimple_build (&stmts, - PLUS_EXPR, step_vectype, vec_def, vec_step); + PLUS_EXPR, step_vectype, vec_def, up); vec_def = gimple_convert (&stmts, vectype, vec_def); gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), + UNKNOWN_LOCATION); + + if (init_node) + vec_init = vect_get_slp_vect_def (init_node, ivn); + if (!nested_in_vect_loop + && !integer_zerop (step_mul)) + { + vec_def = gimple_convert (&init_stmts, step_vectype, vec_init); + up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, + vec_step, step_mul); + vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, + vec_def, up); + vec_init = gimple_convert (&init_stmts, vectype, vec_def); + } /* Set the arguments of the phi node: */ add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); - add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), - UNKNOWN_LOCATION); SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi); } - /* Fill up to the number of vectors we need for the whole group. */ - nivs = least_common_multiple (group_size, - const_nunits) / const_nunits; - for (; ivn < nivs; ++ivn) - SLP_TREE_VEC_STMTS (slp_node) - .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]); + if (!nested_in_vect_loop) + { + /* Fill up to the number of vectors we need for the whole group. */ + nivs = least_common_multiple (group_size, + const_nunits) / const_nunits; + for (; ivn < nivs; ++ivn) + { + SLP_TREE_VEC_STMTS (slp_node) + .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]); + vec_steps.safe_push (vec_steps[0]); + } + } - /* Re-use IVs when we can. */ + /* Re-use IVs when we can. We are generating further vector + stmts by adding VF' * stride to the IVs generated above. */ if (ivn < nvects) { unsigned vfp = least_common_multiple (group_size, const_nunits) / group_size; - /* Generate [VF'*S, VF'*S, ... ]. */ - if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) - { - expr = build_int_cst (integer_type_node, vfp); - expr = fold_convert (TREE_TYPE (step_expr), expr); - } - else - expr = build_int_cst (TREE_TYPE (step_expr), vfp); - new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), - expr, step_expr); - if (! CONSTANT_CLASS_P (new_name)) - new_name = vect_init_vector (loop_vinfo, stmt_info, new_name, - TREE_TYPE (step_expr), NULL); - new_vec = build_vector_from_val (step_vectype, new_name); - vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec, - step_vectype, NULL); + tree lupdate_mul + = build_vector_from_val (step_vectype, + SCALAR_FLOAT_TYPE_P (stept) + ? 
build_real_from_wide (stept, + vfp, UNSIGNED) + : build_int_cstu (stept, vfp)); for (; ivn < nvects; ++ivn) { gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]; - tree def; - if (gimple_code (iv) == GIMPLE_PHI) - def = gimple_phi_result (iv); - else - def = gimple_assign_lhs (iv); + tree def = gimple_get_lhs (iv); + if (ivn < 2*nivs) + vec_steps[ivn - nivs] + = gimple_build (&init_stmts, MULT_EXPR, step_vectype, + vec_steps[ivn - nivs], lupdate_mul); gimple_seq stmts = NULL; def = gimple_convert (&stmts, step_vectype, def); - def = gimple_build (&stmts, - PLUS_EXPR, step_vectype, def, vec_step); + def = gimple_build (&stmts, PLUS_EXPR, step_vectype, + def, vec_steps[ivn % nivs]); def = gimple_convert (&stmts, vectype, def); if (gimple_code (iv) == GIMPLE_PHI) gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); @@ -7874,9 +8173,45 @@ vectorizable_induction (loop_vec_info loop_vinfo, } } + new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts); + gcc_assert (!new_bb); + return true; } + init_expr = PHI_ARG_DEF_FROM_EDGE (phi, + loop_preheader_edge (iv_loop)); + + gimple_seq stmts = NULL; + if (!nested_in_vect_loop) + { + /* Convert the initial value to the IV update type. */ + tree new_type = TREE_TYPE (step_expr); + init_expr = gimple_convert (&stmts, new_type, init_expr); + + /* If we are using the loop mask to "peel" for alignment then we need + to adjust the start value here. */ + tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + if (skip_niters != NULL_TREE) + { + if (FLOAT_TYPE_P (vectype)) + skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, + skip_niters); + else + skip_niters = gimple_convert (&stmts, new_type, skip_niters); + tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, + skip_niters, step_expr); + init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, + init_expr, skip_step); + } + } + + if (stmts) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + /* Create the vector that holds the initial_value of the induction. */ if (nested_in_vect_loop) { @@ -8376,8 +8711,19 @@ vectorizable_live_operation (vec_info *vinfo, gimple_seq stmts = NULL; new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts, true, NULL_TREE); - - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (TREE_CODE (new_tree) == SSA_NAME + && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)) + SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1; + if (is_a <gphi *> (vec_stmt)) + { + gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt)); + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + } + else + { + gimple_stmt_iterator si = gsi_for_stmt (vec_stmt); + gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT); + } /* Replace use of lhs with newly computed result. If the use stmt is a single arg PHI, just replace all uses of PHI result. It's necessary @@ -8414,6 +8760,24 @@ vectorizable_live_operation (vec_info *vinfo, "def\n"); continue; } + /* ??? It can also happen that we end up pulling a def into + a loop where replacing out-of-loop uses would require + a new LC SSA PHI node. Retain the original scalar in + those cases as well. PR98064. 
*/ + if (TREE_CODE (new_tree) == SSA_NAME + && !SSA_NAME_IS_DEFAULT_DEF (new_tree) + && (gimple_bb (use_stmt)->loop_father + != gimple_bb (vec_stmt)->loop_father) + && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father, + gimple_bb (use_stmt)->loop_father)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Using original scalar computation for " + "live lane because there is an out-of-loop " + "definition for it\n"); + continue; + } FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) SET_USE (use_p, new_tree); update_stmt (use_stmt); @@ -8734,7 +9098,7 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo, When vectorizing STMT_INFO as a store, set *SEEN_STORE to its stmt_vec_info. */ -static void +static bool vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) { @@ -8750,7 +9114,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) - return; + return false; if (STMT_VINFO_VECTYPE (stmt_info)) { @@ -8767,13 +9131,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, /* Pure SLP statements have already been vectorized. We still need to apply loop vectorization to hybrid SLP statements. */ if (PURE_SLP_STMT (stmt_info)) - return; + return false; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL)) *seen_store = stmt_info; + + return true; } /* Helper function to pass to simplify_replace_tree to enable replacing tree's @@ -9199,17 +9565,17 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) } stmt_vec_info pat_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); - vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si, - &seen_store); - maybe_set_vectorized_backedge_value (loop_vinfo, - pat_stmt_info); + if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, + &si, &seen_store)) + maybe_set_vectorized_backedge_value (loop_vinfo, + pat_stmt_info); } else { - vect_transform_loop_stmt (loop_vinfo, stmt_info, &si, - &seen_store); - maybe_set_vectorized_backedge_value (loop_vinfo, - stmt_info); + if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si, + &seen_store)) + maybe_set_vectorized_backedge_value (loop_vinfo, + stmt_info); } } gsi_next (&si); |
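The vect_better_loop_vinfo_p hunk above replaces the poly_widest_int cross-multiplication with a comparison of POLY_VALUE_MIN/MAX/LIKELY estimates of inside_cost times the other candidate's VF. The sketch below is a minimal standalone model of that decision logic, not GCC code: the hypothetical cost_estimate struct stands in for the three estimated_poly_value results that the real code computes from poly_int64 products.

```c
/* Sketch of the reworked cost comparison: prefer the new loop_vinfo only if
   the cross-multiplied per-scalar-iteration costs
   new_inside_cost * old_vf  vs.  old_inside_cost * new_vf
   order unambiguously, or if the new candidate is at least twice as cheap
   at the likely runtime VF.  */

#include <stdbool.h>
#include <stdio.h>

struct cost_estimate
{
  long min;     /* estimate at the smallest possible runtime VF */
  long max;     /* estimate at the largest possible runtime VF */
  long likely;  /* estimate at the most likely runtime VF */
};

static bool
better_loop_vinfo_p (struct cost_estimate rel_new, struct cost_estimate rel_old)
{
  /* First try to establish an unambiguous total order from the minimum
     and maximum estimates.  */
  if (rel_new.min < rel_old.min && rel_new.max < rel_old.max)
    return true;
  if (rel_old.min < rel_new.min && rel_old.max < rel_new.max)
    return false;

  /* The ranges overlap: only pick the new candidate when it is at least
     twice as cheap at the likely VF, mirroring the "hacky compromise"
     comment retained by the patch.  */
  if (rel_old.min != rel_new.min || rel_old.max != rel_new.max)
    return rel_new.likely * 2 <= rel_old.likely;

  /* Identical estimates: the real code falls through to comparing
     prologue and epilogue costs instead.  */
  return false;
}

int
main (void)
{
  struct cost_estimate rel_new = { 40, 160, 80 };
  struct cost_estimate rel_old = { 50, 150, 170 };
  printf ("prefer new: %d\n", better_loop_vinfo_p (rel_new, rel_old));
  return 0;
}
```

In the example, neither the min nor the max estimates give a consistent winner, so the likely-VF test decides: 80 * 2 <= 170, so the new candidate is preferred.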