Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 325
1 file changed, 305 insertions, 20 deletions
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index d679115..5deb780 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1121,12 +1121,15 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     versioning_threshold (0),
     vectorization_factor (0),
     max_vectorization_factor (0),
+    mask_compare_type (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
     vectorizable (false),
+    can_fully_mask_p (true),
+    fully_masked_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     operands_swapped (false),
@@ -1168,6 +1171,17 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
   gcc_assert (nbbs == loop->num_nodes);
 }
 
+/* Free all levels of MASKS.  */
+
+void
+release_vec_loop_masks (vec_loop_masks *masks)
+{
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (*masks, i, rgm)
+    rgm->masks.release ();
+  masks->release ();
+}
 
 /* Free all memory used by the _loop_vec_info, as well as all the
    stmt_vec_info structs of all the stmts in the loop.  */
@@ -1233,9 +1247,98 @@ _loop_vec_info::~_loop_vec_info ()
   free (bbs);
 
+  release_vec_loop_masks (&masks);
+
   loop->aux = NULL;
 }
 
+/* Return true if we can use CMP_TYPE as the comparison type to produce
+   all masks required to mask LOOP_VINFO.  */
+
+static bool
+can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
+{
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+    if (rgm->mask_type != NULL_TREE
+        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
+                                            cmp_type, rgm->mask_type,
+                                            OPTIMIZE_FOR_SPEED))
+      return false;
+  return true;
+}
+
+/* Calculate the maximum number of scalars per iteration for every
+   rgroup in LOOP_VINFO.  */
+
+static unsigned int
+vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
+{
+  unsigned int res = 1;
+  unsigned int i;
+  rgroup_masks *rgm;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+    res = MAX (res, rgm->max_nscalars_per_iter);
+  return res;
+}
+
+/* Each statement in LOOP_VINFO can be masked where necessary.  Check
+   whether we can actually generate the masks required.  Return true if so,
+   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
+
+static bool
+vect_verify_full_masking (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned int min_ni_width;
+
+  /* Get the maximum number of iterations that is representable
+     in the counter type.  */
+  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+  /* Get a more refined estimate for the number of iterations.  */
+  widest_int max_back_edges;
+  if (max_loop_iterations (loop, &max_back_edges))
+    max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+  /* Account for rgroup masks, in which each bit is replicated N times.  */
+  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+
+  /* Work out how many bits we need to represent the limit.  */
+  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+
+  /* Find a scalar mode for which WHILE_ULT is supported.  */
+  opt_scalar_int_mode cmp_mode_iter;
+  tree cmp_type = NULL_TREE;
+  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
+    {
+      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+      if (cmp_bits >= min_ni_width
+          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
+        {
+          tree this_type = build_nonstandard_integer_type (cmp_bits, true);
+          if (this_type
+              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
+            {
+              /* Although we could stop as soon as we find a valid mode,
+                 it's often better to continue until we hit Pmode, since the
+                 operands to the WHILE are more likely to be reusable in
+                 address calculations.  */
+              cmp_type = this_type;
+              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
+                break;
+            }
+        }
+    }
+
+  if (!cmp_type)
+    return false;
+
+  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  return true;
+}
 
 /* Calculate the cost of one scalar iteration of the loop.  */
 
 static void
@@ -1980,6 +2083,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
       vect_update_vf_for_slp (loop_vinfo);
     }
 
+  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+
+  /* We don't expect to have to roll back to anything other than an empty
+     set of rgroups.  */
+  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+
   /* This is the point where we can re-start analysis with SLP forced off.  */
 start_over:
@@ -2068,11 +2177,47 @@ start_over:
       return false;
     }
 
+  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "can't use a fully-masked loop because peeling for"
+                         " gaps is required.\n");
+    }
+
+  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "can't use a fully-masked loop because peeling for"
+                         " alignment is required.\n");
+    }
+
+  /* Decide whether to use a fully-masked loop for this vectorization
+     factor.  */
+  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+       && vect_verify_full_masking (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "using a fully-masked loop.\n");
+      else
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "not using a fully-masked loop.\n");
+    }
+
   /* If epilog loop is required because of data accesses with gaps,
      one additional iteration needs to be peeled.  Check if there is
      enough iterations for vectorization.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
@@ -2153,8 +2298,11 @@ start_over:
   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
   unsigned HOST_WIDE_INT const_vf;
-  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    /* The main loop handles all iterations.  */
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
     {
       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
@@ -2212,7 +2360,8 @@ start_over:
         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 
       /* Niters for at least one iteration of vectorized loop.  */
-      niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
         niters_th += 1;
@@ -2315,11 +2464,14 @@ again:
   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
+  /* Reset accumulated rgroup information.  */
+  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
+  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
 
   goto start_over;
 }
@@ -3523,7 +3675,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
 
   /* Add additional cost for the peeled instructions in prologue and epilogue
-     loop.
+     loop.  (For fully-masked loops there will be no peeling.)
 
      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
      at compile-time - we assume it's vf/2 (the worst would be vf-1).
@@ -3531,7 +3683,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (npeel < 0)
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      peel_iters_prologue = 0;
+      peel_iters_epilogue = 0;
+    }
+  else if (npeel < 0)
     {
       peel_iters_prologue = assumed_vf / 2;
       dump_printf (MSG_NOTE, "cost model: "
@@ -3762,8 +3919,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                  "  Calculated minimum iters for profitability: %d\n",
                  min_profitable_iters);
 
-  /* We want the vectorized loop to execute at least once.  */
-  if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
+    /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
 
   if (dump_enabled_p ())
@@ -6737,6 +6895,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use a fully-masked loop due to "
+                             "reduction operation.\n");
+          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+        }
+
       if (first_p)
         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
@@ -7557,8 +7724,19 @@ vectorizable_live_operation (gimple *stmt,
     }
 
   if (!vec_stmt)
-    /* No transformation required.  */
-    return true;
+    {
+      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "can't use a fully-masked loop because "
+                             "a value is live outside the loop.\n");
+          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+        }
+
+      /* No transformation required.  */
+      return true;
+    }
 
   /* If stmt has a related stmt, then use that for getting the lhs.  */
   if (is_pattern_stmt_p (stmt_info))
@@ -7573,6 +7751,8 @@ vectorizable_live_operation (gimple *stmt,
              : TYPE_SIZE (TREE_TYPE (vectype)));
   vec_bitsize = TYPE_SIZE (vectype);
 
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
   if (slp_node)
@@ -7706,6 +7886,97 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo)
   return false;
 }
 
+/* Return a mask type with half the number of elements as TYPE.  */
+
+tree
+vect_halve_mask_nunits (tree type)
+{
+  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
+  return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Return a mask type with twice as many elements as TYPE.  */
+
+tree
+vect_double_mask_nunits (tree type)
+{
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
+  return build_truth_vector_type (nunits, current_vector_size);
+}
+
+/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
+   contain a sequence of NVECTORS masks that each control a vector of type
+   VECTYPE.  */
+
+void
+vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
+                       unsigned int nvectors, tree vectype)
+{
+  gcc_assert (nvectors != 0);
+  if (masks->length () < nvectors)
+    masks->safe_grow_cleared (nvectors);
+  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+  /* The number of scalars per iteration and the number of vectors are
+     both compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      rgm->max_nscalars_per_iter = nscalars_per_iter;
+      rgm->mask_type = build_same_sized_truth_vector_type (vectype);
+    }
+}
+
+/* Given a complete set of masks MASKS, extract mask number INDEX
+   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
+   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
+
+   See the comment above vec_loop_masks for more details about the mask
+   arrangement.  */
+
+tree
+vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
+                    unsigned int nvectors, tree vectype, unsigned int index)
+{
+  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+  tree mask_type = rgm->mask_type;
+
+  /* Populate the rgroup's mask array, if this is the first time we've
+     used it.  */
+  if (rgm->masks.is_empty ())
+    {
+      rgm->masks.safe_grow_cleared (nvectors);
+      for (unsigned int i = 0; i < nvectors; ++i)
+        {
+          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
+          /* Provide a dummy definition until the real one is available.  */
+          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+          rgm->masks[i] = mask;
+        }
+    }
+
+  tree mask = rgm->masks[index];
+  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
+                TYPE_VECTOR_SUBPARTS (vectype)))
+    {
+      /* A loop mask for data type X can be reused for data type Y
+         if X has N times more elements than Y and if Y's elements
+         are N times bigger than X's.  In this case each sequence
+         of N elements in the loop mask will be all-zero or all-one.
+         We can then view-convert the mask so that each sequence of
+         N elements is replaced by a single element.  */
+      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
+                              TYPE_VECTOR_SUBPARTS (vectype)));
+      gimple_seq seq = NULL;
+      mask_type = build_same_sized_truth_vector_type (vectype);
+      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
+      if (seq)
+        gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+    }
+  return mask;
+}
+
 /* Scale profiling counters by estimation for LOOP
    which is vectorized by factor VF.  */
 
@@ -7840,9 +8111,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                               &step_vector, &niters_vector_mult_vf, th,
                               check_profitability, niters_no_overflow);
+
   if (niters_vector == NULL_TREE)
     {
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
+      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+          && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+          && known_eq (lowest_vf, vf))
         {
           niters_vector
             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
@@ -8124,13 +8398,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
   if (integer_onep (step_vector))
     niters_no_overflow = true;
-  slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
-                                   niters_vector_mult_vf,
-                                   !niters_no_overflow);
+  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
+                           niters_vector_mult_vf, !niters_no_overflow);
 
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
   scale_profile_for_vect_loop (loop, assumed_vf);
 
+  /* True if the final iteration might not handle a full vector's
+     worth of scalar iterations.  */
+  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
      is 1 when peeling for gaps because we always need a final scalar
      iteration.  */
@@ -8143,16 +8419,25 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      back to latch counts.  */
   if (loop->any_upper_bound)
     loop->nb_iterations_upper_bound
-      = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
-                        lowest_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+                          lowest_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+                           lowest_vf) - 1);
   if (loop->any_likely_upper_bound)
     loop->nb_iterations_likely_upper_bound
-      = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
-                        lowest_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
+                          lowest_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
+                           lowest_vf) - 1);
   if (loop->any_estimate)
     loop->nb_iterations_estimate
-      = wi::udiv_floor (loop->nb_iterations_estimate + bias,
-                        assumed_vf) - 1;
+      = (final_iter_may_be_partial
+         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+                          assumed_vf) - 1
+         : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+                           assumed_vf) - 1);
 
   if (dump_enabled_p ())
     {
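
As a rough illustration of the loop shape this patch targets (not part of the diff above), the C sketch below models a fully-masked loop in scalar code: the loop always steps by a whole vector, and an IFN_WHILE_ULT-style mask switches off the lanes that would run past the trip count, so no prologue or epilogue peeling is needed. The VF constant and the while_ult helper are hypothetical stand-ins for the target's vectorization factor and WHILE_ULT operation.

#include <stdbool.h>
#include <stdio.h>

#define VF 4  /* assumed vectorization factor (illustrative only) */

/* Scalar model of WHILE_ULT: lane L of MASK is set iff BASE + L < LIMIT.  */
static void
while_ult (unsigned long base, unsigned long limit, bool mask[VF])
{
  for (unsigned int lane = 0; lane < VF; ++lane)
    mask[lane] = base + lane < limit;
}

int
main (void)
{
  double a[10] = { 0 }, b[10];
  unsigned long n = 10;

  for (unsigned long i = 0; i < n; ++i)
    b[i] = (double) i;

  /* The fully-masked loop steps by a whole vector each time; in the final
     iteration the mask simply disables the lanes beyond N, so no scalar
     epilogue is required even when N is not a multiple of VF.  */
  for (unsigned long i = 0; i < n; i += VF)
    {
      bool mask[VF];
      while_ult (i, n, mask);
      for (unsigned int lane = 0; lane < VF; ++lane)
        if (mask[lane])
          a[i + lane] = 2.0 * b[i + lane];
    }

  for (unsigned long i = 0; i < n; ++i)
    printf ("%g\n", a[i]);
  return 0;
}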