author     Kewen Lin <linkw@linux.ibm.com>    2020-07-19 20:40:10 -0500
committer  Kewen Lin <linkw@linux.ibm.com>    2020-07-19 21:13:28 -0500
commit     9fb832ce382d649b7687426e6bc4e5d3715cb78a
tree       3b2bbe610050f8c47da747491c465de3889f1138 /gcc/tree-vect-stmts.c
parent     3ca6f6698c70c4fa0c98822c73364063fa13ceea
vect: Support length-based partial vectors approach
Power9 supports the vector load/store instructions lxvl/stxvl, which allow
us to operate on partial vectors with one specific length. This patch
extends some of the current mask-based partial vector support code for
the length-based approach and adds some length-specific support code.
For now it assumes that only one partial-vector approach can be in use
at a time, and it disables the use of partial vectors if both approaches
co-exist.
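
As a rough illustration only (none of the code below is in the patch), the
kind of tail handling lxvl/stxvl enable can be written by hand with the
vec_xl_len/vec_xst_len built-ins from altivec.h; the vectorizer itself never
emits these built-ins but instead generates the internal functions
IFN_LEN_LOAD/IFN_LEN_STORE shown in the diff below. The copy_bytes function
here is a made-up example.

```c
/* Hand-written sketch, assuming a Power9 target (-mcpu=power9): handle the
   main body with full 16-byte vectors and the remaining tail with a single
   length-controlled load/store pair instead of a scalar epilogue.  */
#include <altivec.h>
#include <stddef.h>

void
copy_bytes (unsigned char *dst, const unsigned char *src, size_t n)
{
  size_t i = 0;
  for (; i + 16 <= n; i += 16)          /* full vectors */
    vec_xst (vec_xl (0, src + i), 0, dst + i);
  if (i < n)                            /* tail of n - i < 16 bytes */
    {
      size_t len = n - i;               /* length in bytes, as lxvl expects */
      vector unsigned char v = vec_xl_len ((unsigned char *) src + i, len);
      vec_xst_len (v, dst + i, len);
    }
}
```
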
As described for the len_load/len_store optabs, the length-based approach
can have two flavors: one measures the length in bytes, the other in
lanes. This patch is mainly implemented and tested for length in bytes,
but as Richard S. suggested, most of the code accounts for both flavors.
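
For a concrete feel of the difference (an illustrative sketch, not code from
the patch), consider a V4SI access covering some number of int lanes: the
lanes flavor would pass the lane count directly, while the bytes flavor,
which is what lxvl/stxvl expect, scales it by the lane size; that scale is
what the rgroup_controls::factor field added below records, and the access
itself is view-converted to a byte vector (VnQI) around the IFN_LEN_* call.

```c
/* Illustrative only: the length operand for a V4SI group of NITEMS int lanes
   under the two len_load/len_store flavors.  */
unsigned int
length_operand (unsigned int nitems, int length_in_bytes)
{
  if (!length_in_bytes)
    /* Lanes flavor: the length is simply the number of lanes.  */
    return nitems;

  /* Bytes flavor: scale by the lane size, i.e.
     factor == GET_MODE_UNIT_SIZE (V4SImode) == 4; the vector value is
     wrapped as V16QI via VIEW_CONVERT_EXPR in the hunks below.  */
  unsigned int factor = 4;
  return nitems * factor;
}
```
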
This also introduces a parameter, vect-partial-vector-usage, which lets
users control when the loop vectorizer considers using partial vectors
as an alternative to falling back to scalar code.
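
As a usage note (not text from the patch): the knob is passed like any other
--param, e.g. --param=vect-partial-vector-usage=0 to forbid partial vectors
entirely; setting it to 0 is what the rs6000_option_override_internal change
listed in the ChangeLog below does explicitly.
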
gcc/ChangeLog:
* config/rs6000/rs6000.c (rs6000_option_override_internal):
Set param_vect_partial_vector_usage to 0 explicitly.
* doc/invoke.texi (vect-partial-vector-usage): Document new option.
* optabs-query.c (get_len_load_store_mode): New function.
* optabs-query.h (get_len_load_store_mode): New declare.
* params.opt (vect-partial-vector-usage): New.
* tree-vect-loop-manip.c (vect_set_loop_controls_directly): Add
handling for vectorization using length-based partial vectors, call
vect_gen_len for length generation, and rename some variables to
refer to items instead of scalars.
(vect_set_loop_condition_partial_vectors): Add handling for
vectorization using length-based partial vectors.
(vect_do_peeling): Allow remaining eiters less than epilogue vf for
LOOP_VINFO_USING_PARTIAL_VECTORS_P.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Init
epil_using_partial_vectors_p.
(_loop_vec_info::~_loop_vec_info): Call release_vec_loop_controls
for lengths destruction.
(vect_verify_loop_lens): New function.
(vect_analyze_loop): Add handling for the epilogue of a loop when it's
marked to use vectorization using partial vectors.
(vect_analyze_loop_2): Add a check to allow only one vectorization
approach using partial vectors at the same time. Check param
vect-partial-vector-usage for the partial vectors decision. Mark
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P if the epilogue is suitable
for using partial vectors. Call release_vec_loop_controls
for lengths destruction.
(vect_estimate_min_profitable_iters): Adjust for loop vectorization
using length-based partial vectors.
(vect_record_loop_mask): Init factor to 1 for vectorization using
mask-based partial vectors.
(vect_record_loop_len): New function.
(vect_get_loop_len): Likewise.
* tree-vect-stmts.c (check_load_store_for_partial_vectors): Add
checks for vectorization using length-based partial vectors. Factor
some code to lambda function get_valid_nvectors.
(vectorizable_store): Add handlings when using length-based partial
vectors.
(vectorizable_load): Likewise.
(vect_gen_len): New function.
* tree-vectorizer.h (struct rgroup_controls): Add field factor
mainly for length-based partial vectors.
(vec_loop_lens): New typedef.
(_loop_vec_info): Add lens and epil_using_partial_vectors_p.
(LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P): New macro.
(LOOP_VINFO_LENS): Likewise.
(LOOP_VINFO_FULLY_WITH_LENGTH_P): Likewise.
(vect_record_loop_len): New declare.
(vect_get_loop_len): Likewise.
(vect_gen_len): Likewise.
Diffstat (limited to 'gcc/tree-vect-stmts.c')
gcc/tree-vect-stmts.c | 167
1 file changed, 154 insertions, 13 deletions
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6730cae..31af46a 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1713,29 +1713,58 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       return;
     }
 
-  machine_mode mask_mode;
-  if (!VECTOR_MODE_P (vecmode)
-      || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
-      || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+  if (!VECTOR_MODE_P (vecmode))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "can't use a fully-masked loop because the target"
-                         " doesn't have the appropriate masked load or"
-                         " store.\n");
+                         "can't operate on partial vectors when emulating"
+                         " vector operations.\n");
       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
       return;
     }
+
   /* We might load more scalars than we need for permuting SLP loads.
      We checked in get_group_load_store_type that the extra elements
      don't leak into a new vector.  */
+  auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits)
+  {
+    unsigned int nvectors;
+    if (can_div_away_from_zero_p (size, nunits, &nvectors))
+      return nvectors;
+    gcc_unreachable ();
+  };
+
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  unsigned int nvectors;
-  if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
-  else
-    gcc_unreachable ();
+  machine_mode mask_mode;
+  bool using_partial_vectors_p = false;
+  if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
+      && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+    {
+      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
+      using_partial_vectors_p = true;
+    }
+
+  machine_mode vmode;
+  if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
+    {
+      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
+      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
+      using_partial_vectors_p = true;
+    }
+
+  if (!using_partial_vectors_p)
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "can't operate on partial vectors because the"
+                         " target doesn't have the appropriate partial"
+                         " vectorization load or store.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
 }
 
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
@@ -7694,6 +7723,14 @@ vectorizable_store (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
        ? &LOOP_VINFO_MASKS (loop_vinfo)
       : NULL);
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length-based approach if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
      realignment.  vect_supportable_dr_alignment always returns either
      dr_aligned or dr_unaligned_supported for masked operations.  */
@@ -8033,6 +8070,41 @@ vectorizable_store (vec_info *vinfo,
              vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
              new_stmt = call;
            }
+          else if (loop_lens)
+            {
+              tree final_len
+                = vect_get_loop_len (loop_vinfo, loop_lens,
+                                     vec_num * ncopies, vec_num * j + i);
+              align = least_bit_hwi (misalign | align);
+              tree ptr = build_int_cst (ref_type, align);
+              machine_mode vmode = TYPE_MODE (vectype);
+              opt_machine_mode new_ovmode
+                = get_len_load_store_mode (vmode, false);
+              machine_mode new_vmode = new_ovmode.require ();
+              /* Need conversion if it's wrapped with VnQI.  */
+              if (vmode != new_vmode)
+                {
+                  tree new_vtype
+                    = build_vector_type_for_mode (unsigned_intQI_type_node,
+                                                  new_vmode);
+                  tree var
+                    = vect_get_new_ssa_name (new_vtype, vect_simple_var);
+                  vec_oprnd
+                    = build1 (VIEW_CONVERT_EXPR, new_vtype, vec_oprnd);
+                  gassign *new_stmt
+                    = gimple_build_assign (var, VIEW_CONVERT_EXPR,
+                                           vec_oprnd);
+                  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+                                               gsi);
+                  vec_oprnd = var;
+                }
+              gcall *call
+                = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
+                                              ptr, final_len, vec_oprnd);
+              gimple_call_set_nothrow (call, true);
+              vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+              new_stmt = call;
+            }
          else
            {
              data_ref = fold_build2 (MEM_REF, vectype,
@@ -8577,7 +8649,7 @@ vectorizable_load (vec_info *vinfo,
       unsigned HOST_WIDE_INT cst_offset = 0;
       tree dr_offset;
 
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
 
       if (grouped_load)
@@ -8859,6 +8931,14 @@ vectorizable_load (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
       ? &LOOP_VINFO_MASKS (loop_vinfo)
       : NULL);
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length-based approach if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
      realignment.  vect_supportable_dr_alignment always returns either
      dr_aligned or dr_unaligned_supported for masked operations.  */
@@ -9247,6 +9327,43 @@ vectorizable_load (vec_info *vinfo,
                  new_stmt = call;
                  data_ref = NULL_TREE;
                }
+              else if (loop_lens && memory_access_type != VMAT_INVARIANT)
+                {
+                  tree final_len
+                    = vect_get_loop_len (loop_vinfo, loop_lens,
+                                         vec_num * ncopies,
+                                         vec_num * j + i);
+                  align = least_bit_hwi (misalign | align);
+                  tree ptr = build_int_cst (ref_type, align);
+                  gcall *call
+                    = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+                                                  dataref_ptr, ptr,
+                                                  final_len);
+                  gimple_call_set_nothrow (call, true);
+                  new_stmt = call;
+                  data_ref = NULL_TREE;
+
+                  /* Need conversion if it's wrapped with VnQI.  */
+                  machine_mode vmode = TYPE_MODE (vectype);
+                  opt_machine_mode new_ovmode
+                    = get_len_load_store_mode (vmode, true);
+                  machine_mode new_vmode = new_ovmode.require ();
+                  if (vmode != new_vmode)
+                    {
+                      tree qi_type = unsigned_intQI_type_node;
+                      tree new_vtype
+                        = build_vector_type_for_mode (qi_type, new_vmode);
+                      tree var = vect_get_new_ssa_name (new_vtype,
+                                                        vect_simple_var);
+                      gimple_set_lhs (call, var);
+                      vect_finish_stmt_generation (vinfo, stmt_info, call,
+                                                   gsi);
+                      tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
+                      new_stmt
+                        = gimple_build_assign (vec_dest,
+                                               VIEW_CONVERT_EXPR, op);
+                    }
+                }
              else
                {
                  tree ltype = vectype;
@@ -11967,3 +12084,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
   *nunits_vectype_out = nunits_vectype;
   return opt_result::success ();
 }
+
+/* Generate and return statement sequence that sets vector length LEN that is:
+
+   min_of_start_and_end = min (START_INDEX, END_INDEX);
+   left_len = END_INDEX - min_of_start_and_end;
+   rhs = min (left_len, LEN_LIMIT);
+   LEN = rhs;  */
+
+gimple_seq
+vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
+{
+  gimple_seq stmts = NULL;
+  tree len_type = TREE_TYPE (len);
+  gcc_assert (TREE_TYPE (start_index) == len_type);
+
+  tree min = gimple_build (&stmts, MIN_EXPR, len_type, start_index, end_index);
+  tree left_len = gimple_build (&stmts, MINUS_EXPR, len_type, end_index, min);
+  tree rhs = gimple_build (&stmts, MIN_EXPR, len_type, left_len, len_limit);
+  gimple *stmt = gimple_build_assign (len, rhs);
+  gimple_seq_add_stmt (&stmts, stmt);
+
+  return stmts;
+}
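
To make the length computation at the end of the diff concrete, here is a
stand-alone C rendering of the sequence vect_gen_len builds, followed by a
worked evaluation; the function and variable names are illustrative, not
taken from the patch.

```c
/* Stand-alone rendering of the formula vect_gen_len emits above; names are
   illustrative.  START_INDEX is the first item covered by this iteration,
   END_INDEX is the total number of items, and LEN_LIMIT is the maximum
   length a single access may use (e.g. 16 bytes for lxvl/stxvl).  */
static inline unsigned long
partial_vector_len (unsigned long start_index, unsigned long end_index,
                    unsigned long len_limit)
{
  unsigned long min_of_start_and_end
    = start_index < end_index ? start_index : end_index;
  unsigned long left_len = end_index - min_of_start_and_end;
  return left_len < len_limit ? left_len : len_limit;
}

/* Worked example with end_index = 100 bytes and len_limit = 16:
   start_index = 80  -> 16 (one full vector),
   start_index = 96  -> 4  (the partial tail),
   start_index = 112 -> 0  (start is clamped to end, nothing left).  */
```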