diff options
Diffstat (limited to 'gcc/tree-vect-data-refs.cc')
-rw-r--r-- | gcc/tree-vect-data-refs.cc | 934 |
1 files changed, 66 insertions, 868 deletions
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index c84cd29..3bf2852 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -3956,8 +3956,7 @@ vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info) } tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); int misalignment; - if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists () - && ((misalignment = dr_misalignment (dr_info, vectype)), true) + if (((misalignment = dr_misalignment (dr_info, vectype)), true) && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment) == dr_explicit_realign_optimized)) { @@ -4539,10 +4538,14 @@ vect_describe_gather_scatter_call (stmt_vec_info stmt_info, info->ifn = gimple_call_internal_fn (call); info->decl = NULL_TREE; info->base = gimple_call_arg (call, 0); - info->offset = gimple_call_arg (call, 1); + info->alias_ptr = gimple_call_arg + (call, internal_fn_alias_ptr_index (info->ifn)); + info->offset = gimple_call_arg + (call, internal_fn_offset_index (info->ifn)); info->offset_dt = vect_unknown_def_type; info->offset_vectype = NULL_TREE; - info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + info->scale = TREE_INT_CST_LOW (gimple_call_arg + (call, internal_fn_scale_index (info->ifn))); info->element_type = TREE_TYPE (vectype); info->memory_type = TREE_TYPE (DR_REF (dr)); } @@ -4667,26 +4670,19 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (off == NULL_TREE) off = size_zero_node; - /* If base is not loop invariant, either off is 0, then we start with just - the constant offset in the loop invariant BASE and continue with base - as OFF, otherwise give up. - We could handle that case by gimplifying the addition of base + off - into some SSA_NAME and use that as off, but for now punt. */ + /* BASE must be loop invariant. If it is not invariant, but OFF is, then we + * can fix that by swapping BASE and OFF. */ if (!expr_invariant_in_loop_p (loop, base)) { - if (!integer_zerop (off)) + if (!expr_invariant_in_loop_p (loop, off)) return false; - off = base; - base = size_int (pbytepos); - } - /* Otherwise put base + constant offset into the loop invariant BASE - and continue with OFF. */ - else - { - base = fold_convert (sizetype, base); - base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); + + std::swap (base, off); } + base = fold_convert (sizetype, base); + base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); + /* OFF at this point may be either a SSA_NAME or some tree expression from get_inner_reference. Try to peel off loop invariants from it into BASE as long as possible. */ @@ -4864,9 +4860,17 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, offset_vectype = NULL_TREE; } + gcc_checking_assert (expr_invariant_in_loop_p (loop, base)); + gcc_checking_assert (!expr_invariant_in_loop_p (loop, off)); + info->ifn = ifn; info->decl = decl; info->base = base; + + info->alias_ptr = build_int_cst + (reference_alias_ptr_type (DR_REF (dr)), + get_object_alignment (DR_REF (dr))); + info->offset = off; info->offset_dt = vect_unknown_def_type; info->offset_vectype = offset_vectype; @@ -5056,7 +5060,7 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt, */ opt_result -vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) +vect_analyze_data_refs (vec_info *vinfo, bool *fatal) { class loop *loop = NULL; unsigned int i; @@ -5075,7 +5079,6 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) FOR_EACH_VEC_ELT (datarefs, i, dr) { enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; - poly_uint64 vf; gcc_assert (DR_REF (dr)); stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); @@ -5267,11 +5270,6 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) stmt_info->stmt, vectype); } - /* Adjust the minimal vectorization factor according to the - vector type. */ - vf = TYPE_VECTOR_SUBPARTS (vectype); - *min_vf = upper_bound (*min_vf, vf); - /* Leave the BB vectorizer to pick the vector type later, based on the final dataref group size and SLP node size. */ if (is_a <loop_vec_info> (vinfo)) @@ -5728,8 +5726,7 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, standard_iv_increment_position (loop, &incr_gsi, &insert_after); create_iv (aggr_ptr_init, PLUS_EXPR, - fold_convert (aggr_ptr_type, iv_step), - aggr_ptr, loop, &incr_gsi, insert_after, + iv_step, aggr_ptr, loop, &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); incr = gsi_stmt (incr_gsi); @@ -5757,7 +5754,7 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, { standard_iv_increment_position (containing_loop, &incr_gsi, &insert_after); - create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)), + create_iv (aptr, PLUS_EXPR, DR_STEP (dr), aggr_ptr, containing_loop, &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); incr = gsi_stmt (incr_gsi); @@ -6080,204 +6077,6 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, } -/* Function vect_permute_store_chain. - - Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate interleave_high/low stmts to reorder - the data correctly for the stores. Return the final references for stores - in RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to - each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 8 16 24 1 9 17 25 - 2nd vec: 2 10 18 26 3 11 19 27 - 3rd vec: 4 12 20 28 5 13 21 30 - 4th vec: 6 14 22 30 7 15 23 31 - - i.e., we interleave the contents of the four vectors in their order. - - We use interleave_high/low instructions to create such output. The input of - each interleave_high/low operation is two vectors: - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - the even elements of the result vector are obtained left-to-right from the - high/low elements of the first vector. The odd elements of the result are - obtained left-to-right from the high/low elements of the second vector. - The output of interleave_high will be: 0 4 1 5 - and of interleave_low: 2 6 3 7 - - - The permutation is done in log LENGTH stages. In each stage interleave_high - and interleave_low stmts are created for each pair of vectors in DR_CHAIN, - where the first argument is taken from the first half of DR_CHAIN and the - second argument from it's second half. - In our example, - - I1: interleave_high (1st vec, 3rd vec) - I2: interleave_low (1st vec, 3rd vec) - I3: interleave_high (2nd vec, 4th vec) - I4: interleave_low (2nd vec, 4th vec) - - The output for the first stage is: - - I1: 0 16 1 17 2 18 3 19 - I2: 4 20 5 21 6 22 7 23 - I3: 8 24 9 25 10 26 11 27 - I4: 12 28 13 29 14 30 15 31 - - The output of the second stage, i.e. the final result is: - - I1: 0 8 16 24 1 9 17 25 - I2: 2 10 18 26 3 11 19 27 - I3: 4 12 20 28 5 13 21 30 - I4: 6 14 22 30 7 15 23 31. */ - -void -vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect1, vect2, high, low; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - tree perm_mask_low, perm_mask_high; - tree data_ref; - tree perm3_mask_low, perm3_mask_high; - unsigned int i, j, n, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_store_supported ensures that this is constant. */ - unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int j0 = 0, j1 = 0, j2 = 0; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - vect1 = dr_chain[0]; - vect2 = dr_chain[1]; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {j, nelt, *, j + 1, nelt + j + 1, *, - j + 2, nelt + j + 2, *, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - vect1 = data_ref; - vect2 = dr_chain[2]; - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {0, 1, nelt + j, 3, 4, nelt + j + 1, - 6, 7, nelt + j + 2, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has 2 interleaved stepped patterns. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 2, 3); - sel.quick_grow (6); - for (i = 0; i < 3; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - vec_perm_indices indices (sel, 2, nelt); - perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 6; i++) - sel[i] += exact_div (nelt, 2); - indices.new_vector (sel, 2, nelt); - perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, n = log_length; i < n; i++) - { - for (j = 0; j < length/2; j++) - { - vect1 = dr_chain[j]; - vect2 = dr_chain[j+length/2]; - - /* Create interleaving stmt: - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, - ...}> */ - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); - perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, - vect2, perm_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j] = high; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, - ...}> */ - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); - perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, - vect2, perm_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j+1] = low; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using @@ -6708,633 +6507,6 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, return IFN_LAST; } -/* Function vect_permute_load_chain. - - Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate extract_even/odd stmts to reorder - the input data correctly. Return the final references for loads in - RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to each - element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 4 8 12 16 20 24 28 - 2nd vec: 1 5 9 13 17 21 25 29 - 3rd vec: 2 6 10 14 18 22 26 30 - 4th vec: 3 7 11 15 19 23 27 31 - - i.e., the first output vector should contain the first elements of each - interleaving group, etc. - - We use extract_even/odd instructions to create such output. The input of - each extract_even/odd operation is two vectors - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - - and the output is the vector of extracted even/odd elements. The output of - extract_even will be: 0 2 4 6 - and of extract_odd: 1 3 5 7 - - - The permutation is done in log LENGTH stages. In each stage extract_even - and extract_odd stmts are created for each pair of vectors in DR_CHAIN in - their order. In our example, - - E1: extract_even (1st vec, 2nd vec) - E2: extract_odd (1st vec, 2nd vec) - E3: extract_even (3rd vec, 4th vec) - E4: extract_odd (3rd vec, 4th vec) - - The output for the first stage will be: - - E1: 0 2 4 6 8 10 12 14 - E2: 1 3 5 7 9 11 13 15 - E3: 16 18 20 22 24 26 28 30 - E4: 17 19 21 23 25 27 29 31 - - In order to proceed and create the correct sequence for the next stage (or - for the correct output, if the second stage is the last one, as in our - example), we first put the output of extract_even operation and then the - output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). - The input for the second stage is: - - 1st vec (E1): 0 2 4 6 8 10 12 14 - 2nd vec (E3): 16 18 20 22 24 26 28 30 - 3rd vec (E2): 1 3 5 7 9 11 13 15 - 4th vec (E4): 17 19 21 23 25 27 29 31 - - The output of the second stage: - - E1: 0 4 8 12 16 20 24 28 - E2: 2 6 10 14 18 22 26 30 - E3: 1 5 9 13 17 21 25 29 - E4: 3 7 11 15 19 23 27 31 - - And RESULT_CHAIN after reordering: - - 1st vec (E1): 0 4 8 12 16 20 24 28 - 2nd vec (E3): 1 5 9 13 17 21 25 29 - 3rd vec (E2): 2 6 10 14 18 22 26 30 - 4th vec (E4): 3 7 11 15 19 23 27 31. */ - -static void -vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree data_ref, first_vect, second_vect; - tree perm_mask_even, perm_mask_odd; - tree perm3_mask_low, perm3_mask_high; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - unsigned int i, j, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_load_supported ensures that this is constant. */ - unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int k; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - first_vect = dr_chain[0]; - second_vect = dr_chain[1]; - - /* Create interleaving stmt (low part of): - low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, - ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - /* Create interleaving stmt (high part of): - high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, - ...}> */ - first_vect = data_ref; - second_vect = dr_chain[2]; - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[k] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has a single stepped pattern. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 1, 3); - sel.quick_grow (3); - for (i = 0; i < 3; ++i) - sel[i] = i * 2; - vec_perm_indices indices (sel, 2, nelt); - perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 3; ++i) - sel[i] = i * 2 + 1; - indices.new_vector (sel, 2, nelt); - perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; - - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_even); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_odd); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - -/* Function vect_shift_permute_load_chain. - - Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate - sequence of stmts to reorder the input data accordingly. - Return the final references for loads in RESULT_CHAIN. - Return true if successed, false otherwise. - - E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. - The input is 3 vectors each containing 8 elements. We assign a - number to each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - - The output sequence should be: - - 1st vec: 0 3 6 9 12 15 18 21 - 2nd vec: 1 4 7 10 13 16 19 22 - 3rd vec: 2 5 8 11 14 17 20 23 - - We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. - - First we shuffle all 3 vectors to get correct elements order: - - 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) - 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) - 3rd vec: (16 19 22) (17 20 23) (18 21) - - Next we unite and shift vector 3 times: - - 1st step: - shift right by 6 the concatenation of: - "1st vec" and "2nd vec" - ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) - "2nd vec" and "3rd vec" - ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) - "3rd vec" and "1st vec" - (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) - 2nd vec: (10 13) (16 19 22) (17 20 23) - 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) - - 2nd step: - shift right by 5 the concatenation of: - "1st vec" and "3rd vec" - ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) - "2nd vec" and "1st vec" - (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) - "3rd vec" and "2nd vec" - (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 9 12 15) (18 21) ( 0 3 6) - 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY - - 3rd step: - shift right by 5 the concatenation of: - "1st vec" and "1st vec" - ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) - shift right by 3 the concatenation of: - "2nd vec" and "2nd vec" - (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) - | New vectors | - - So that now all vectors are READY: - 1st vec: ( 0 3 6) ( 9 12 15) (18 21) - 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) - - This algorithm is faster than one in vect_permute_load_chain if: - 1. "shift of a concatination" is faster than general permutation. - This is usually so. - 2. The TARGET machine can't execute vector instructions in parallel. - This is because each step of the algorithm depends on previous. - The algorithm in vect_permute_load_chain is much more parallel. - - The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. -*/ - -static bool -vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; - tree perm2_mask1, perm2_mask2, perm3_mask; - tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; - gimple *perm_stmt; - - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - machine_mode vmode = TYPE_MODE (vectype); - unsigned int i; - loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - - unsigned HOST_WIDE_INT nelt, vf; - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) - || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) - /* Not supported for variable-length vectors. */ - return false; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (pow2p_hwi (length) && vf > 4) - { - unsigned int j, log_length = exact_log2 (length); - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2 + 1; - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2 + 1; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {4 5 6 7 8 9 10 11}. */ - for (i = 0; i < nelt; i++) - sel[i] = nelt / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to select vector from 2. - For vector length 8 it is {0 1 2 3 12 13 14 15}. */ - for (i = 0; i < nelt / 2; i++) - sel[i] = i; - for (i = nelt / 2; i < nelt; i++) - sel[i] = nelt + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "select is not supported by target\n"); - return false; - } - select_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j + 1]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, first_vect, - perm2_mask1); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[0] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - second_vect, second_vect, - perm2_mask2); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[1] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2 + length/2] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], select_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - return true; - } - if (length == 3 && vf > 2) - { - unsigned int k = 0, l = 0; - - /* Generating permutation constant to get all elements in rigth order. - For vector length 8 it is {0 3 6 1 4 7 2 5}. */ - for (i = 0; i < nelt; i++) - { - if (3 * k + (l % 3) >= nelt) - { - k = 0; - l += (3 - (nelt % 3)); - } - sel[i] = 3 * k + (l % 3); - k++; - } - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 fields structure is not \ - supported by target\n"); - return false; - } - perm3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {6 7 8 9 10 11 12 13}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + 1 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift2_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {3 4 5 6 7 8 9 10}. */ - for (i = 0; i < nelt; i++) - sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift4_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - dr_chain[k], dr_chain[k], - perm3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[k % 3], vect[(k + 1) % 3], - shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect_shift[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect_shift[(4 - k) % 3], - vect_shift[(3 - k) % 3], - shift2_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - (*result_chain)[3 - (nelt % 3)] = vect[2]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], - vect[0], shift3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[nelt % 3] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], - vect[1], shift4_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[0] = data_ref; - return true; - } - return false; -} - -/* Function vect_transform_grouped_load. - - Given a chain of input interleaved data-refs (in DR_CHAIN), build statements - to perform their permutation and ascribe the result vectorized statements to - the scalar statements. -*/ - -void -vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, - vec<tree> dr_chain, - int size, gimple_stmt_iterator *gsi) -{ - machine_mode mode; - vec<tree> result_chain = vNULL; - - /* DR_CHAIN contains input data-refs that are a part of the interleaving. - RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted - vectors, that are ready for vector computation. */ - result_chain.create (size); - - /* If reassociation width for vector type is 2 or greater target machine can - execute 2 or more vector instructions in parallel. Otherwise try to - get chain for loads group using vect_shift_permute_load_chain. */ - mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); - if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 - || pow2p_hwi (size) - || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info, - gsi, &result_chain)) - vect_permute_load_chain (vinfo, dr_chain, - size, stmt_info, gsi, &result_chain); - vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); - result_chain.release (); -} - -/* RESULT_CHAIN contains the output of a group of grouped loads that were - generated as part of the vectorization of STMT_INFO. Assign the statement - for each vector to the associated scalar statement. */ - -void -vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info, - vec<tree> result_chain) -{ - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - unsigned int i, gap_count; - tree tmp_data_ref; - - /* Put a permuted data-ref in the VECTORIZED_STMT field. - Since we scan the chain starting from it's first node, their order - corresponds the order of data-refs in RESULT_CHAIN. */ - stmt_vec_info next_stmt_info = first_stmt_info; - gap_count = 1; - FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) - { - if (!next_stmt_info) - break; - - /* Skip the gaps. Loads created for the gaps will be removed by dead - code elimination pass later. No need to check for the first stmt in - the group, since it always exists. - DR_GROUP_GAP is the number of steps in elements from the previous - access (if there is no gap DR_GROUP_GAP is 1). We skip loads that - correspond to the gaps. */ - if (next_stmt_info != first_stmt_info - && gap_count < DR_GROUP_GAP (next_stmt_info)) - { - gap_count++; - continue; - } - - /* ??? The following needs cleanup after the removal of - DR_GROUP_SAME_DR_STMT. */ - if (next_stmt_info) - { - gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); - /* We assume that if VEC_STMT is not NULL, this is a case of multiple - copies, and we put the new vector statement last. */ - STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt); - - next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); - gap_count = 1; - } - } -} - /* Function vect_force_dr_alignment_p. Returns whether the alignment of a DECL can be forced to be aligned @@ -7362,13 +6534,14 @@ vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) alignment. If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even it is aligned, i.e., check if it is possible to vectorize it with different - alignment. */ + alignment. If GS_INFO is passed we are dealing with a gather/scatter. */ enum dr_alignment_support vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, - tree vectype, int misalignment) + tree vectype, int misalignment, + gather_scatter_info *gs_info) { - data_reference *dr = dr_info->dr; + data_reference *dr = dr_info ? dr_info->dr : nullptr; stmt_vec_info stmt_info = dr_info->stmt; machine_mode mode = TYPE_MODE (vectype); loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); @@ -7380,14 +6553,6 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, else if (dr_safe_speculative_read_required (stmt_info)) return dr_unaligned_unsupported; - /* For now assume all conditional loads/stores support unaligned - access without any special code. */ - if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) - if (gimple_call_internal_p (stmt) - && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD - || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) - return dr_unaligned_supported; - if (loop_vinfo) { vect_loop = LOOP_VINFO_LOOP (loop_vinfo); @@ -7457,7 +6622,7 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, } } */ - if (DR_IS_READ (dr)) + if (dr && DR_IS_READ (dr)) { if (can_implement_p (vec_realign_load_optab, mode) && (!targetm.vectorize.builtin_mask_for_load @@ -7485,10 +6650,43 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, bool is_packed = false; tree type = TREE_TYPE (DR_REF (dr)); + bool is_gather_scatter = gs_info != nullptr; if (misalignment == DR_MISALIGNMENT_UNKNOWN) - is_packed = not_size_aligned (DR_REF (dr)); + { + if (!is_gather_scatter || dr != nullptr) + is_packed = not_size_aligned (DR_REF (dr)); + else + { + /* Gather-scatter accesses normally perform only component accesses + so alignment is irrelevant for them. Targets like riscv do care + about scalar alignment in vector accesses, though, so check scalar + alignment here. We determined the alias pointer as well as the + base alignment during pattern recognition and can re-use it here. + + As we do not have an analyzed dataref we only know the alignment + of the reference itself and nothing about init, steps, etc. + For now don't try harder to determine misalignment and + just assume it is unknown. We consider the type packed if its + scalar alignment is lower than the natural alignment of a vector + element's type. */ + + gcc_assert (!GATHER_SCATTER_LEGACY_P (*gs_info)); + gcc_assert (dr == nullptr); + + tree inner_vectype = TREE_TYPE (vectype); + + unsigned HOST_WIDE_INT scalar_align + = tree_to_uhwi (gs_info->alias_ptr); + unsigned HOST_WIDE_INT inner_vectype_sz + = tree_to_uhwi (TYPE_SIZE (inner_vectype)); + + bool is_misaligned = scalar_align < inner_vectype_sz; + is_packed = scalar_align > 1 && is_misaligned; + } + } if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment, - is_packed)) + is_packed, + is_gather_scatter)) return dr_unaligned_supported; /* Unsupported. */ |