Diffstat (limited to 'gcc/tree-vect-data-refs.cc')
-rw-r--r-- | gcc/tree-vect-data-refs.cc | 972 |
1 file changed, 84 insertions, 888 deletions
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index ee040eb..da700cd 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -1501,10 +1501,17 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info, /* We can only use base and misalignment information relative to an innermost loop if the misalignment stays the same throughout the execution of the loop. As above, this is the case if the stride of - the dataref evenly divides by the alignment. */ - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - step_preserves_misalignment_p - = multiple_p (drb->step_alignment * vf, vect_align_c); + the dataref evenly divides by the alignment. Make sure to check + previous epilogues and the main loop. */ + step_preserves_misalignment_p = true; + auto lvinfo = loop_vinfo; + while (lvinfo) + { + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (lvinfo); + step_preserves_misalignment_p + &= multiple_p (drb->step_alignment * vf, vect_align_c); + lvinfo = LOOP_VINFO_ORIG_LOOP_INFO (lvinfo); + } if (!step_preserves_misalignment_p && dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1571,6 +1578,7 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info, unsigned int max_alignment; tree base = get_base_for_alignment (drb->base_address, &max_alignment); if (max_alignment < vect_align_c + || (loop_vinfo && LOOP_VINFO_EPILOGUE_P (loop_vinfo)) || !vect_can_force_dr_alignment_p (base, vect_align_c * BITS_PER_UNIT)) { @@ -2910,12 +2918,14 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 2) there is at least one unsupported misaligned data ref with an unknown misalignment, and 3) all misaligned data refs with a known misalignment are supported, and - 4) the number of runtime alignment checks is within reason. */ + 4) the number of runtime alignment checks is within reason, and + 5) the vectorization factor is a constant. */ do_versioning = (optimize_loop_nest_for_speed_p (loop) && !loop->inner /* FORNOW */ - && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP); + && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP) + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (); if (do_versioning) { @@ -2956,17 +2966,6 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) break; } - /* At present we don't support versioning for alignment - with variable VF, since there's no guarantee that the - VF is a power of two. We could relax this if we added - a way of enforcing a power-of-two size. */ - unsigned HOST_WIDE_INT size; - if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) - { - do_versioning = false; - break; - } - /* Forcing alignment in the first iteration is no good if we don't keep it across iterations. For now, just disable versioning in this case. @@ -2985,7 +2984,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) Construct the mask needed for this test. For example, GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the mask must be 15 = 0xf. */ - int mask = size - 1; + gcc_assert (DR_TARGET_ALIGNMENT (dr_info).is_constant ()); + int mask = DR_TARGET_ALIGNMENT (dr_info).to_constant () - 1; /* FORNOW: use the same mask to test all potentially unaligned references in the loop. */
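To make the mask test above concrete, here is a minimal standalone sketch (illustrative C++ only, not GCC code; passes_alignment_check and target_align are invented names, with target_align standing in for the now-constant DR_TARGET_ALIGNMENT value):

#include <cstdint>

/* An address passes the versioned loop's entry test iff its low bits,
   selected by MASK = TARGET_ALIGN - 1, are all zero.  For a 16-byte
   target alignment the mask is 15 = 0xf, as in the comment above.  */
static inline bool
passes_alignment_check (const void *addr, uintptr_t target_align)
{
  uintptr_t mask = target_align - 1;  /* target_align assumed a power of two */
  return (reinterpret_cast<uintptr_t> (addr) & mask) == 0;
}

Requirement 5) presumably exists to keep this mask a compile-time constant: with a variable vectorization factor the target alignment need not be a known power of two, which is what the new gcc_assert double-checks.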
@@ -3948,8 +3948,7 @@ vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info) } tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); int misalignment; - if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists () - && ((misalignment = dr_misalignment (dr_info, vectype)), true) + if (((misalignment = dr_misalignment (dr_info, vectype)), true) && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment) == dr_explicit_realign_optimized)) { @@ -4531,10 +4530,13 @@ vect_describe_gather_scatter_call (stmt_vec_info stmt_info, info->ifn = gimple_call_internal_fn (call); info->decl = NULL_TREE; info->base = gimple_call_arg (call, 0); - info->offset = gimple_call_arg (call, 1); - info->offset_dt = vect_unknown_def_type; + info->alias_ptr = gimple_call_arg + (call, internal_fn_alias_ptr_index (info->ifn)); + info->offset = gimple_call_arg + (call, internal_fn_offset_index (info->ifn)); info->offset_vectype = NULL_TREE; - info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + info->scale = TREE_INT_CST_LOW (gimple_call_arg + (call, internal_fn_scale_index (info->ifn))); info->element_type = TREE_TYPE (vectype); info->memory_type = TREE_TYPE (DR_REF (dr)); } @@ -4659,26 +4661,19 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (off == NULL_TREE) off = size_zero_node; - /* If base is not loop invariant, either off is 0, then we start with just - the constant offset in the loop invariant BASE and continue with base - as OFF, otherwise give up. - We could handle that case by gimplifying the addition of base + off - into some SSA_NAME and use that as off, but for now punt. */ + /* BASE must be loop invariant. If it is not invariant, but OFF is, then we + can fix that by swapping BASE and OFF. */ if (!expr_invariant_in_loop_p (loop, base)) { - if (!integer_zerop (off)) + if (!expr_invariant_in_loop_p (loop, off)) return false; - off = base; - base = size_int (pbytepos); - } - /* Otherwise put base + constant offset into the loop invariant BASE - and continue with OFF. */ - else - { - base = fold_convert (sizetype, base); - base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); + + std::swap (base, off); } + base = fold_convert (sizetype, base); + base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); + /* OFF at this point may be either a SSA_NAME or some tree expression from get_inner_reference. Try to peel off loop invariants from it into BASE as long as possible. */
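As a toy model of the BASE/OFF split this hunk canonicalizes (plain C++, not vectorizer internals): for a gather such as a[b[i]], BASE is the loop-invariant array address and OFF the per-iteration scaled index, and because the address is a plain sum the two operands can simply be swapped when the analysis produced them the other way around.

#include <cstdint>
#include <cstdio>

int
main ()
{
  int a[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  int b[4] = {3, 0, 2, 1};
  for (int i = 0; i < 4; i++)
    {
      uintptr_t base = (uintptr_t) a;                   /* loop invariant */
      uintptr_t off = (uintptr_t) b[i] * sizeof (int);  /* varies per lane */
      /* base + off == off + base, so which addend carries the invariant
         part is pure bookkeeping; the swap restores the canonical form.  */
      printf ("%d\n", *(int *) (base + off));           /* a[b[i]]: 13 10 12 11 */
    }
  return 0;
}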
@@ -4856,11 +4851,18 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, offset_vectype = NULL_TREE; } + gcc_checking_assert (expr_invariant_in_loop_p (loop, base)); + gcc_checking_assert (!expr_invariant_in_loop_p (loop, off)); + info->ifn = ifn; info->decl = decl; info->base = base; + + info->alias_ptr = build_int_cst + (reference_alias_ptr_type (DR_REF (dr)), + get_object_alignment (DR_REF (dr))); + info->offset = off; - info->offset_dt = vect_unknown_def_type; info->offset_vectype = offset_vectype; info->scale = scale; info->element_type = TREE_TYPE (vectype); @@ -5048,7 +5050,7 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt, */ opt_result -vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) +vect_analyze_data_refs (vec_info *vinfo, bool *fatal) { class loop *loop = NULL; unsigned int i; @@ -5067,7 +5069,6 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) FOR_EACH_VEC_ELT (datarefs, i, dr) { enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; - poly_uint64 vf; gcc_assert (DR_REF (dr)); stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); @@ -5259,11 +5260,6 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) stmt_info->stmt, vectype); } - /* Adjust the minimal vectorization factor according to the - vector type. */ - vf = TYPE_VECTOR_SUBPARTS (vectype); - *min_vf = upper_bound (*min_vf, vf); - /* Leave the BB vectorizer to pick the vector type later, based on the final dataref group size and SLP node size. */ if (is_a <loop_vec_info> (vinfo)) @@ -5720,8 +5716,7 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, standard_iv_increment_position (loop, &incr_gsi, &insert_after); create_iv (aggr_ptr_init, PLUS_EXPR, - fold_convert (aggr_ptr_type, iv_step), - aggr_ptr, loop, &incr_gsi, insert_after, + iv_step, aggr_ptr, loop, &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); incr = gsi_stmt (incr_gsi); @@ -5749,7 +5744,7 @@ vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, { standard_iv_increment_position (containing_loop, &incr_gsi, &insert_after); - create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)), + create_iv (aptr, PLUS_EXPR, DR_STEP (dr), aggr_ptr, containing_loop, &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); incr = gsi_stmt (incr_gsi); @@ -6072,204 +6067,6 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, } -/* Function vect_permute_store_chain. - - Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate interleave_high/low stmts to reorder - the data correctly for the stores. Return the final references for stores - in RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to - each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 8 16 24 1 9 17 25 - 2nd vec: 2 10 18 26 3 11 19 27 - 3rd vec: 4 12 20 28 5 13 21 29 - 4th vec: 6 14 22 30 7 15 23 31 - - i.e., we interleave the contents of the four vectors in their order.
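The interleaving scheme documented above (and walked through stage by stage below) can be reproduced with a small scalar model; this is plain C++ with no vectorizer data structures, and interleave, chain, and Vec are invented names:

#include <array>
#include <cstdio>
#include <vector>

using Vec = std::array<int, 8>;

/* interleave_high pairs up the low halves of A and B element by element,
   interleave_low the high halves; compare the 4-element illustration
   below (high: 0 4 1 5, low: 2 6 3 7).  */
static Vec
interleave (const Vec &a, const Vec &b, bool high)
{
  Vec r{};
  int from = high ? 0 : 4;
  for (int i = 0; i < 4; i++)
    {
      r[2 * i] = a[from + i];
      r[2 * i + 1] = b[from + i];
    }
  return r;
}

int
main ()
{
  std::vector<Vec> chain (4);
  for (int v = 0; v < 4; v++)
    for (int i = 0; i < 8; i++)
      chain[v][i] = v * 8 + i;    /* the four input vectors, elements 0..31 */

  /* log2 (4) = 2 stages; each stage pairs chain[j] with chain[j + 2],
     exactly the I1..I4 steps described below.  */
  for (int stage = 0; stage < 2; stage++)
    {
      std::vector<Vec> next (4);
      for (int j = 0; j < 2; j++)
        {
          next[2 * j] = interleave (chain[j], chain[j + 2], true);
          next[2 * j + 1] = interleave (chain[j], chain[j + 2], false);
        }
      chain = next;
    }

  for (const Vec &v : chain)    /* prints 0 8 16 24 1 9 17 25, etc. */
    {
      for (int x : v)
        printf ("%d ", x);
      printf ("\n");
    }
  return 0;
}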
- We use interleave_high/low instructions to create such output. The input of - each interleave_high/low operation is two vectors: - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - the even elements of the result vector are obtained left-to-right from the - high/low elements of the first vector. The odd elements of the result are - obtained left-to-right from the high/low elements of the second vector. - The output of interleave_high will be: 0 4 1 5 - and of interleave_low: 2 6 3 7 - - - The permutation is done in log LENGTH stages. In each stage interleave_high - and interleave_low stmts are created for each pair of vectors in DR_CHAIN, - where the first argument is taken from the first half of DR_CHAIN and the - second argument from its second half. - In our example, - - I1: interleave_high (1st vec, 3rd vec) - I2: interleave_low (1st vec, 3rd vec) - I3: interleave_high (2nd vec, 4th vec) - I4: interleave_low (2nd vec, 4th vec) - - The output for the first stage is: - - I1: 0 16 1 17 2 18 3 19 - I2: 4 20 5 21 6 22 7 23 - I3: 8 24 9 25 10 26 11 27 - I4: 12 28 13 29 14 30 15 31 - - The output of the second stage, i.e. the final result is: - - I1: 0 8 16 24 1 9 17 25 - I2: 2 10 18 26 3 11 19 27 - I3: 4 12 20 28 5 13 21 29 - I4: 6 14 22 30 7 15 23 31. */ - -void -vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect1, vect2, high, low; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - tree perm_mask_low, perm_mask_high; - tree data_ref; - tree perm3_mask_low, perm3_mask_high; - unsigned int i, j, n, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_store_supported ensures that this is constant.
*/ - unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int j0 = 0, j1 = 0, j2 = 0; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - vect1 = dr_chain[0]; - vect2 = dr_chain[1]; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {j, nelt, *, j + 1, nelt + j + 1, *, - j + 2, nelt + j + 2, *, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - vect1 = data_ref; - vect2 = dr_chain[2]; - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {0, 1, nelt + j, 3, 4, nelt + j + 1, - 6, 7, nelt + j + 2, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has 2 interleaved stepped patterns. 
*/ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 2, 3); - sel.quick_grow (6); - for (i = 0; i < 3; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - vec_perm_indices indices (sel, 2, nelt); - perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 6; i++) - sel[i] += exact_div (nelt, 2); - indices.new_vector (sel, 2, nelt); - perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, n = log_length; i < n; i++) - { - for (j = 0; j < length/2; j++) - { - vect1 = dr_chain[j]; - vect2 = dr_chain[j+length/2]; - - /* Create interleaving stmt: - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, - ...}> */ - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); - perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, - vect2, perm_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j] = high; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, - ...}> */ - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); - perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, - vect2, perm_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j+1] = low; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using @@ -6700,633 +6497,6 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, return IFN_LAST; } -/* Function vect_permute_load_chain. - - Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate extract_even/odd stmts to reorder - the input data correctly. Return the final references for loads in - RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to each - element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 4 8 12 16 20 24 28 - 2nd vec: 1 5 9 13 17 21 25 29 - 3rd vec: 2 6 10 14 18 22 26 30 - 4th vec: 3 7 11 15 19 23 27 31 - - i.e., the first output vector should contain the first elements of each - interleaving group, etc. - - We use extract_even/odd instructions to create such output. The input of - each extract_even/odd operation is two vectors - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - - and the output is the vector of extracted even/odd elements. The output of - extract_even will be: 0 2 4 6 - and of extract_odd: 1 3 5 7 - - - The permutation is done in log LENGTH stages. In each stage extract_even - and extract_odd stmts are created for each pair of vectors in DR_CHAIN in - their order. 
In our example, - - E1: extract_even (1st vec, 2nd vec) - E2: extract_odd (1st vec, 2nd vec) - E3: extract_even (3rd vec, 4th vec) - E4: extract_odd (3rd vec, 4th vec) - - The output for the first stage will be: - - E1: 0 2 4 6 8 10 12 14 - E2: 1 3 5 7 9 11 13 15 - E3: 16 18 20 22 24 26 28 30 - E4: 17 19 21 23 25 27 29 31 - - In order to proceed and create the correct sequence for the next stage (or - for the correct output, if the second stage is the last one, as in our - example), we first put the output of extract_even operation and then the - output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). - The input for the second stage is: - - 1st vec (E1): 0 2 4 6 8 10 12 14 - 2nd vec (E3): 16 18 20 22 24 26 28 30 - 3rd vec (E2): 1 3 5 7 9 11 13 15 - 4th vec (E4): 17 19 21 23 25 27 29 31 - - The output of the second stage: - - E1: 0 4 8 12 16 20 24 28 - E2: 2 6 10 14 18 22 26 30 - E3: 1 5 9 13 17 21 25 29 - E4: 3 7 11 15 19 23 27 31 - - And RESULT_CHAIN after reordering: - - 1st vec (E1): 0 4 8 12 16 20 24 28 - 2nd vec (E3): 1 5 9 13 17 21 25 29 - 3rd vec (E2): 2 6 10 14 18 22 26 30 - 4th vec (E4): 3 7 11 15 19 23 27 31. */ - -static void -vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree data_ref, first_vect, second_vect; - tree perm_mask_even, perm_mask_odd; - tree perm3_mask_low, perm3_mask_high; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - unsigned int i, j, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_load_supported ensures that this is constant. */ - unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int k; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - first_vect = dr_chain[0]; - second_vect = dr_chain[1]; - - /* Create interleaving stmt (low part of): - low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k, - ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - /* Create interleaving stmt (high part of): - high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k, - ...}> */ - first_vect = data_ref; - second_vect = dr_chain[2]; - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[k] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has a single stepped pattern.
*/ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 1, 3); - sel.quick_grow (3); - for (i = 0; i < 3; ++i) - sel[i] = i * 2; - vec_perm_indices indices (sel, 2, nelt); - perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 3; ++i) - sel[i] = i * 2 + 1; - indices.new_vector (sel, 2, nelt); - perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; - - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_even); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_odd); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - -/* Function vect_shift_permute_load_chain. - - Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a - sequence of stmts to reorder the input data accordingly. - Return the final references for loads in RESULT_CHAIN. - Return true if successful, false otherwise. - - E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. - The input is 3 vectors each containing 8 elements. We assign a - number to each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - - The output sequence should be: - - 1st vec: 0 3 6 9 12 15 18 21 - 2nd vec: 1 4 7 10 13 16 19 22 - 3rd vec: 2 5 8 11 14 17 20 23 - - We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
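Before the step-by-step shift walkthrough that follows, the intended net effect, a stride-3 de-interleave of the 24 input elements, can be sanity-checked with a scalar model (plain C++, illustrative only):

#include <cstdio>

int
main ()
{
  int in[24], out[24];
  for (int i = 0; i < 24; i++)
    in[i] = i;

  /* Output vector k gathers every third element starting at k, which
     yields exactly the three output vectors quoted above.  */
  for (int k = 0; k < 3; k++)
    for (int j = 0; j < 8; j++)
      out[k * 8 + j] = in[3 * j + k];

  for (int i = 0; i < 24; i++)  /* 0 3 6 ... / 1 4 7 ... / 2 5 8 ...  */
    printf ("%d%c", out[i], i % 8 == 7 ? '\n' : ' ');
  return 0;
}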
- First we shuffle all 3 vectors to get the correct element order: - - 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) - 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) - 3rd vec: (16 19 22) (17 20 23) (18 21) - - Next we unite and shift the vectors 3 times: - - 1st step: - shift right by 6 the concatenation of: - "1st vec" and "2nd vec" - ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) - "2nd vec" and "3rd vec" - ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) - "3rd vec" and "1st vec" - (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) - 2nd vec: (10 13) (16 19 22) (17 20 23) - 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) - - 2nd step: - shift right by 5 the concatenation of: - "1st vec" and "3rd vec" - ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) - "2nd vec" and "1st vec" - (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) - "3rd vec" and "2nd vec" - (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 9 12 15) (18 21) ( 0 3 6) - 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY - - 3rd step: - shift right by 5 the concatenation of: - "1st vec" and "1st vec" - ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) - shift right by 3 the concatenation of: - "2nd vec" and "2nd vec" - (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) - | New vectors | - - So that now all vectors are READY: - 1st vec: ( 0 3 6) ( 9 12 15) (18 21) - 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) - - This algorithm is faster than the one in vect_permute_load_chain if: - 1. "shift of a concatenation" is faster than general permutation. - This is usually so. - 2. The TARGET machine can't execute vector instructions in parallel. - This is because each step of the algorithm depends on the previous one. - The algorithm in vect_permute_load_chain is much more parallel. - - The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. -*/ - -static bool -vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; - tree perm2_mask1, perm2_mask2, perm3_mask; - tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; - gimple *perm_stmt; - - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - machine_mode vmode = TYPE_MODE (vectype); - unsigned int i; - loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - - unsigned HOST_WIDE_INT nelt, vf; - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) - || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) - /* Not supported for variable-length vectors.
*/ - return false; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (pow2p_hwi (length) && vf > 4) - { - unsigned int j, log_length = exact_log2 (length); - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2 + 1; - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2 + 1; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {4 5 6 7 8 9 10 11}. */ - for (i = 0; i < nelt; i++) - sel[i] = nelt / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to select vector from 2. - For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ - for (i = 0; i < nelt / 2; i++) - sel[i] = i; - for (i = nelt / 2; i < nelt; i++) - sel[i] = nelt + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "select is not supported by target\n"); - return false; - } - select_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j + 1]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, first_vect, - perm2_mask1); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[0] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - second_vect, second_vect, - perm2_mask2); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[1] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2 + length/2] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], select_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - return true; - } - if (length == 3 && vf > 2) - { - unsigned int k = 0, l = 0; - - /* Generating permutation constant to get all elements in right order. - For vector length 8 it is {0 3 6 1 4 7 2 5}. */ - for (i = 0; i < nelt; i++) - { - if (3 * k + (l % 3) >= nelt) - { - k = 0; - l += (3 - (nelt % 3)); - } - sel[i] = 3 * k + (l % 3); - k++; - } - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 fields structure is not \ supported by target\n"); - return false; - } - perm3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {6 7 8 9 10 11 12 13}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + 1 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift2_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {3 4 5 6 7 8 9 10}.
*/ - for (i = 0; i < nelt; i++) - sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift4_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - dr_chain[k], dr_chain[k], - perm3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[k % 3], vect[(k + 1) % 3], - shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect_shift[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect_shift[(4 - k) % 3], - vect_shift[(3 - k) % 3], - shift2_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - (*result_chain)[3 - (nelt % 3)] = vect[2]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], - vect[0], shift3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[nelt % 3] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], - vect[1], shift4_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[0] = data_ref; - return true; - } - return false; -} - -/* Function vect_transform_grouped_load. - - Given a chain of input interleaved data-refs (in DR_CHAIN), build statements - to perform their permutation and ascribe the result vectorized statements to - the scalar statements. -*/ - -void -vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, - vec<tree> dr_chain, - int size, gimple_stmt_iterator *gsi) -{ - machine_mode mode; - vec<tree> result_chain = vNULL; - - /* DR_CHAIN contains input data-refs that are a part of the interleaving. - RESULT_CHAIN is the output of vect_permute_load_chain; it contains the - permuted vectors that are ready for vector computation. */ - result_chain.create (size); - - /* If the reassociation width for the vector type is 2 or greater, the - target machine can execute 2 or more vector instructions in parallel. - Otherwise try to get the chain for the load group using - vect_shift_permute_load_chain.
*/ - mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); - if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 - || pow2p_hwi (size) - || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info, - gsi, &result_chain)) - vect_permute_load_chain (vinfo, dr_chain, - size, stmt_info, gsi, &result_chain); - vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); - result_chain.release (); -} - -/* RESULT_CHAIN contains the output of a group of grouped loads that were - generated as part of the vectorization of STMT_INFO. Assign the statement - for each vector to the associated scalar statement. */ - -void -vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info, - vec<tree> result_chain) -{ - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - unsigned int i, gap_count; - tree tmp_data_ref; - - /* Put a permuted data-ref in the VECTORIZED_STMT field. - Since we scan the chain starting from its first node, their order - corresponds to the order of data-refs in RESULT_CHAIN. */ - stmt_vec_info next_stmt_info = first_stmt_info; - gap_count = 1; - FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) - { - if (!next_stmt_info) - break; - - /* Skip the gaps. Loads created for the gaps will be removed by the dead - code elimination pass later. No need to check for the first stmt in - the group, since it always exists. - DR_GROUP_GAP is the number of steps in elements from the previous - access (if there is no gap DR_GROUP_GAP is 1). We skip loads that - correspond to the gaps. */ - if (next_stmt_info != first_stmt_info - && gap_count < DR_GROUP_GAP (next_stmt_info)) - { - gap_count++; - continue; - } - - /* ??? The following needs cleanup after the removal of - DR_GROUP_SAME_DR_STMT. */ - if (next_stmt_info) - { - gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); - /* We assume that if VEC_STMT is not NULL, this is a case of multiple - copies, and we put the new vector statement last. */ - STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt); - - next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); - gap_count = 1; - } - } -} - /* Function vect_force_dr_alignment_p. Returns whether the alignment of a DECL can be forced to be aligned @@ -7354,13 +6524,14 @@ vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) alignment. If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if it is aligned, i.e., check if it is possible to vectorize it with different - alignment. */ + alignment. If GS_INFO is passed we are dealing with a gather/scatter. */ enum dr_alignment_support vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, - tree vectype, int misalignment) + tree vectype, int misalignment, + gather_scatter_info *gs_info) { - data_reference *dr = dr_info->dr; + data_reference *dr = dr_info ? dr_info->dr : nullptr; stmt_vec_info stmt_info = dr_info->stmt; machine_mode mode = TYPE_MODE (vectype); loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); @@ -7372,14 +6543,6 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, else if (dr_safe_speculative_read_required (stmt_info)) return dr_unaligned_unsupported; - /* For now assume all conditional loads/stores support unaligned - access without any special code.
*/ - if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) - if (gimple_call_internal_p (stmt) - && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD - || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) - return dr_unaligned_supported; - if (loop_vinfo) { vect_loop = LOOP_VINFO_LOOP (loop_vinfo); @@ -7449,7 +6612,7 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, } } */ - if (DR_IS_READ (dr)) + if (dr && DR_IS_READ (dr)) { if (can_implement_p (vec_realign_load_optab, mode) && (!targetm.vectorize.builtin_mask_for_load || targetm.vectorize.builtin_mask_for_load ())) { @@ -7477,10 +6640,43 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, bool is_packed = false; tree type = TREE_TYPE (DR_REF (dr)); + bool is_gather_scatter = gs_info != nullptr; if (misalignment == DR_MISALIGNMENT_UNKNOWN) - is_packed = not_size_aligned (DR_REF (dr)); + { + if (!is_gather_scatter || dr != nullptr) + is_packed = not_size_aligned (DR_REF (dr)); + else + { + /* Gather-scatter accesses normally perform only component accesses, + so alignment is irrelevant for them. Targets like riscv do care + about scalar alignment in vector accesses, though, so check scalar + alignment here. We determined the alias pointer as well as the + base alignment during pattern recognition and can re-use it here. + + As we do not have an analyzed dataref we only know the alignment + of the reference itself and nothing about init, steps, etc. + For now don't try harder to determine misalignment and + just assume it is unknown. We consider the type packed if its + scalar alignment is lower than the natural alignment of a vector + element's type. */ + + gcc_assert (!GATHER_SCATTER_LEGACY_P (*gs_info)); + gcc_assert (dr == nullptr); + + tree inner_vectype = TREE_TYPE (vectype); + + unsigned HOST_WIDE_INT scalar_align + = tree_to_uhwi (gs_info->alias_ptr); + unsigned HOST_WIDE_INT inner_vectype_sz + = tree_to_uhwi (TYPE_SIZE (inner_vectype)); + + bool is_misaligned = scalar_align < inner_vectype_sz; + is_packed = scalar_align > 1 && is_misaligned; + } + } if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment, - is_packed)) + is_packed, + is_gather_scatter)) return dr_unaligned_supported; /* Unsupported. */
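To make the new is_packed derivation concrete, here is a simplified standalone sketch (names invented here; in the patch the two inputs come from gs_info->alias_ptr, which carries the alignment that pattern recognition computed via get_object_alignment, and from TYPE_SIZE of the vector element type, both in bits):

#include <cstdio>

/* A gather/scatter reference counts as packed only when it has some
   known alignment (> 1) that is still below the element's natural
   size; with no usable alignment information it stays "unknown"
   rather than packed.  */
static bool
gather_scatter_is_packed (unsigned scalar_align_bits, unsigned elem_size_bits)
{
  bool is_misaligned = scalar_align_bits < elem_size_bits;
  return scalar_align_bits > 1 && is_misaligned;
}

int
main ()
{
  printf ("%d\n", gather_scatter_is_packed (8, 32));   /* 1: byte-aligned int */
  printf ("%d\n", gather_scatter_is_packed (32, 32));  /* 0: naturally aligned */
  printf ("%d\n", gather_scatter_is_packed (1, 32));   /* 0: nothing known */
  return 0;
}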