Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c |  16
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-46.c         |   2
-rw-r--r--  gcc/tree-vect-slp.cc                       |  51
-rw-r--r--  gcc/tree-vect-stmts.cc                     | 128
4 files changed, 127 insertions, 70 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
index ee12136..8cefa7f 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
@@ -24,11 +24,17 @@ void rephase (void)
   struct site *s;
   for(i=0,s=lattice;i<sites_on_node;i++,s++)
     for(dir=0;dir<32;dir++)
-      for(j=0;j<3;j++)for(k=0;k<3;k++)
-        {
-          s->link[dir].e[j][k].real *= s->phase[dir];
-          s->link[dir].e[j][k].imag *= s->phase[dir];
-        }
+      {
+        for(j=0;j<3;j++)
+          for(k=0;k<3;k++)
+            {
+              s->link[dir].e[j][k].real *= s->phase[dir];
+              s->link[dir].e[j][k].imag *= s->phase[dir];
+            }
+        /* Avoid loop vectorizing the outer loop after unrolling
+           the inners.  */
+        __asm__ volatile ("" : : : "memory");
+      }
 }
 
 int main()
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c
index 18476a4..79ed0bb 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -94,4 +94,4 @@ main ()
 
   return 0;
 }
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fee992d..8cb1ac1 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1286,15 +1286,19 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         {
           if (load_p
               && rhs_code != CFN_GATHER_LOAD
-              && rhs_code != CFN_MASK_GATHER_LOAD)
+              && rhs_code != CFN_MASK_GATHER_LOAD
+              /* Not grouped loads are handled as externals for BB
+                 vectorization.  For loop vectorization we can handle
+                 splats the same we handle single element interleaving.  */
+              && (is_a <bb_vec_info> (vinfo)
+                  || stmt_info != first_stmt_info))
             {
               /* Not grouped load.  */
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                  "Build SLP failed: not grouped load %G", stmt);
 
-              /* FORNOW: Not grouped loads are not supported.  */
-              if (is_a <bb_vec_info> (vinfo) && i != 0)
+              if (i != 0)
                 continue;
               /* Fatal mismatch.  */
               matches[0] = false;
@@ -1302,7 +1306,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         }
 
       /* Not memory operation.  */
-      if (!phi_p
+      if (!load_p
+          && !phi_p
           && rhs_code.is_tree_code ()
           && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
           && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
@@ -1774,7 +1779,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
         return NULL;
 
   /* If the SLP node is a load, terminate the recursion unless masked.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+  if (STMT_VINFO_DATA_REF (stmt_info)
       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
     {
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
@@ -1798,8 +1803,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
             {
-              int load_place = vect_get_place_in_interleaving_chain
-                  (load_info, first_stmt_info);
+              int load_place;
+              if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+                load_place = vect_get_place_in_interleaving_chain
+                    (load_info, first_stmt_info);
+              else
+                load_place = 0;
               gcc_assert (load_place != -1);
               load_permutation.safe_push (load_place);
             }
@@ -5439,6 +5448,16 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
                 this_load_permuted = true;
                 break;
               }
+          /* When this isn't a grouped access we know it's single element
+             and contiguous.  */
+          if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
+            {
+              if (!this_load_permuted
+                  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+                      || SLP_TREE_LANES (node) == 1))
+                SLP_TREE_LOAD_PERMUTATION (node).release ();
+              continue;
+            }
           stmt_vec_info first_stmt_info
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           if (!this_load_permuted
@@ -8129,12 +8148,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   tree vectype = SLP_TREE_VECTYPE (node);
   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
+  unsigned dr_group_size;
   machine_mode mode;
 
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+    dr_group_size = 1;
+  else
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      dr_group_size = DR_GROUP_SIZE (stmt_info);
+    }
 
   mode = TYPE_MODE (vectype);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -8175,7 +8198,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
   unsigned int in_nlanes;
-  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+  bool repeating_p = (group_size == dr_group_size
                       && multiple_p (nunits, group_size));
   if (repeating_p)
     {
@@ -8188,7 +8211,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
          it at least one to ensure the later computation for n_perms
          proceed.  */
       nvectors_per_build = nstmts > 0 ? nstmts : 1;
-      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
+      in_nlanes = dr_group_size * 3;
     }
   else
     {
@@ -8200,7 +8223,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
-      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
+      in_nlanes = const_vf * dr_group_size;
     }
   auto_sbitmap used_in_lanes (in_nlanes);
   bitmap_clear (used_in_lanes);
@@ -8214,7 +8237,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
     {
       unsigned int iter_num = j / group_size;
       unsigned int stmt_num = j % group_size;
-      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
+      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
       bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
         {
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b31971e..d642d3c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1150,6 +1150,8 @@ vect_model_load_cost (vec_info *vinfo,
           /* If the load is permuted then the alignment is determined by
              the first group element not by the first scalar stmt DR.  */
           stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+          if (!first_stmt_info)
+            first_stmt_info = stmt_info;
           /* Record the cost for the permutation.  */
           unsigned n_perms, n_loads;
           vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
@@ -2203,12 +2205,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 {
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
-  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  stmt_vec_info first_stmt_info;
+  unsigned int group_size;
+  unsigned HOST_WIDE_INT gap;
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = DR_GROUP_SIZE (first_stmt_info);
+      gap = DR_GROUP_GAP (first_stmt_info);
+    }
+  else
+    {
+      first_stmt_info = stmt_info;
+      group_size = 1;
+      gap = 0;
+    }
   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
-  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
   bool single_element_p = (stmt_info == first_stmt_info
                            && !DR_GROUP_NEXT_ELEMENT (stmt_info));
-  unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   /* True if the vectorized statements would access beyond the last
@@ -2311,11 +2325,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
               *memory_access_type = VMAT_ELEMENTWISE;
             }
         }
-      else
+      else if (cmp == 0 && loop_vinfo)
         {
-          gcc_assert (!loop_vinfo || cmp > 0);
-          *memory_access_type = VMAT_CONTIGUOUS;
+          gcc_assert (vls_type == VLS_LOAD);
+          *memory_access_type = VMAT_INVARIANT;
+          /* Invariant accesses perform only component accesses, alignment
+             is irrelevant for them.  */
+          *alignment_support_scheme = dr_unaligned_supported;
         }
+      else
+        *memory_access_type = VMAT_CONTIGUOUS;
 
   /* When we have a contiguous access across loop iterations
      but the access in the loop doesn't cover the full vector
@@ -2540,7 +2559,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
          is irrelevant for them.  */
       *alignment_support_scheme = dr_unaligned_supported;
     }
-  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
     {
       if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
                                       masked_p,
@@ -9464,46 +9483,6 @@ vectorizable_load (vec_info *vinfo,
       return false;
     }
 
-  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-    {
-      slp_perm = true;
-
-      if (!loop_vinfo)
-        {
-          /* In BB vectorization we may not actually use a loaded vector
-             accessing elements in excess of DR_GROUP_SIZE.  */
-          stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-          group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-          unsigned HOST_WIDE_INT nunits;
-          unsigned j, k, maxk = 0;
-          FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-            if (k > maxk)
-              maxk = k;
-          tree vectype = SLP_TREE_VECTYPE (slp_node);
-          if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-              || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "BB vectorization with gaps at the end of "
-                                 "a load is not supported\n");
-              return false;
-            }
-        }
-
-      auto_vec<tree> tem;
-      unsigned n_perms;
-      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-                                         true, &n_perms))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                             vect_location,
-                             "unsupported load permutation\n");
-          return false;
-        }
-    }
-
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)
@@ -9521,6 +9500,46 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
+  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    {
+      slp_perm = true;
+
+      if (!loop_vinfo)
+        {
+          /* In BB vectorization we may not actually use a loaded vector
+             accessing elements in excess of DR_GROUP_SIZE.  */
+          stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+          group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+          unsigned HOST_WIDE_INT nunits;
+          unsigned j, k, maxk = 0;
+          FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+            if (k > maxk)
+              maxk = k;
+          tree vectype = SLP_TREE_VECTYPE (slp_node);
+          if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+              || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "BB vectorization with gaps at the end of "
+                                 "a load is not supported\n");
+              return false;
+            }
+        }
+
+      auto_vec<tree> tem;
+      unsigned n_perms;
+      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
+                                         true, &n_perms))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                             vect_location,
+                             "unsupported load permutation\n");
+          return false;
+        }
+    }
+
   vect_memory_access_type memory_access_type;
   enum dr_alignment_support alignment_support_scheme;
   int misalignment;
@@ -9898,10 +9917,19 @@ vectorizable_load (vec_info *vinfo,
       || (!slp && memory_access_type == VMAT_CONTIGUOUS))
     grouped_load = false;
 
-  if (grouped_load)
+  if (grouped_load
+      || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
     {
-      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = DR_GROUP_SIZE (first_stmt_info);
+      if (grouped_load)
+        {
+          first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+          group_size = DR_GROUP_SIZE (first_stmt_info);
+        }
+      else
+        {
+          first_stmt_info = stmt_info;
+          group_size = 1;
+        }
       /* For SLP vectorization we directly vectorize a subchain
          without permutation.  */
       if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
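
For context on what the tree-vect-slp.cc change enables: the comment added in vect_build_slp_tree_1 says loop SLP can now handle splats of a non-grouped load the same way as single-element interleaving, instead of failing discovery with "Build SLP failed: not grouped load". A minimal sketch of such a loop follows; it is illustrative only (names and array shapes are hypothetical, not taken from the patch or its testcases):

double x[1024], y[512];

void
splat (void)
{
  for (int i = 0; i < 512; i++)
    {
      /* Both SLP lanes of the two-store group read the same load y[i].
         The load belongs to no interleaving group, so before this
         change SLP discovery rejected it in a loop context.  */
      x[2*i] = y[i];
      x[2*i+1] = y[i];
    }
}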
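Similarly, the new "cmp == 0 && loop_vinfo" arm in get_group_load_store_type classifies zero-step loads in a loop as VMAT_INVARIANT rather than asserting. A hedged sketch of a zero-step access (again with illustrative names):

double a[1024];

void
scale (double *p)
{
  for (int i = 0; i < 1024; i++)
    /* *p does not advance with i, so its data-ref step is zero: the
       vectorizer may hoist a single load and splat it across the
       vector (VMAT_INVARIANT) instead of treating it as contiguous.  */
    a[i] *= *p;
}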