author      Richard Sandiford <richard.sandiford@arm.com>   2016-05-24 10:15:36 +0000
committer   Richard Sandiford <rsandifo@gcc.gnu.org>        2016-05-24 10:15:36 +0000
commit      d3465d72bacd65663da8f8b4977f2936a799c3cd (patch)
tree        15d92b71f80740174fee4dd8fb61fd75530eaef4 /gcc
parent      836dbb1a27f3e793ad32723aeb6630237887e6d5 (diff)
Avoid unnecessary peeling for gaps with LD3
vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:
          if (loop_vinfo
              && ! STMT_VINFO_STRIDED_P (stmt_info)
              && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
                  || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
Tested on aarch64-linux-gnu and x86_64-linux-gnu.
gcc/
	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
	load_lanes/grouped_load classification comes first.  Don't check
	whether the vectorization factor is a multiple of the group size
	for load_lanes.

gcc/testsuite/
	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
From-SVN: r236632
Diffstat (limited to 'gcc')
-rw-r--r--	gcc/ChangeLog	7
-rw-r--r--	gcc/testsuite/ChangeLog	4
-rw-r--r--	gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c	13
-rw-r--r--	gcc/tree-vect-stmts.c	25
4 files changed, 36 insertions, 13 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a61b6cd..8f726b2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,12 @@
 2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
 
+	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
+	load_lanes/grouped_load classification comes first.  Don't check
+	whether the vectorization factor is a multiple of the group size
+	for load_lanes.
+
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Set
 	GROUP_GAP for single-element interleaving.
 	* tree-vect-stmts.c (vectorizable_load): Remove force_peeling
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 65589fa..cb78cc2 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
+
 2016-05-24  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/70434
diff --git a/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
new file mode 100644
index 0000000..c9cd104
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f66e180..1252d33 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       gcc_assert (!nested_in_vect_loop
 		  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+      if (!slp
+	  && !PURE_SLP_STMT (stmt_info)
+	  && !STMT_VINFO_STRIDED_P (stmt_info))
+	{
+	  if (vect_load_lanes_supported (vectype, group_size))
+	    load_lanes_p = true;
+	  else if (!vect_grouped_load_supported (vectype, group_size))
+	    return false;
+	}
 
       /* If this is single-element interleaving with an element distance
          that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (loop_vinfo
 	  && ! STMT_VINFO_STRIDED_P (stmt_info)
 	  && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-	      || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+	      || (!slp && !load_lanes_p && vf % group_size != 0)))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
 	slp_perm = true;
 
-      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
       /* ??? The following is overly pessimistic (as well as the loop
 	 case above) in the case we can statically determine the excess
 	 elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	  return false;
 	}
 
-      if (!slp
-	  && !PURE_SLP_STMT (stmt_info)
-	  && !STMT_VINFO_STRIDED_P (stmt_info))
-	{
-	  if (vect_load_lanes_supported (vectype, group_size))
-	    load_lanes_p = true;
-	  else if (!vect_grouped_load_supported (vectype, group_size))
-	    return false;
-	}
-
       /* Invalidate assumptions made by dependence analysis when vectorization
 	 on the unrolled body effectively re-orders stmts.  */
       if (!PURE_SLP_STMT (stmt_info)