diff options
author | Richard Sandiford <richard.sandiford@linaro.org> | 2018-01-13 18:00:31 +0000 |
---|---|---|
committer | Richard Sandiford <rsandifo@gcc.gnu.org> | 2018-01-13 18:00:31 +0000 |
commit | 4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f (patch) | |
tree | 99c1b77bf8b46d8dacdacdaaeba64d79aaf2f997 | |
parent | bb6c2b68d6961dfe98bece34e4418d7287ce7089 (diff) | |
download | gcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.zip gcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.tar.gz gcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.tar.bz2 |
Allow single-element interleaving for non-power-of-2 strides
This allows LD3 to be used for isolated a[i * 3] accesses, in a similar
way to the current a[i * 2] and a[i * 4] for LD2 and LD4 respectively.
Given the problems with the cost model underestimating the cost of
elementwise accesses, the patch continues to reject the VMAT_ELEMENTWISE
cases that are currently rejected.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
gcc/
* tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
single-element interleaving even if the size is not a power of 2.
* tree-vect-stmts.c (get_load_store_type): Disallow elementwise
accesses for single-element interleaving if the group size is
not a power of 2.
gcc/testsuite/
* gcc.target/aarch64/sve/struct_vect_18.c: New test.
* gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.
Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256634
-rw-r--r-- | gcc/ChangeLog | 10 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c | 44 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c | 36 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c | 42 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c | 45 | ||||
-rw-r--r-- | gcc/tree-vect-data-refs.c | 5 | ||||
-rw-r--r-- | gcc/tree-vect-stmts.c | 5 |
8 files changed, 192 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 169cae0..73bfb41 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -2,6 +2,16 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * tree-vect-data-refs.c (vect_analyze_group_access_1): Allow + single-element interleaving even if the size is not a power of 2. + * tree-vect-stmts.c (get_load_store_type): Disallow elementwise + accesses for single-element interleaving if the group size is + not a power of 2. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * doc/md.texi (fold_extract_last_@var{m}): Document. * doc/sourcebuild.texi (vect_fold_extract_last): Likewise. * optabs.def (fold_extract_last_optab): New optab. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index df572fd..72da419 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -2,6 +2,15 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * gcc.target/aarch64/sve/struct_vect_18.c: New test. + * gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise. + * gcc.target/aarch64/sve/struct_vect_19.c: Likewise. + * gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * lib/target-supports.exp (check_effective_target_vect_fold_extract_last): New proc. * gcc.dg/vect/pr65947-1.c: Update dump messages. Add markup diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c new file mode 100644 index 0000000..67b08d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define N 2000 + +#define TEST_LOOP(NAME, TYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME (TYPE *restrict dest, TYPE *restrict src) \ + { \ + for (int i = 0; i < N; ++i) \ + dest[i] += src[i * 3]; \ + } + +#define TEST(NAME) \ + TEST_LOOP (NAME##_i8, signed char) \ + TEST_LOOP (NAME##_i16, unsigned short) \ + TEST_LOOP (NAME##_f32, float) \ + TEST_LOOP (NAME##_f64, double) + +TEST (test) + +/* Check the vectorized loop. */ +/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */ + +/* Check the scalar tail. */ +/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */ +/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */ +/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */ +/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */ +/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */ +/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c new file mode 100644 index 0000000..9698216 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c @@ -0,0 +1,36 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "struct_vect_18.c" + +#undef TEST_LOOP +#define TEST_LOOP(NAME, TYPE) \ + { \ + TYPE out[N]; \ + TYPE in[N * 3]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 3; ++i) \ + { \ + in[i] = i * 9 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + NAME (out, in); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i * 7 / 2 + in[i * 3]; \ + if (out[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c new file mode 100644 index 0000000..3754190 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define TEST_LOOP(NAME, TYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME (TYPE *restrict dest, TYPE *restrict src, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + dest[i] += src[i * 3]; \ + } + +#define TEST(NAME) \ + TEST_LOOP (NAME##_i8, signed char) \ + TEST_LOOP (NAME##_i16, unsigned short) \ + TEST_LOOP (NAME##_f32, float) \ + TEST_LOOP (NAME##_f64, double) + +TEST (test) + +/* Check the vectorized loop. */ +/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */ + +/* Check the scalar tail. */ +/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */ +/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */ +/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */ +/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */ +/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */ +/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c new file mode 100644 index 0000000..1d0325f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c @@ -0,0 +1,45 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "struct_vect_19.c" + +#define N 1000 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, TYPE) \ + { \ + TYPE out[N]; \ + TYPE in[N * 3]; \ + int counts[] = { 0, 1, N - 1 }; \ + for (int j = 0; j < 3; ++j) \ + { \ + int count = counts[j]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 3; ++i) \ + { \ + in[i] = i * 9 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + NAME (out, in, count); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i * 7 / 2; \ + if (i < count) \ + expected += in[i * 3]; \ + if (out[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 23b1084..59462be 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2427,11 +2427,10 @@ vect_analyze_group_access_1 (struct data_reference *dr) element of the group that is accessed in the loop. */ /* Gaps are supported only for loads. STEP must be a multiple of the type - size. The size of the group must be a power of 2. */ + size. */ if (DR_IS_READ (dr) && (dr_step % type_size) == 0 - && groupsize > 0 - && pow2p_hwi (groupsize)) + && groupsize > 0) { GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 5bb6131..e4d2051 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -2176,7 +2176,10 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p, cost of using elementwise accesses. This check preserves the traditional behavior until that can be fixed. */ if (*memory_access_type == VMAT_ELEMENTWISE - && !STMT_VINFO_STRIDED_P (stmt_info)) + && !STMT_VINFO_STRIDED_P (stmt_info) + && !(stmt == GROUP_FIRST_ELEMENT (stmt_info) + && !GROUP_NEXT_ELEMENT (stmt_info) + && !pow2p_hwi (GROUP_SIZE (stmt_info)))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |