aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Sandiford <richard.sandiford@linaro.org>2018-01-13 18:00:31 +0000
committerRichard Sandiford <rsandifo@gcc.gnu.org>2018-01-13 18:00:31 +0000
commit4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f (patch)
tree99c1b77bf8b46d8dacdacdaaeba64d79aaf2f997
parentbb6c2b68d6961dfe98bece34e4418d7287ce7089 (diff)
downloadgcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.zip
gcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.tar.gz
gcc-4aa157e8d2aec2e4f9e97dcee86068135e0dcb2f.tar.bz2
Allow single-element interleaving for non-power-of-2 strides
This allows LD3 to be used for isolated a[i * 3] accesses, in a similar way to the current a[i * 2] and a[i * 4] for LD2 and LD4 respectively. Given the problems with the cost model underestimating the cost of elementwise accesses, the patch continues to reject the VMAT_ELEMENTWISE cases that are currently rejected. 2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> gcc/ * tree-vect-data-refs.c (vect_analyze_group_access_1): Allow single-element interleaving even if the size is not a power of 2. * tree-vect-stmts.c (get_load_store_type): Disallow elementwise accesses for single-element interleaving if the group size is not a power of 2. gcc/testsuite/ * gcc.target/aarch64/sve/struct_vect_18.c: New test. * gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise. * gcc.target/aarch64/sve/struct_vect_19.c: Likewise. * gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise. Co-Authored-By: Alan Hayward <alan.hayward@arm.com> Co-Authored-By: David Sherwood <david.sherwood@arm.com> From-SVN: r256634
-rw-r--r--gcc/ChangeLog10
-rw-r--r--gcc/testsuite/ChangeLog9
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c44
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c36
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c42
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c45
-rw-r--r--gcc/tree-vect-data-refs.c5
-rw-r--r--gcc/tree-vect-stmts.c5
8 files changed, 192 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 169cae0..73bfb41 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -2,6 +2,16 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
+ * tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
+ single-element interleaving even if the size is not a power of 2.
+ * tree-vect-stmts.c (get_load_store_type): Disallow elementwise
+ accesses for single-element interleaving if the group size is
+ not a power of 2.
+
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
* doc/md.texi (fold_extract_last_@var{m}): Document.
* doc/sourcebuild.texi (vect_fold_extract_last): Likewise.
* optabs.def (fold_extract_last_optab): New optab.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index df572fd..72da419 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -2,6 +2,15 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
+ * gcc.target/aarch64/sve/struct_vect_18.c: New test.
+ * gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
+ * gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
+ * gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.
+
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
* lib/target-supports.exp
(check_effective_target_vect_fold_extract_last): New proc.
* gcc.dg/vect/pr65947-1.c: Update dump messages. Add markup
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c
new file mode 100644
index 0000000..67b08d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define N 2000
+
+#define TEST_LOOP(NAME, TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME (TYPE *restrict dest, TYPE *restrict src) \
+ { \
+ for (int i = 0; i < N; ++i) \
+ dest[i] += src[i * 3]; \
+ }
+
+#define TEST(NAME) \
+ TEST_LOOP (NAME##_i8, signed char) \
+ TEST_LOOP (NAME##_i16, unsigned short) \
+ TEST_LOOP (NAME##_f32, float) \
+ TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail. */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c
new file mode 100644
index 0000000..9698216
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_18.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE) \
+ { \
+ TYPE out[N]; \
+ TYPE in[N * 3]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 3; ++i) \
+ { \
+ in[i] = i * 9 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ NAME (out, in); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i * 7 / 2 + in[i * 3]; \
+ if (out[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c
new file mode 100644
index 0000000..3754190
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define TEST_LOOP(NAME, TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME (TYPE *restrict dest, TYPE *restrict src, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i] += src[i * 3]; \
+ }
+
+#define TEST(NAME) \
+ TEST_LOOP (NAME##_i8, signed char) \
+ TEST_LOOP (NAME##_i16, unsigned short) \
+ TEST_LOOP (NAME##_f32, float) \
+ TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail. */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c
new file mode 100644
index 0000000..1d0325f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_19.c"
+
+#define N 1000
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE) \
+ { \
+ TYPE out[N]; \
+ TYPE in[N * 3]; \
+ int counts[] = { 0, 1, N - 1 }; \
+ for (int j = 0; j < 3; ++j) \
+ { \
+ int count = counts[j]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 3; ++i) \
+ { \
+ in[i] = i * 9 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ NAME (out, in, count); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i * 7 / 2; \
+ if (i < count) \
+ expected += in[i * 3]; \
+ if (out[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 23b1084..59462be 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -2427,11 +2427,10 @@ vect_analyze_group_access_1 (struct data_reference *dr)
element of the group that is accessed in the loop. */
/* Gaps are supported only for loads. STEP must be a multiple of the type
- size. The size of the group must be a power of 2. */
+ size. */
if (DR_IS_READ (dr)
&& (dr_step % type_size) == 0
- && groupsize > 0
- && pow2p_hwi (groupsize))
+ && groupsize > 0)
{
GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 5bb6131..e4d2051 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2176,7 +2176,10 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
cost of using elementwise accesses. This check preserves the
traditional behavior until that can be fixed. */
if (*memory_access_type == VMAT_ELEMENTWISE
- && !STMT_VINFO_STRIDED_P (stmt_info))
+ && !STMT_VINFO_STRIDED_P (stmt_info)
+ && !(stmt == GROUP_FIRST_ELEMENT (stmt_info)
+ && !GROUP_NEXT_ELEMENT (stmt_info)
+ && !pow2p_hwi (GROUP_SIZE (stmt_info))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,