aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc')
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c77
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c49
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c71
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c34
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c71
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c66
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c101
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c61
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c107
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c187
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c16
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c22
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c77
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c29
-rw-r--r--gcc/tree-vect-loop-manip.c26
-rw-r--r--gcc/tree-vect-loop.c307
-rw-r--r--gcc/tree-vectorizer.c4
-rw-r--r--gcc/tree-vectorizer.h56
18 files changed, 1297 insertions, 64 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 0000000..fb817b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < n; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n)
+{
+ unsigned short res = ~0;
+ for (int i = 0; i < n; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < n; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n)
+{
+ unsigned short res = ~0;
+ for (int i = 0; i < n; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < n; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < n; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
new file mode 100644
index 0000000..1dd579b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_10.c"
+
+int
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2);
+
+ if (add_loop (x, 0) != 0
+ || add_loop (x, 11) != 572
+ || add_loop (x, 0x100) != 22016
+ || add_loop (x, 0xfff) != 20480
+ || max_loop (x, 0) != 0
+ || max_loop (x, 11) != 132
+ || max_loop (x, 0x100) != 65280
+ || max_loop (x, 0xfff) != 65504
+ || or_loop (x, 0) != 0
+ || or_loop (x, 11) != 0xfe
+ || or_loop (x, 0x80) != 0x7ffe
+ || or_loop (x, 0xb4) != 0x7ffe
+ || or_loop (x, 0xb5) != 0xfffe
+ || eor_loop (x, 0) != 0
+ || eor_loop (x, 11) != 0xe8
+ || eor_loop (x, 0x100) != 0xcf00
+ || eor_loop (x, 0xfff) != 0xa000)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i];
+
+ if (min_loop (x, 0) != 65535
+ || min_loop (x, 11) != 65403
+ || min_loop (x, 0x100) != 255
+ || min_loop (x, 0xfff) != 31
+ || and_loop (x, 0) != 0xffff
+ || and_loop (x, 11) != 0xff01
+ || and_loop (x, 0x80) != 0x8001
+ || and_loop (x, 0xb4) != 0x8001
+ || and_loop (x, 0xb5) != 1)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
new file mode 100644
index 0000000..f99ef4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, unsigned short res)
+{
+ for (int i = 0; i < 0xfff; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
new file mode 100644
index 0000000..5b41560
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_11.c"
+
+int
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2);
+
+ if (add_loop (x, 42) != 20522
+ || max_loop (x, 65503) != 65504
+ || max_loop (x, 65505) != 65505
+ || or_loop (x, 0) != 0xfffe
+ || or_loop (x, 1) != 0xffff
+ || eor_loop (x, 0) != 0xa000
+ || eor_loop (x, 0xbfff) != 0x1fff)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i];
+
+ if (min_loop (x, 32) != 31
+ || min_loop (x, 30) != 30
+ || and_loop (x, 0xff) != 1
+ || and_loop (x, 0) != 0)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
new file mode 100644
index 0000000..d32b81a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
@@ -0,0 +1,71 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n, unsigned short res)
+{
+ for (int i = 0; i < n; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
new file mode 100644
index 0000000..929b81a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
@@ -0,0 +1,66 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_12.c"
+
+int
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2);
+
+ if (add_loop (x, 0, 10) != 10
+ || add_loop (x, 11, 42) != 614
+ || add_loop (x, 0x100, 84) != 22100
+ || add_loop (x, 0xfff, 20) != 20500
+ || max_loop (x, 0, 10) != 10
+ || max_loop (x, 11, 131) != 132
+ || max_loop (x, 11, 133) != 133
+ || max_loop (x, 0x100, 65279) != 65280
+ || max_loop (x, 0x100, 65281) != 65281
+ || max_loop (x, 0xfff, 65503) != 65504
+ || max_loop (x, 0xfff, 65505) != 65505
+ || or_loop (x, 0, 0x71) != 0x71
+ || or_loop (x, 11, 0) != 0xfe
+ || or_loop (x, 11, 0xb3c) != 0xbfe
+ || or_loop (x, 0x80, 0) != 0x7ffe
+ || or_loop (x, 0x80, 1) != 0x7fff
+ || or_loop (x, 0xb4, 0) != 0x7ffe
+ || or_loop (x, 0xb4, 1) != 0x7fff
+ || or_loop (x, 0xb5, 0) != 0xfffe
+ || or_loop (x, 0xb5, 1) != 0xffff
+ || eor_loop (x, 0, 0x3e) != 0x3e
+ || eor_loop (x, 11, 0) != 0xe8
+ || eor_loop (x, 11, 0x1ff) != 0x117
+ || eor_loop (x, 0x100, 0) != 0xcf00
+ || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+ || eor_loop (x, 0xfff, 0) != 0xa000
+ || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i];
+
+ if (min_loop (x, 0, 10000) != 10000
+ || min_loop (x, 11, 65404) != 65403
+ || min_loop (x, 11, 65402) != 65402
+ || min_loop (x, 0x100, 256) != 255
+ || min_loop (x, 0x100, 254) != 254
+ || min_loop (x, 0xfff, 32) != 31
+ || min_loop (x, 0xfff, 30) != 30
+ || and_loop (x, 0, 0x1234) != 0x1234
+ || and_loop (x, 11, 0xffff) != 0xff01
+ || and_loop (x, 11, 0xcdef) != 0xcd01
+ || and_loop (x, 0x80, 0xffff) != 0x8001
+ || and_loop (x, 0x80, 0xfffe) != 0x8000
+ || and_loop (x, 0xb4, 0xffff) != 0x8001
+ || and_loop (x, 0xb4, 0xfffe) != 0x8000
+ || and_loop (x, 0xb5, 0xffff) != 1
+ || and_loop (x, 0xb5, 0xfffe) != 0)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
new file mode 100644
index 0000000..ce2b8f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
@@ -0,0 +1,101 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 += x[i * 2];
+ res1 += x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+ res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+ res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 &= x[i * 2];
+ res1 &= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 |= x[i * 2];
+ res1 |= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < 0x7ff; ++i)
+ {
+ res0 ^= x[i * 2];
+ res1 ^= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
new file mode 100644
index 0000000..5514d8d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_13.c"
+
+int
+main (void)
+{
+ unsigned int x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+ unsigned int add_res[2] = { 42, 1111 };
+ add_loop (x, add_res);
+ if (add_res[0] != 968538154
+ || add_res[1] != 964340823)
+ __builtin_abort ();
+
+ unsigned int max_res1[2] = { 0, 0 };
+ max_loop (x, max_res1);
+ if (max_res1[0] != 1048150
+ || max_res1[1] != 1045506)
+ __builtin_abort ();
+
+ unsigned int max_res2[2] = { 1048151, 1045507 };
+ max_loop (x, max_res2);
+ if (max_res2[0] != 1048151
+ || max_res2[1] != 1045507)
+ __builtin_abort ();
+
+ unsigned int or_res[2] = { 0x1000000, 0x2000000 };
+ or_loop (x, or_res);
+ if (or_res[0] != 0x10ffffe
+ || or_res[1] != 0x20ffffe)
+ __builtin_abort ();
+
+ unsigned int eor_res[2] = { 0x1000000, 0x2000000 };
+ eor_loop (x, eor_res);
+ if (eor_res[0] != 0x1010000
+ || eor_res[1] != 0x20b5000)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i] & 0xfffff;
+
+ unsigned int min_res1[2] = { 500, 4000 };
+ min_loop (x, min_res1);
+ if (min_res1[0] != 425
+ || min_res1[1] != 3069)
+ __builtin_abort ();
+
+ unsigned int min_res2[2] = { 424, 3068 };
+ min_loop (x, min_res2);
+ if (min_res2[0] != 424
+ || min_res2[1] != 3068)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
new file mode 100644
index 0000000..3be611e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
@@ -0,0 +1,107 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 += x[i * 2];
+ res1 += x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+ res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+ res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 &= x[i * 2];
+ res1 &= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 |= x[i * 2];
+ res1 |= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, int n, unsigned int *res)
+{
+ unsigned int res0 = res[0];
+ unsigned int res1 = res[1];
+ for (int i = 0; i < n; ++i)
+ {
+ res0 ^= x[i * 2];
+ res1 ^= x[i * 2 + 1];
+ }
+ res[0] = res0;
+ res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
new file mode 100644
index 0000000..ccaa770
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
@@ -0,0 +1,187 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_14.c"
+
+int
+main (void)
+{
+ unsigned int x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+ unsigned int add_res1[2] = { 11, 22 };
+ add_loop (x, 0, add_res1);
+ if (add_res1[0] != 11
+ || add_res1[1] != 22)
+ __builtin_abort ();
+
+ unsigned int add_res2[2] = { 10, 20 };
+ add_loop (x, 11, add_res2);
+ if (add_res2[0] != 1902
+ || add_res2[1] != 2176)
+ __builtin_abort ();
+
+ unsigned int add_res3[2] = { 15, 30 };
+ add_loop (x, 0x100, add_res3);
+ if (add_res3[0] != 22435087
+ || add_res3[1] != 22566686)
+ __builtin_abort ();
+
+ unsigned int add_res4[2] = { 100, 200 };
+ add_loop (x, 0x11f, add_res4);
+ if (add_res4[0] != 31602244
+ || add_res4[1] != 31767656)
+ __builtin_abort ();
+
+ unsigned int max_res1[2] = { 461, 500 };
+ max_loop (x, 11, max_res1);
+ if (max_res1[0] != 462
+ || max_res1[1] != 506)
+ __builtin_abort ();
+
+ unsigned int max_res2[2] = { 463, 507 };
+ max_loop (x, 11, max_res2);
+ if (max_res2[0] != 463
+ || max_res2[1] != 507)
+ __builtin_abort ();
+
+ unsigned int max_res3[2] = { 1000000, 1000000 };
+ max_loop (x, 0x200, max_res3);
+ if (max_res3[0] != 1047552
+ || max_res3[1] != 1045506)
+ __builtin_abort ();
+
+ unsigned int max_res4[2] = { 1047553, 1045507 };
+ max_loop (x, 0x200, max_res4);
+ if (max_res4[0] != 1047553
+ || max_res4[1] != 1045507)
+ __builtin_abort ();
+
+ unsigned int max_res5[2] = { 300000, 30000 };
+ max_loop (x, 0x11f, max_res5);
+ if (max_res5[0] != 328902
+ || max_res5[1] != 330050)
+ __builtin_abort ();
+
+ unsigned int max_res6[2] = { 328903, 330051 };
+ max_loop (x, 0x11f, max_res6);
+ if (max_res6[0] != 328903
+ || max_res6[1] != 330051)
+ __builtin_abort ();
+
+ unsigned int or_res1[2] = { 11, 22 };
+ or_loop (x, 0, or_res1);
+ if (or_res1[0] != 11
+ || or_res1[1] != 22)
+ __builtin_abort ();
+
+ unsigned int or_res2[2] = { 0x200000, 0xe00000 };
+ or_loop (x, 11, or_res2);
+ if (or_res2[0] != 0x2001fe
+ || or_res2[1] != 0xe001fe)
+ __builtin_abort ();
+
+ unsigned int or_res3[2] = { 0x800000, 0x700000 };
+ or_loop (x, 0x40, or_res3);
+ if (or_res3[0] != 0x803ffe
+ || or_res3[1] != 0x707ffe)
+ __builtin_abort ();
+
+ unsigned int or_res4[2] = { 0x100001, 0x300000 };
+ or_loop (x, 0x4f, or_res4);
+ if (or_res4[0] != 0x107fff
+ || or_res4[1] != 0x307ffe)
+ __builtin_abort ();
+
+ unsigned int eor_res1[2] = { 11, 22 };
+ eor_loop (x, 0, eor_res1);
+ if (eor_res1[0] != 11
+ || eor_res1[1] != 22)
+ __builtin_abort ();
+
+ unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
+ eor_loop (x, 11, eor_res2);
+ if (eor_res2[0] != 0x2001cf
+ || eor_res2[1] != 0xe000b7)
+ __builtin_abort ();
+
+ unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
+ eor_loop (x, 0x100, eor_res3);
+ if (eor_res3[0] != 0x824200
+ || eor_res3[1] != 0x77dc00)
+ __builtin_abort ();
+
+ unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
+ eor_loop (x, 0x11f, eor_res4);
+ if (eor_res4[0] != 0x178801
+ || eor_res4[1] != 0x337240)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i] & 0xfffff;
+
+ unsigned int min_res1[2] = { 1048200, 1048100 };
+ min_loop (x, 11, min_res1);
+ if (min_res1[0] != 1048113
+ || min_res1[1] != 1048069)
+ __builtin_abort ();
+
+ unsigned int min_res2[2] = { 1048112, 1048068 };
+ min_loop (x, 11, min_res2);
+ if (min_res2[0] != 1048112
+ || min_res2[1] != 1048068)
+ __builtin_abort ();
+
+ unsigned int min_res3[2] = { 10000, 10000 };
+ min_loop (x, 0x200, min_res3);
+ if (min_res3[0] != 1023
+ || min_res3[1] != 3069)
+ __builtin_abort ();
+
+ unsigned int min_res4[2] = { 1022, 3068 };
+ min_loop (x, 0x200, min_res4);
+ if (min_res4[0] != 1022
+ || min_res4[1] != 3068)
+ __builtin_abort ();
+
+ unsigned int min_res5[2] = { 719680, 718530 };
+ min_loop (x, 0x11f, min_res5);
+ if (min_res5[0] != 719673
+ || min_res5[1] != 718525)
+ __builtin_abort ();
+
+ unsigned int min_res6[2] = { 719672, 718524 };
+ min_loop (x, 0x11f, min_res6);
+ if (min_res6[0] != 719672
+ || min_res6[1] != 718524)
+ __builtin_abort ();
+
+ unsigned int and_res1[2] = { 11, 22 };
+ and_loop (x, 0, and_res1);
+ if (and_res1[0] != 11
+ || and_res1[1] != 22)
+ __builtin_abort ();
+
+ unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
+ and_loop (x, 11, and_res2);
+ if (and_res2[0] != 0xf5c01
+ || and_res2[1] != 0xf7801)
+ __builtin_abort ();
+
+ unsigned int and_res3[2] = { 0x7efff, 0xecfff };
+ and_loop (x, 0x40, and_res3);
+ if (and_res3[0] != 0x7c001
+ || and_res3[1] != 0xe8001)
+ __builtin_abort ();
+
+ unsigned int and_res4[2] = { 0xffffff, 0xffffff };
+ and_loop (x, 0x4f, and_res4);
+ if (and_res4[0] != 0xf8001
+ || and_res4[1] != 0xf8001)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
new file mode 100644
index 0000000..15b1ade
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
@@ -0,0 +1,16 @@
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+int __attribute__((noipa))
+add_loop (int *x, int n, int res)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ res += x[i * 2];
+ res += x[i * 2 + 1];
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
new file mode 100644
index 0000000..3207fce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
@@ -0,0 +1,22 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_15.c"
+
+int
+main (void)
+{
+ int x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+ if (add_loop (x, 0, 33) != 33
+ || add_loop (x, 11, 30) != 4078
+ || add_loop (x, 0x100, 45) != 45001773
+ || add_loop (x, 0x11f, 300) != 63369900)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 0000000..b839821
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,77 @@
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < 0xfff; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x)
+{
+ unsigned short res = ~0;
+ for (int i = 0; i < 0xfff; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < 0xfff; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x)
+{
+ unsigned short res = ~0;
+ for (int i = 0; i < 0xfff; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < 0xfff; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x)
+{
+ unsigned short res = 0;
+ for (int i = 0; i < 0xfff; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
new file mode 100644
index 0000000..aa248f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_9.c"
+
+int
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2);
+
+ if (add_loop (x) != 20480
+ || max_loop (x) != 65504
+ || or_loop (x) != 0xfffe
+ || eor_loop (x) != 0xa000)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i];
+
+ if (min_loop (x) != 31
+ || and_loop (x) != 1)
+ __builtin_abort ();
+
+ return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a..c29ffb3 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,28 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
}
+/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
+ Return a value that equals:
+
+ - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
+ - SKIP_VALUE when the main loop is skipped. */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+ tree skip_value)
+{
+ gcc_assert (loop_vinfo->main_loop_edge);
+
+ tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+ basic_block bb = loop_vinfo->main_loop_edge->dest;
+ gphi *new_phi = create_phi_node (phi_result, bb);
+ add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+ UNKNOWN_LOCATION);
+ add_phi_arg (new_phi, skip_value,
+ loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+ return phi_result;
+}
+
/* Function vect_do_peeling.
Input:
@@ -2986,6 +3008,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
skip_vector ? anchor : guard_bb,
prob_epilog.invert (),
irred_flag);
+ if (vect_epilogues)
+ epilogue_vinfo->skip_this_loop_edge = guard_e;
slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
single_exit (epilog));
/* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3081,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
UNKNOWN_LOCATION);
niters = PHI_RESULT (new_phi);
+ epilogue_vinfo->main_loop_edge = update_e;
+ epilogue_vinfo->skip_main_loop_edge = skip_e;
}
/* Set ADVANCE to the number of iterations performed by the previous
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fe7e73f..8c27d75 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
+#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
@@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
th (0),
versioning_threshold (0),
vectorization_factor (0),
+ main_loop_edge (nullptr),
+ skip_main_loop_edge (nullptr),
+ skip_this_loop_edge (nullptr),
+ reusable_accumulators (),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
rgroup_compare_type (NULL_TREE),
@@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
prologue_cost, epilogue_cost);
}
+/* SEQ is a sequence of instructions that initialize the reduction
+ described by REDUC_INFO. Emit them in the appropriate place. */
+static void
+vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
+ stmt_vec_info reduc_info, gimple *seq)
+{
+ if (reduc_info->reused_accumulator)
+ {
+ /* When reusing an accumulator from the main loop, we only need
+ initialization instructions if the main loop can be skipped.
+ In that case, emit the initialization instructions at the end
+ of the guard block that does the skip. */
+ edge skip_edge = loop_vinfo->skip_main_loop_edge;
+ gcc_assert (skip_edge);
+ gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
+ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+ }
+ else
+ {
+ /* The normal case: emit the initialization instructions on the
+ preheader edge. */
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
+ }
+}
/* Function get_initial_def_for_reduction
@@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo,
}
if (stmts)
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+ vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
return init_def;
}
-/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose
- associated SLP node is SLP_NODE. NUMBER_OF_VECTORS is the number of vector
- defs to create. If NEUTRAL_OP is nonnull, introducing extra elements of
- that value will not change the result. */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
+ which performs a reduction involving GROUP_SIZE scalar statements.
+ NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
+ is nonnull, introducing extra elements of that value will not change the
+ result. */
static void
-get_initial_defs_for_reduction (vec_info *vinfo,
+get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
stmt_vec_info reduc_info,
- slp_tree slp_node,
vec<tree> *vec_oprnds,
unsigned int number_of_vectors,
- bool reduc_chain, tree neutral_op)
+ unsigned int group_size, tree neutral_op)
{
- vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+ vec<tree> &initial_values = reduc_info->reduc_initial_values;
unsigned HOST_WIDE_INT nunits;
unsigned j, number_of_places_left_in_vector;
tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
- unsigned int group_size = stmts.length ();
unsigned int i;
- class loop *loop;
-
- loop = (gimple_bb (reduc_info->stmt))->loop_father;
- gcc_assert (loop);
- edge pe = loop_preheader_edge (loop);
- gcc_assert (!reduc_chain || neutral_op);
+ gcc_assert (group_size == initial_values.length () || neutral_op);
/* NUMBER_OF_COPIES is the number of times we need to use the same values in
created vectors. It is greater than 1 if unrolling is performed.
@@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo,
{
tree op;
i = j % group_size;
- stmt_vec_info stmt_vinfo = stmts[i];
/* Get the def before the loop. In reduction chain we have only
one initial value. Else we have as many as PHIs in the group. */
- if (reduc_chain)
- op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo);
- else if (((vec_oprnds->length () + 1) * nunits
- - number_of_places_left_in_vector >= group_size)
- && neutral_op)
+ if (i >= initial_values.length () || (j > i && neutral_op))
op = neutral_op;
else
- op = vect_phi_initial_value (stmt_vinfo);
+ op = initial_values[i];
/* Create 'vect_ = {op0,op1,...,opn}'. */
number_of_places_left_in_vector--;
@@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo,
{
/* First time round, duplicate ELTS to fill the
required number of vectors. */
- duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
- number_of_vectors, *vec_oprnds);
+ duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
+ elts, number_of_vectors, *vec_oprnds);
break;
}
vec_oprnds->quick_push (init);
@@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo,
}
}
if (ctor_seq != NULL)
- gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+ vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
/* For a statement STMT_INFO taking part in a reduction operation return
@@ -4823,6 +4842,99 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
return stmt_info;
}
+/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
+ REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
+ return false. */
+
+static bool
+vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
+ stmt_vec_info reduc_info)
+{
+ loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+ if (!main_loop_vinfo)
+ return false;
+
+ if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
+ return false;
+
+ unsigned int num_phis = reduc_info->reduc_initial_values.length ();
+ auto_vec<tree, 16> main_loop_results (num_phis);
+ auto_vec<tree, 16> initial_values (num_phis);
+ if (edge main_loop_edge = loop_vinfo->main_loop_edge)
+ {
+ /* The epilogue loop can be entered either from the main loop or
+ from an earlier guard block. */
+ edge skip_edge = loop_vinfo->skip_main_loop_edge;
+ for (tree incoming_value : reduc_info->reduc_initial_values)
+ {
+ /* Look for:
+
+ INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
+ INITIAL_VALUE(guard block)>. */
+ gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
+
+ gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
+ gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
+
+ tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
+ tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
+
+ main_loop_results.quick_push (from_main_loop);
+ initial_values.quick_push (from_skip);
+ }
+ }
+ else
+ /* The main loop dominates the epilogue loop. */
+ main_loop_results.splice (reduc_info->reduc_initial_values);
+
+ /* See if the main loop has the kind of accumulator we need. */
+ vect_reusable_accumulator *accumulator
+ = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
+ if (!accumulator
+ || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
+ || !std::equal (main_loop_results.begin (), main_loop_results.end (),
+ accumulator->reduc_info->reduc_scalar_results.begin ()))
+ return false;
+
+ /* For now, only handle the case in which both loops are operating on the
+ same vector types. In future we could reduce wider vectors to narrower
+ ones as well. */
+ tree vectype = STMT_VINFO_VECTYPE (reduc_info);
+ tree old_vectype = TREE_TYPE (accumulator->reduc_input);
+ if (!useless_type_conversion_p (old_vectype, vectype))
+ return false;
+
+ /* Non-SLP reductions might apply an adjustment after the reduction
+ operation, in order to simplify the initialization of the accumulator.
+ If the epilogue loop carries on from where the main loop left off,
+ it should apply the same adjustment to the final reduction result.
+
+ If the epilogue loop can also be entered directly (rather than via
+ the main loop), we need to be able to handle that case in the same way,
+ with the same adjustment. (In principle we could add a PHI node
+ to select the correct adjustment, but in practice that shouldn't be
+ necessary.) */
+ tree main_adjustment
+ = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
+ if (loop_vinfo->main_loop_edge && main_adjustment)
+ {
+ gcc_assert (num_phis == 1);
+ tree initial_value = initial_values[0];
+ /* Check that we can use INITIAL_VALUE as the adjustment and
+ initialize the accumulator with a neutral value instead. */
+ if (!operand_equal_p (initial_value, main_adjustment))
+ return false;
+ tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+ initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
+ code, initial_value);
+ }
+ STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
+ reduc_info->reduc_initial_values.truncate (0);
+ reduc_info->reduc_initial_values.splice (initial_values);
+ reduc_info->reused_accumulator = accumulator;
+ return true;
+}
+
/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
@@ -4915,7 +5027,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gimple *use_stmt;
auto_vec<tree> reduc_inputs;
int j, i;
- auto_vec<tree> scalar_results;
+ vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
unsigned int group_size = 1, k;
auto_vec<gimple *> phis;
/* SLP reduction without reduction chain, e.g.,
@@ -4941,16 +5053,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gcc_assert (vectype);
mode = TYPE_MODE (vectype);
- tree initial_def = NULL;
tree induc_val = NULL_TREE;
tree adjustment_def = NULL;
if (slp_node)
;
else
{
- /* Get at the scalar def before the loop, that defines the initial value
- of the reduction variable. */
- initial_def = vect_phi_initial_value (reduc_def_stmt);
/* Optimize: for induction condition reduction, if we can't use zero
for induc_val, use initial_def. */
if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
@@ -5196,6 +5304,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
reduc_inputs.safe_push (single_input);
}
+ tree orig_reduc_input = reduc_inputs[0];
+
+ /* If this loop is an epilogue loop that can be skipped after the
+ main loop, we can only share a reduction operation between the
+ main loop and the epilogue if we put it at the target of the
+ skip edge.
+
+ We can still reuse accumulators if this check fails. Doing so has
+ the minor(?) benefit of making the epilogue loop's scalar result
+ independent of the main loop's scalar result. */
+ bool unify_with_main_loop_p = false;
+ if (reduc_info->reused_accumulator
+ && loop_vinfo->skip_this_loop_edge
+ && single_succ_p (exit_bb)
+ && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
+ {
+ unify_with_main_loop_p = true;
+
+ basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
+ reduc_inputs[0] = make_ssa_name (vectype);
+ gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
+ add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
+ UNKNOWN_LOCATION);
+ add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
+ loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
+ exit_gsi = gsi_after_labels (reduc_block);
+ }
+
+ /* Shouldn't be used beyond this point. */
+ exit_bb = nullptr;
+
if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
&& reduc_fn != IFN_LAST)
{
@@ -5405,6 +5544,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
the same as initial_def already. */
tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
induc_val);
+ tree initial_def = reduc_info->reduc_initial_values[0];
tmp = make_ssa_name (new_scalar_dest);
epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5425,9 +5565,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
gcc_assert (reduc_inputs.length () == 1);
gcc_assert (pow2p_hwi (group_size));
- slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
- vec<stmt_vec_info> orig_phis
- = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
gimple_seq seq = NULL;
/* Build a vector {0, 1, 2, ...}, with the same number of elements
@@ -5452,7 +5589,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
{
tree initial_value = NULL_TREE;
if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
- initial_value = vect_phi_initial_value (orig_phis[0]);
+ initial_value = reduc_info->reduc_initial_values[0];
neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
initial_value);
}
@@ -5466,7 +5603,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
for MIN and MAX reduction, for example. */
if (!neutral_op)
{
- tree scalar_value = vect_phi_initial_value (orig_phis[i]);
+ tree scalar_value = reduc_info->reduc_initial_values[i];
scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
scalar_value);
vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5780,6 +5917,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
the same as initial_def already. */
tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
induc_val);
+ tree initial_def = reduc_info->reduc_initial_values[0];
tree tmp = make_ssa_name (new_scalar_dest);
epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5819,6 +5957,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
scalar_results[0] = new_temp;
}
+ /* Record this operation if it could be reused by the epilogue loop. */
+ if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
+ loop_vinfo->reusable_accumulators.put (scalar_results[0],
+ { orig_reduc_input, reduc_info });
+
if (double_reduc)
loop = outer_loop;
@@ -5886,6 +6029,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
{
/* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
+
+ /* Look for a single use at the target of the skip edge. */
+ if (unify_with_main_loop_p)
+ {
+ use_operand_p use_p;
+ gimple *user;
+ if (!single_imm_use (orig_name, &use_p, &user))
+ gcc_unreachable ();
+ orig_name = gimple_get_lhs (user);
+ }
+
scalar_result = scalar_results[k];
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
{
@@ -7421,16 +7575,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
else
{
gcc_assert (slp_node == slp_node_instance->reduc_phis);
- tree initial_value = NULL_TREE;
+ vec<tree> &initial_values = reduc_info->reduc_initial_values;
+ vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+
+ unsigned int num_phis = stmts.length ();
if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
- initial_value = vect_phi_initial_value (phi);
- tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
- tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
- code, initial_value);
- get_initial_defs_for_reduction (loop_vinfo, reduc_info,
- slp_node_instance->reduc_phis,
- &vec_initial_defs, vec_num,
- initial_value != NULL, neutral_op);
+ num_phis = 1;
+ initial_values.reserve (num_phis);
+ for (unsigned int i = 0; i < num_phis; ++i)
+ {
+ gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
+ initial_values.quick_push (vect_phi_initial_value (this_phi));
+ }
+ if (vec_num == 1)
+ vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+ if (!initial_values.is_empty ())
+ {
+ tree initial_value
+ = (num_phis == 1 ? initial_values[0] : NULL_TREE);
+ tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+ tree neutral_op
+ = neutral_op_for_reduction (TREE_TYPE (vectype_out),
+ code, initial_value);
+ get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+ &vec_initial_defs, vec_num,
+ stmts.length (), neutral_op);
+ }
}
}
else
@@ -7438,6 +7608,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
/* Get at the scalar def before the loop, that defines the initial
value of the reduction variable. */
tree initial_def = vect_phi_initial_value (phi);
+ reduc_info->reduc_initial_values.safe_push (initial_def);
/* Optimize: if initial_def is for REDUC_MAX smaller than the base
and we can't use zero for induc_val, use initial_def. Similarly
for REDUC_MIN and initial_def larger than the base. */
@@ -7474,21 +7645,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
initial_def, initial_def);
else
{
- enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
- tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def),
- code, initial_def);
- gcc_assert (neutral_op);
- /* Try to simplify the vector initialization by applying an
- adjustment after the reduction has been performed. */
- if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
- && !operand_equal_p (neutral_op, initial_def))
+ if (ncopies == 1)
+ vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+ if (!reduc_info->reduc_initial_values.is_empty ())
{
- STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def;
- initial_def = neutral_op;
+ initial_def = reduc_info->reduc_initial_values[0];
+ enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+ tree neutral_op
+ = neutral_op_for_reduction (TREE_TYPE (initial_def),
+ code, initial_def);
+ gcc_assert (neutral_op);
+ /* Try to simplify the vector initialization by applying an
+ adjustment after the reduction has been performed. */
+ if (!reduc_info->reused_accumulator
+ && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+ && !operand_equal_p (neutral_op, initial_def))
+ {
+ STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
+ = initial_def;
+ initial_def = neutral_op;
+ }
+ vec_initial_def
+ = get_initial_def_for_reduction (loop_vinfo, reduc_info,
+ initial_def, neutral_op);
}
- vec_initial_def
- = get_initial_def_for_reduction (loop_vinfo, reduc_info,
- initial_def, neutral_op);
}
}
@@ -7499,6 +7679,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
vec_initial_defs.quick_push (vec_initial_def);
}
+ if (auto *accumulator = reduc_info->reused_accumulator)
+ {
+ if (loop_vinfo->main_loop_edge)
+ vec_initial_defs[0]
+ = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
+ vec_initial_defs[0]);
+ else
+ vec_initial_defs.safe_push (accumulator->reduc_input);
+ gcc_assert (vec_initial_defs.length () == 1);
+ }
+
/* Generate the reduction PHIs upfront. */
for (i = 0; i < vec_num; i++)
{
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 9748043..f1035a8 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt)
STMT_VINFO_SLP_VECT_ONLY (res) = false;
STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
STMT_VINFO_VEC_STMTS (res) = vNULL;
+ res->reduc_initial_values = vNULL;
+ res->reduc_scalar_results = vNULL;
if (is_a <loop_vec_info> (this)
&& gimple_code (stmt) == GIMPLE_PHI
@@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
release_ssa_name (lhs);
}
+ stmt_info->reduc_initial_values.release ();
+ stmt_info->reduc_scalar_results.release ();
STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
STMT_VINFO_VEC_STMTS (stmt_info).release ();
free (stmt_info);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e2fd360..d825b0c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens;
typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
+/* Information about a reduction accumulator from the main loop that could
+ conceivably be reused as the input to a reduction in an epilogue loop. */
+struct vect_reusable_accumulator {
+ /* The final value of the accumulator, which forms the input to the
+ reduction operation. */
+ tree reduc_input;
+
+ /* The stmt_vec_info that describes the reduction (i.e. the one for
+ which is_reduc_info is true). */
+ stmt_vec_info reduc_info;
+};
+
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
@@ -588,6 +600,26 @@ public:
/* Unrolling factor */
poly_uint64 vectorization_factor;
+ /* If this loop is an epilogue loop whose main loop can be skipped,
+ MAIN_LOOP_EDGE is the edge from the main loop to this loop's
+ preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the
+ main loop and goes straight to this loop's preheader.
+
+ Both fields are null otherwise. */
+ edge main_loop_edge;
+ edge skip_main_loop_edge;
+
+ /* If this loop is an epilogue loop that might be skipped after executing
+ the main loop, this edge is the one that skips the epilogue. */
+ edge skip_this_loop_edge;
+
+ /* The vectorized form of a standard reduction replaces the original
+ scalar code's final result (a loop-closed SSA PHI) with the result
+ of a vector-to-scalar reduction operation. After vectorization,
+ this variable maps these vector-to-scalar results to information
+ about the reductions that generated them. */
+ hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+
/* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1186,6 +1218,23 @@ public:
/* The vector type for performing the actual reduction. */
tree reduc_vectype;
+ /* If IS_REDUC_INFO is true and if the vector code is performing
+ N scalar reductions in parallel, this variable gives the initial
+ scalar values of those N reductions. */
+ vec<tree> reduc_initial_values;
+
+ /* If IS_REDUC_INFO is true and if the vector code is performing
+ N scalar reductions in parallel, this variable gives the vectorized code's
+ final (scalar) result for each of those N reductions. In other words,
+ REDUC_SCALAR_RESULTS[I] replaces the original scalar code's loop-closed
+ SSA PHI for reduction number I. */
+ vec<tree> reduc_scalar_results;
+
+ /* Only meaningful if IS_REDUC_INFO. If non-null, the reduction is
+ being performed by an epilogue loop and we have decided to reuse
+ this accumulator from the main loop. */
+ vect_reusable_accumulator *reused_accumulator;
+
/* Whether we force a single cycle PHI during reduction vectorization. */
bool force_single_cycle;
@@ -1382,12 +1431,6 @@ vect_phi_initial_value (gphi *phi)
return PHI_ARG_DEF_FROM_EDGE (phi, pe);
}
-static inline tree
-vect_phi_initial_value (stmt_vec_info stmt_info)
-{
- return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt));
-}
-
/* Return true if STMT_INFO should produce a vector mask type rather than
a normal nonmask type. */
@@ -1818,6 +1861,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
tree *, tree *, tree *, int, bool, bool,
tree *);
+extern tree vect_get_main_loop_result (loop_vec_info, tree, tree);
extern void vect_prepare_for_masked_peels (loop_vec_info);
extern dump_user_location_t find_loop_location (class loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);