diff options
Diffstat (limited to 'gcc')
18 files changed, 1297 insertions, 64 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c new file mode 100644 index 0000000..fb817b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c @@ -0,0 +1,77 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, int n) +{ + unsigned short res = ~0; + for (int i = 0; i < n; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, int n) +{ + unsigned short res = ~0; + for (int i = 0; i < n; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c new file mode 100644 index 0000000..1dd579b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c @@ -0,0 +1,49 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_10.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 0) != 0 + || add_loop (x, 11) != 572 + || add_loop (x, 0x100) != 22016 + || add_loop (x, 0xfff) != 20480 + || max_loop (x, 0) != 0 + || max_loop (x, 11) != 132 + || max_loop (x, 0x100) != 65280 + || max_loop (x, 0xfff) != 65504 + || or_loop (x, 0) != 0 + || or_loop (x, 11) != 0xfe + || or_loop (x, 0x80) != 0x7ffe + || or_loop (x, 0xb4) != 0x7ffe + || or_loop (x, 0xb5) != 0xfffe + || eor_loop (x, 0) != 0 + || eor_loop (x, 11) != 0xe8 + || eor_loop (x, 0x100) != 0xcf00 + || eor_loop (x, 0xfff) != 0xa000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 0) != 65535 + || min_loop (x, 11) != 65403 + || min_loop (x, 0x100) != 255 + || min_loop (x, 0xfff) != 31 + || and_loop (x, 0) != 0xffff + || and_loop (x, 11) != 0xff01 + || and_loop (x, 0x80) != 0x8001 + || and_loop (x, 0xb4) != 0x8001 + || and_loop (x, 0xb5) != 1) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c new file mode 100644 index 0000000..f99ef4a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c @@ -0,0 +1,71 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c new file mode 100644 index 0000000..5b41560 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c @@ -0,0 +1,34 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_11.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 42) != 20522 + || max_loop (x, 65503) != 65504 + || max_loop (x, 65505) != 65505 + || or_loop (x, 0) != 0xfffe + || or_loop (x, 1) != 0xffff + || eor_loop (x, 0) != 0xa000 + || eor_loop (x, 0xbfff) != 0x1fff) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 32) != 31 + || min_loop (x, 30) != 30 + || and_loop (x, 0xff) != 1 + || and_loop (x, 0) != 0) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c new file mode 100644 index 0000000..d32b81a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c @@ -0,0 +1,71 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c new file mode 100644 index 0000000..929b81a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c @@ -0,0 +1,66 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_12.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 0, 10) != 10 + || add_loop (x, 11, 42) != 614 + || add_loop (x, 0x100, 84) != 22100 + || add_loop (x, 0xfff, 20) != 20500 + || max_loop (x, 0, 10) != 10 + || max_loop (x, 11, 131) != 132 + || max_loop (x, 11, 133) != 133 + || max_loop (x, 0x100, 65279) != 65280 + || max_loop (x, 0x100, 65281) != 65281 + || max_loop (x, 0xfff, 65503) != 65504 + || max_loop (x, 0xfff, 65505) != 65505 + || or_loop (x, 0, 0x71) != 0x71 + || or_loop (x, 11, 0) != 0xfe + || or_loop (x, 11, 0xb3c) != 0xbfe + || or_loop (x, 0x80, 0) != 0x7ffe + || or_loop (x, 0x80, 1) != 0x7fff + || or_loop (x, 0xb4, 0) != 0x7ffe + || or_loop (x, 0xb4, 1) != 0x7fff + || or_loop (x, 0xb5, 0) != 0xfffe + || or_loop (x, 0xb5, 1) != 0xffff + || eor_loop (x, 0, 0x3e) != 0x3e + || eor_loop (x, 11, 0) != 0xe8 + || eor_loop (x, 11, 0x1ff) != 0x117 + || eor_loop (x, 0x100, 0) != 0xcf00 + || eor_loop (x, 0x100, 0xeee) != 0xc1ee + || eor_loop (x, 0xfff, 0) != 0xa000 + || eor_loop (x, 0xfff, 0x8888) != 0x2888) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 0, 10000) != 10000 + || min_loop (x, 11, 65404) != 65403 + || min_loop (x, 11, 65402) != 65402 + || min_loop (x, 0x100, 256) != 255 + || min_loop (x, 0x100, 254) != 254 + || min_loop (x, 0xfff, 32) != 31 + || min_loop (x, 0xfff, 30) != 30 + || and_loop (x, 0, 0x1234) != 0x1234 + || and_loop (x, 11, 0xffff) != 0xff01 + || and_loop (x, 11, 0xcdef) != 0xcd01 + || and_loop (x, 0x80, 0xffff) != 0x8001 + || and_loop (x, 0x80, 0xfffe) != 0x8000 + || and_loop (x, 0xb4, 0xffff) != 0x8001 + || and_loop (x, 0xb4, 0xfffe) != 0x8000 + || and_loop (x, 0xb5, 0xffff) != 1 + || and_loop (x, 0xb5, 0xfffe) != 0) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c new file mode 100644 index 0000000..ce2b8f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c @@ -0,0 +1,101 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +void __attribute__((noipa)) +add_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 += x[i * 2]; + res1 += x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +min_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 = res0 < x[i * 2] ? res0 : x[i * 2]; + res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +max_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 = res0 > x[i * 2] ? res0 : x[i * 2]; + res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +and_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 &= x[i * 2]; + res1 &= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +or_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 |= x[i * 2]; + res1 |= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +eor_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 ^= x[i * 2]; + res1 ^= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c new file mode 100644 index 0000000..5514d8d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c @@ -0,0 +1,61 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_13.c" + +int +main (void) +{ + unsigned int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + unsigned int add_res[2] = { 42, 1111 }; + add_loop (x, add_res); + if (add_res[0] != 968538154 + || add_res[1] != 964340823) + __builtin_abort (); + + unsigned int max_res1[2] = { 0, 0 }; + max_loop (x, max_res1); + if (max_res1[0] != 1048150 + || max_res1[1] != 1045506) + __builtin_abort (); + + unsigned int max_res2[2] = { 1048151, 1045507 }; + max_loop (x, max_res2); + if (max_res2[0] != 1048151 + || max_res2[1] != 1045507) + __builtin_abort (); + + unsigned int or_res[2] = { 0x1000000, 0x2000000 }; + or_loop (x, or_res); + if (or_res[0] != 0x10ffffe + || or_res[1] != 0x20ffffe) + __builtin_abort (); + + unsigned int eor_res[2] = { 0x1000000, 0x2000000 }; + eor_loop (x, eor_res); + if (eor_res[0] != 0x1010000 + || eor_res[1] != 0x20b5000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i] & 0xfffff; + + unsigned int min_res1[2] = { 500, 4000 }; + min_loop (x, min_res1); + if (min_res1[0] != 425 + || min_res1[1] != 3069) + __builtin_abort (); + + unsigned int min_res2[2] = { 424, 3068 }; + min_loop (x, min_res2); + if (min_res2[0] != 424 + || min_res2[1] != 3068) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c new file mode 100644 index 0000000..3be611e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c @@ -0,0 +1,107 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +void __attribute__((noipa)) +add_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 += x[i * 2]; + res1 += x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +min_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 = res0 < x[i * 2] ? res0 : x[i * 2]; + res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +max_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 = res0 > x[i * 2] ? res0 : x[i * 2]; + res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +and_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 &= x[i * 2]; + res1 &= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +or_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 |= x[i * 2]; + res1 |= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +eor_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 ^= x[i * 2]; + res1 ^= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c new file mode 100644 index 0000000..ccaa770 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c @@ -0,0 +1,187 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_14.c" + +int +main (void) +{ + unsigned int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + unsigned int add_res1[2] = { 11, 22 }; + add_loop (x, 0, add_res1); + if (add_res1[0] != 11 + || add_res1[1] != 22) + __builtin_abort (); + + unsigned int add_res2[2] = { 10, 20 }; + add_loop (x, 11, add_res2); + if (add_res2[0] != 1902 + || add_res2[1] != 2176) + __builtin_abort (); + + unsigned int add_res3[2] = { 15, 30 }; + add_loop (x, 0x100, add_res3); + if (add_res3[0] != 22435087 + || add_res3[1] != 22566686) + __builtin_abort (); + + unsigned int add_res4[2] = { 100, 200 }; + add_loop (x, 0x11f, add_res4); + if (add_res4[0] != 31602244 + || add_res4[1] != 31767656) + __builtin_abort (); + + unsigned int max_res1[2] = { 461, 500 }; + max_loop (x, 11, max_res1); + if (max_res1[0] != 462 + || max_res1[1] != 506) + __builtin_abort (); + + unsigned int max_res2[2] = { 463, 507 }; + max_loop (x, 11, max_res2); + if (max_res2[0] != 463 + || max_res2[1] != 507) + __builtin_abort (); + + unsigned int max_res3[2] = { 1000000, 1000000 }; + max_loop (x, 0x200, max_res3); + if (max_res3[0] != 1047552 + || max_res3[1] != 1045506) + __builtin_abort (); + + unsigned int max_res4[2] = { 1047553, 1045507 }; + max_loop (x, 0x200, max_res4); + if (max_res4[0] != 1047553 + || max_res4[1] != 1045507) + __builtin_abort (); + + unsigned int max_res5[2] = { 300000, 30000 }; + max_loop (x, 0x11f, max_res5); + if (max_res5[0] != 328902 + || max_res5[1] != 330050) + __builtin_abort (); + + unsigned int max_res6[2] = { 328903, 330051 }; + max_loop (x, 0x11f, max_res6); + if (max_res6[0] != 328903 + || max_res6[1] != 330051) + __builtin_abort (); + + unsigned int or_res1[2] = { 11, 22 }; + or_loop (x, 0, or_res1); + if (or_res1[0] != 11 + || or_res1[1] != 22) + __builtin_abort (); + + unsigned int or_res2[2] = { 0x200000, 0xe00000 }; + or_loop (x, 11, or_res2); + if (or_res2[0] != 0x2001fe + || or_res2[1] != 0xe001fe) + __builtin_abort (); + + unsigned int or_res3[2] = { 0x800000, 0x700000 }; + or_loop (x, 0x40, or_res3); + if (or_res3[0] != 0x803ffe + || or_res3[1] != 0x707ffe) + __builtin_abort (); + + unsigned int or_res4[2] = { 0x100001, 0x300000 }; + or_loop (x, 0x4f, or_res4); + if (or_res4[0] != 0x107fff + || or_res4[1] != 0x307ffe) + __builtin_abort (); + + unsigned int eor_res1[2] = { 11, 22 }; + eor_loop (x, 0, eor_res1); + if (eor_res1[0] != 11 + || eor_res1[1] != 22) + __builtin_abort (); + + unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff }; + eor_loop (x, 11, eor_res2); + if (eor_res2[0] != 0x2001cf + || eor_res2[1] != 0xe000b7) + __builtin_abort (); + + unsigned int eor_res3[2] = { 0x805000, 0x70f000 }; + eor_loop (x, 0x100, eor_res3); + if (eor_res3[0] != 0x824200 + || eor_res3[1] != 0x77dc00) + __builtin_abort (); + + unsigned int eor_res4[2] = { 0x101201, 0x300f00 }; + eor_loop (x, 0x11f, eor_res4); + if (eor_res4[0] != 0x178801 + || eor_res4[1] != 0x337240) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i] & 0xfffff; + + unsigned int min_res1[2] = { 1048200, 1048100 }; + min_loop (x, 11, min_res1); + if (min_res1[0] != 1048113 + || min_res1[1] != 1048069) + __builtin_abort (); + + unsigned int min_res2[2] = { 1048112, 1048068 }; + min_loop (x, 11, min_res2); + if (min_res2[0] != 1048112 + || min_res2[1] != 1048068) + __builtin_abort (); + + unsigned int min_res3[2] = { 10000, 10000 }; + min_loop (x, 0x200, min_res3); + if (min_res3[0] != 1023 + || min_res3[1] != 3069) + __builtin_abort (); + + unsigned int min_res4[2] = { 1022, 3068 }; + min_loop (x, 0x200, min_res4); + if (min_res4[0] != 1022 + || min_res4[1] != 3068) + __builtin_abort (); + + unsigned int min_res5[2] = { 719680, 718530 }; + min_loop (x, 0x11f, min_res5); + if (min_res5[0] != 719673 + || min_res5[1] != 718525) + __builtin_abort (); + + unsigned int min_res6[2] = { 719672, 718524 }; + min_loop (x, 0x11f, min_res6); + if (min_res6[0] != 719672 + || min_res6[1] != 718524) + __builtin_abort (); + + unsigned int and_res1[2] = { 11, 22 }; + and_loop (x, 0, and_res1); + if (and_res1[0] != 11 + || and_res1[1] != 22) + __builtin_abort (); + + unsigned int and_res2[2] = { 0xf5cff, 0xf78ff }; + and_loop (x, 11, and_res2); + if (and_res2[0] != 0xf5c01 + || and_res2[1] != 0xf7801) + __builtin_abort (); + + unsigned int and_res3[2] = { 0x7efff, 0xecfff }; + and_loop (x, 0x40, and_res3); + if (and_res3[0] != 0x7c001 + || and_res3[1] != 0xe8001) + __builtin_abort (); + + unsigned int and_res4[2] = { 0xffffff, 0xffffff }; + and_loop (x, 0x4f, and_res4); + if (and_res4[0] != 0xf8001 + || and_res4[1] != 0xf8001) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c new file mode 100644 index 0000000..15b1ade --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c @@ -0,0 +1,16 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +int __attribute__((noipa)) +add_loop (int *x, int n, int res) +{ + for (int i = 0; i < n; ++i) + { + res += x[i * 2]; + res += x[i * 2 + 1]; + } + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c new file mode 100644 index 0000000..3207fce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c @@ -0,0 +1,22 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_15.c" + +int +main (void) +{ + int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + if (add_loop (x, 0, 33) != 33 + || add_loop (x, 11, 30) != 4078 + || add_loop (x, 0x100, 45) != 45001773 + || add_loop (x, 0x11f, 300) != 63369900) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c new file mode 100644 index 0000000..b839821 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c @@ -0,0 +1,77 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x) +{ + unsigned short res = ~0; + for (int i = 0; i < 0xfff; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x) +{ + unsigned short res = ~0; + for (int i = 0; i < 0xfff; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c new file mode 100644 index 0000000..aa248f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_9.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x) != 20480 + || max_loop (x) != 65504 + || or_loop (x) != 0xfffe + || eor_loop (x) != 0xa000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x) != 31 + || and_loop (x) != 1) + __builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 2909e8a..c29ffb3 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -2457,6 +2457,28 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo, return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true); } +/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped. + Return a value that equals: + + - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and + - SKIP_VALUE when the main loop is skipped. */ + +tree +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value, + tree skip_value) +{ + gcc_assert (loop_vinfo->main_loop_edge); + + tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value)); + basic_block bb = loop_vinfo->main_loop_edge->dest; + gphi *new_phi = create_phi_node (phi_result, bb); + add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge, + UNKNOWN_LOCATION); + add_phi_arg (new_phi, skip_value, + loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION); + return phi_result; +} + /* Function vect_do_peeling. Input: @@ -2986,6 +3008,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, skip_vector ? anchor : guard_bb, prob_epilog.invert (), irred_flag); + if (vect_epilogues) + epilogue_vinfo->skip_this_loop_edge = guard_e; slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, single_exit (epilog)); /* Only need to handle basic block before epilog loop if it's not @@ -3057,6 +3081,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, UNKNOWN_LOCATION); niters = PHI_RESULT (new_phi); + epilogue_vinfo->main_loop_edge = update_e; + epilogue_vinfo->skip_main_loop_edge = skip_e; } /* Set ADVANCE to the number of iterations performed by the previous diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index fe7e73f..8c27d75 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_ALGORITHM #include "config.h" #include "system.h" #include "coretypes.h" @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) th (0), versioning_threshold (0), vectorization_factor (0), + main_loop_edge (nullptr), + skip_main_loop_edge (nullptr), + skip_this_loop_edge (nullptr), + reusable_accumulators (), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -4607,7 +4612,32 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, prologue_cost, epilogue_cost); } +/* SEQ is a sequence of instructions that initialize the reduction + described by REDUC_INFO. Emit them in the appropriate place. */ +static void +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, gimple *seq) +{ + if (reduc_info->reused_accumulator) + { + /* When reusing an accumulator from the main loop, we only need + initialization instructions if the main loop can be skipped. + In that case, emit the initialization instructions at the end + of the guard block that does the skip. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + gcc_assert (skip_edge); + gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src); + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); + } + else + { + /* The normal case: emit the initialization instructions on the + preheader edge. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); + } +} /* Function get_initial_def_for_reduction @@ -4675,36 +4705,30 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo, } if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts); return init_def; } -/* Get at the initial defs for the reduction PHIs for REDUC_INFO, whose - associated SLP node is SLP_NODE. NUMBER_OF_VECTORS is the number of vector - defs to create. If NEUTRAL_OP is nonnull, introducing extra elements of - that value will not change the result. */ +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, + which performs a reduction involving GROUP_SIZE scalar statements. + NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP + is nonnull, introducing extra elements of that value will not change the + result. */ static void -get_initial_defs_for_reduction (vec_info *vinfo, +get_initial_defs_for_reduction (loop_vec_info loop_vinfo, stmt_vec_info reduc_info, - slp_tree slp_node, vec<tree> *vec_oprnds, unsigned int number_of_vectors, - bool reduc_chain, tree neutral_op) + unsigned int group_size, tree neutral_op) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); + vec<tree> &initial_values = reduc_info->reduc_initial_values; unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; tree vector_type = STMT_VINFO_VECTYPE (reduc_info); - unsigned int group_size = stmts.length (); unsigned int i; - class loop *loop; - - loop = (gimple_bb (reduc_info->stmt))->loop_father; - gcc_assert (loop); - edge pe = loop_preheader_edge (loop); - gcc_assert (!reduc_chain || neutral_op); + gcc_assert (group_size == initial_values.length () || neutral_op); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. @@ -4734,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo, { tree op; i = j % group_size; - stmt_vec_info stmt_vinfo = stmts[i]; /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ - if (reduc_chain) - op = j != 0 ? neutral_op : vect_phi_initial_value (stmt_vinfo); - else if (((vec_oprnds->length () + 1) * nunits - - number_of_places_left_in_vector >= group_size) - && neutral_op) + if (i >= initial_values.length () || (j > i && neutral_op)) op = neutral_op; else - op = vect_phi_initial_value (stmt_vinfo); + op = initial_values[i]; /* Create 'vect_ = {op0,op1,...,opn}'. */ number_of_places_left_in_vector--; @@ -4781,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo, { /* First time round, duplicate ELTS to fill the required number of vectors. */ - duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, - number_of_vectors, *vec_oprnds); + duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, + elts, number_of_vectors, *vec_oprnds); break; } vec_oprnds->quick_push (init); @@ -4794,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo, } } if (ctor_seq != NULL) - gsi_insert_seq_on_edge_immediate (pe, ctor_seq); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq); } /* For a statement STMT_INFO taking part in a reduction operation return @@ -4823,6 +4842,99 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) return stmt_info; } +/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that + REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise + return false. */ + +static bool +vect_find_reusable_accumulator (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info) +{ + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + if (!main_loop_vinfo) + return false; + + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) + return false; + + unsigned int num_phis = reduc_info->reduc_initial_values.length (); + auto_vec<tree, 16> main_loop_results (num_phis); + auto_vec<tree, 16> initial_values (num_phis); + if (edge main_loop_edge = loop_vinfo->main_loop_edge) + { + /* The epilogue loop can be entered either from the main loop or + from an earlier guard block. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + for (tree incoming_value : reduc_info->reduc_initial_values) + { + /* Look for: + + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), + INITIAL_VALUE(guard block)>. */ + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); + + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); + gcc_assert (gimple_bb (phi) == main_loop_edge->dest); + + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); + + main_loop_results.quick_push (from_main_loop); + initial_values.quick_push (from_skip); + } + } + else + /* The main loop dominates the epilogue loop. */ + main_loop_results.splice (reduc_info->reduc_initial_values); + + /* See if the main loop has the kind of accumulator we need. */ + vect_reusable_accumulator *accumulator + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); + if (!accumulator + || num_phis != accumulator->reduc_info->reduc_scalar_results.length () + || !std::equal (main_loop_results.begin (), main_loop_results.end (), + accumulator->reduc_info->reduc_scalar_results.begin ())) + return false; + + /* For now, only handle the case in which both loops are operating on the + same vector types. In future we could reduce wider vectors to narrower + ones as well. */ + tree vectype = STMT_VINFO_VECTYPE (reduc_info); + tree old_vectype = TREE_TYPE (accumulator->reduc_input); + if (!useless_type_conversion_p (old_vectype, vectype)) + return false; + + /* Non-SLP reductions might apply an adjustment after the reduction + operation, in order to simplify the initialization of the accumulator. + If the epilogue loop carries on from where the main loop left off, + it should apply the same adjustment to the final reduction result. + + If the epilogue loop can also be entered directly (rather than via + the main loop), we need to be able to handle that case in the same way, + with the same adjustment. (In principle we could add a PHI node + to select the correct adjustment, but in practice that shouldn't be + necessary.) */ + tree main_adjustment + = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); + if (loop_vinfo->main_loop_edge && main_adjustment) + { + gcc_assert (num_phis == 1); + tree initial_value = initial_values[0]; + /* Check that we can use INITIAL_VALUE as the adjustment and + initialize the accumulator with a neutral value instead. */ + if (!operand_equal_p (initial_value, main_adjustment)) + return false; + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), + code, initial_value); + } + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; + reduc_info->reduc_initial_values.truncate (0); + reduc_info->reduc_initial_values.splice (initial_values); + reduc_info->reused_accumulator = accumulator; + return true; +} + /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -4915,7 +5027,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gimple *use_stmt; auto_vec<tree> reduc_inputs; int j, i; - auto_vec<tree> scalar_results; + vec<tree> &scalar_results = reduc_info->reduc_scalar_results; unsigned int group_size = 1, k; auto_vec<gimple *> phis; /* SLP reduction without reduction chain, e.g., @@ -4941,16 +5053,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gcc_assert (vectype); mode = TYPE_MODE (vectype); - tree initial_def = NULL; tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) ; else { - /* Get at the scalar def before the loop, that defines the initial value - of the reduction variable. */ - initial_def = vect_phi_initial_value (reduc_def_stmt); /* Optimize: for induction condition reduction, if we can't use zero for induc_val, use initial_def. */ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) @@ -5196,6 +5304,37 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, reduc_inputs.safe_push (single_input); } + tree orig_reduc_input = reduc_inputs[0]; + + /* If this loop is an epilogue loop that can be skipped after the + main loop, we can only share a reduction operation between the + main loop and the epilogue if we put it at the target of the + skip edge. + + We can still reuse accumulators if this check fails. Doing so has + the minor(?) benefit of making the epilogue loop's scalar result + independent of the main loop's scalar result. */ + bool unify_with_main_loop_p = false; + if (reduc_info->reused_accumulator + && loop_vinfo->skip_this_loop_edge + && single_succ_p (exit_bb) + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) + { + unify_with_main_loop_p = true; + + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; + reduc_inputs[0] = make_ssa_name (vectype); + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), + UNKNOWN_LOCATION); + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); + exit_gsi = gsi_after_labels (reduc_block); + } + + /* Shouldn't be used beyond this point. */ + exit_bb = nullptr; + if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { @@ -5405,6 +5544,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5425,9 +5565,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gcc_assert (reduc_inputs.length () == 1); gcc_assert (pow2p_hwi (group_size)); - slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; - vec<stmt_vec_info> orig_phis - = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); gimple_seq seq = NULL; /* Build a vector {0, 1, 2, ...}, with the same number of elements @@ -5452,7 +5589,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { tree initial_value = NULL_TREE; if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - initial_value = vect_phi_initial_value (orig_phis[0]); + initial_value = reduc_info->reduc_initial_values[0]; neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, initial_value); } @@ -5466,7 +5603,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for MIN and MAX reduction, for example. */ if (!neutral_op) { - tree scalar_value = vect_phi_initial_value (orig_phis[i]); + tree scalar_value = reduc_info->reduc_initial_values[i]; scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), scalar_value); vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5780,6 +5917,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tree tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5819,6 +5957,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, scalar_results[0] = new_temp; } + /* Record this operation if it could be reused by the epilogue loop. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION) + loop_vinfo->reusable_accumulators.put (scalar_results[0], + { orig_reduc_input, reduc_info }); + if (double_reduc) loop = outer_loop; @@ -5886,6 +6029,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); + + /* Look for a single use at the target of the skip edge. */ + if (unify_with_main_loop_p) + { + use_operand_p use_p; + gimple *user; + if (!single_imm_use (orig_name, &use_p, &user)) + gcc_unreachable (); + orig_name = gimple_get_lhs (user); + } + scalar_result = scalar_results[k]; FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { @@ -7421,16 +7575,32 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, else { gcc_assert (slp_node == slp_node_instance->reduc_phis); - tree initial_value = NULL_TREE; + vec<tree> &initial_values = reduc_info->reduc_initial_values; + vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); + + unsigned int num_phis = stmts.length (); if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) - initial_value = vect_phi_initial_value (phi); - tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); - tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), - code, initial_value); - get_initial_defs_for_reduction (loop_vinfo, reduc_info, - slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - initial_value != NULL, neutral_op); + num_phis = 1; + initial_values.reserve (num_phis); + for (unsigned int i = 0; i < num_phis; ++i) + { + gphi *this_phi = as_a<gphi *> (stmts[i]->stmt); + initial_values.quick_push (vect_phi_initial_value (this_phi)); + } + if (vec_num == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!initial_values.is_empty ()) + { + tree initial_value + = (num_phis == 1 ? initial_values[0] : NULL_TREE); + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (vectype_out), + code, initial_value); + get_initial_defs_for_reduction (loop_vinfo, reduc_info, + &vec_initial_defs, vec_num, + stmts.length (), neutral_op); + } } } else @@ -7438,6 +7608,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ tree initial_def = vect_phi_initial_value (phi); + reduc_info->reduc_initial_values.safe_push (initial_def); /* Optimize: if initial_def is for REDUC_MAX smaller than the base and we can't use zero for induc_val, use initial_def. Similarly for REDUC_MIN and initial_def larger than the base. */ @@ -7474,21 +7645,30 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, initial_def, initial_def); else { - enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); - tree neutral_op = neutral_op_for_reduction (TREE_TYPE (initial_def), - code, initial_def); - gcc_assert (neutral_op); - /* Try to simplify the vector initialization by applying an - adjustment after the reduction has been performed. */ - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def - && !operand_equal_p (neutral_op, initial_def)) + if (ncopies == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!reduc_info->reduc_initial_values.is_empty ()) { - STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = initial_def; - initial_def = neutral_op; + initial_def = reduc_info->reduc_initial_values[0]; + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (initial_def), + code, initial_def); + gcc_assert (neutral_op); + /* Try to simplify the vector initialization by applying an + adjustment after the reduction has been performed. */ + if (!reduc_info->reused_accumulator + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + && !operand_equal_p (neutral_op, initial_def)) + { + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) + = initial_def; + initial_def = neutral_op; + } + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_info, + initial_def, neutral_op); } - vec_initial_def - = get_initial_def_for_reduction (loop_vinfo, reduc_info, - initial_def, neutral_op); } } @@ -7499,6 +7679,17 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, vec_initial_defs.quick_push (vec_initial_def); } + if (auto *accumulator = reduc_info->reused_accumulator) + { + if (loop_vinfo->main_loop_edge) + vec_initial_defs[0] + = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input, + vec_initial_defs[0]); + else + vec_initial_defs.safe_push (accumulator->reduc_input); + gcc_assert (vec_initial_defs.length () == 1); + } + /* Generate the reduction PHIs upfront. */ for (i = 0; i < vec_num; i++) { diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 9748043..f1035a8 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt) STMT_VINFO_SLP_VECT_ONLY (res) = false; STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false; STMT_VINFO_VEC_STMTS (res) = vNULL; + res->reduc_initial_values = vNULL; + res->reduc_scalar_results = vNULL; if (is_a <loop_vec_info> (this) && gimple_code (stmt) == GIMPLE_PHI @@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info) release_ssa_name (lhs); } + stmt_info->reduc_initial_values.release (); + stmt_info->reduc_scalar_results.release (); STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release (); STMT_VINFO_VEC_STMTS (stmt_info).release (); free (stmt_info); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e2fd360..d825b0c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens; typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec; +/* Information about a reduction accumulator from the main loop that could + conceivably be reused as the input to a reduction in an epilogue loop. */ +struct vect_reusable_accumulator { + /* The final value of the accumulator, which forms the input to the + reduction operation. */ + tree reduc_input; + + /* The stmt_vec_info that describes the reduction (i.e. the one for + which is_reduc_info is true). */ + stmt_vec_info reduc_info; +}; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -588,6 +600,26 @@ public: /* Unrolling factor */ poly_uint64 vectorization_factor; + /* If this loop is an epilogue loop whose main loop can be skipped, + MAIN_LOOP_EDGE is the edge from the main loop to this loop's + preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the + main loop and goes straight to this loop's preheader. + + Both fields are null otherwise. */ + edge main_loop_edge; + edge skip_main_loop_edge; + + /* If this loop is an epilogue loop that might be skipped after executing + the main loop, this edge is the one that skips the epilogue. */ + edge skip_this_loop_edge; + + /* The vectorized form of a standard reduction replaces the original + scalar code's final result (a loop-closed SSA PHI) with the result + of a vector-to-scalar reduction operation. After vectorization, + this variable maps these vector-to-scalar results to information + about the reductions that generated them. */ + hash_map<tree, vect_reusable_accumulator> reusable_accumulators; + /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR if there is no particular limit. */ unsigned HOST_WIDE_INT max_vectorization_factor; @@ -1186,6 +1218,23 @@ public: /* The vector type for performing the actual reduction. */ tree reduc_vectype; + /* If IS_REDUC_INFO is true and if the vector code is performing + N scalar reductions in parallel, this variable gives the initial + scalar values of those N reductions. */ + vec<tree> reduc_initial_values; + + /* If IS_REDUC_INFO is true and if the vector code is performing + N scalar reductions in parallel, this variable gives the vectorized code's + final (scalar) result for each of those N reductions. In other words, + REDUC_SCALAR_RESULTS[I] replaces the original scalar code's loop-closed + SSA PHI for reduction number I. */ + vec<tree> reduc_scalar_results; + + /* Only meaningful if IS_REDUC_INFO. If non-null, the reduction is + being performed by an epilogue loop and we have decided to reuse + this accumulator from the main loop. */ + vect_reusable_accumulator *reused_accumulator; + /* Whether we force a single cycle PHI during reduction vectorization. */ bool force_single_cycle; @@ -1382,12 +1431,6 @@ vect_phi_initial_value (gphi *phi) return PHI_ARG_DEF_FROM_EDGE (phi, pe); } -static inline tree -vect_phi_initial_value (stmt_vec_info stmt_info) -{ - return vect_phi_initial_value (as_a <gphi *> (stmt_info->stmt)); -} - /* Return true if STMT_INFO should produce a vector mask type rather than a normal nonmask type. */ @@ -1818,6 +1861,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *); extern class loop *vect_do_peeling (loop_vec_info, tree, tree, tree *, tree *, tree *, int, bool, bool, tree *); +extern tree vect_get_main_loop_result (loop_vec_info, tree, tree); extern void vect_prepare_for_masked_peels (loop_vec_info); extern dump_user_location_t find_loop_location (class loop *); extern bool vect_can_advance_ivs_p (loop_vec_info); |