aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/testsuite/gcc.dg/vect/pr68445.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-1.c | 3
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-11b.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-11c.c | 3
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-2.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-23.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-33.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-42.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-46.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-53.c | 3
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-54.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-55.c | 37
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-56.c | 51
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-cond-1.c | 3
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c | 3
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c | 4
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-1.c | 5
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-10.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-2.c | 4
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-3.c | 4
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-4.c | 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-5.c | 5
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-6.c | 4
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-7.c | 4
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-8.c | 6
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-perm-9.c | 6
-rw-r--r-- gcc/testsuite/gcc.dg/vect/vect-complex-5.c | 3
-rw-r--r-- gcc/tree-vect-loop.cc | 76
-rw-r--r-- gcc/tree-vect-slp.cc | 255
-rw-r--r-- gcc/tree-vect-stmts.cc | 127
-rw-r--r-- gcc/tree-vectorizer.h | 4
31 files changed, 458 insertions, 172 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/pr68445.c b/gcc/testsuite/gcc.dg/vect/pr68445.c
index 15bffdc..71d61b9 100644
--- a/gcc/testsuite/gcc.dg/vect/pr68445.c
+++ b/gcc/testsuite/gcc.dg/vect/pr68445.c
@@ -16,4 +16,4 @@ void IMB_double_fast_x (int *destf, int *dest, int y, int *p1f)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-1.c b/gcc/testsuite/gcc.dg/vect/slp-1.c
index d4a13f1..e1a45e1 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-1.c
@@ -122,5 +122,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target {! vect_strided5 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_strided5 } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index df64c8d..0208f03 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -45,4 +45,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_strided4 || vect_perm } && vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11c.c b/gcc/testsuite/gcc.dg/vect/slp-11c.c
index 2e70fca..25d7f2c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11c.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11c.c
@@ -45,5 +45,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-2.c b/gcc/testsuite/gcc.dg/vect/slp-2.c
index d0de357..08d2116 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-2.c
@@ -144,5 +144,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c
index 8836acf..d32ee5b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-23.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-23.c
@@ -114,5 +114,5 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */
/* SLP fails for the second loop with variable-length SVE because
the load size is greater than the minimum vector size. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-33.c b/gcc/testsuite/gcc.dg/vect/slp-33.c
index c382093..9c6c1e4 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-33.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-33.c
@@ -108,7 +108,7 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_uintfloat_cvt && vect_int_mult} } } } */
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_uintfloat_cvt}} && {! {vect_int_mult}}} } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_uintfloat_cvt}} && {! {vect_int_mult}}} } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-42.c b/gcc/testsuite/gcc.dg/vect/slp-42.c
index 6b78246..53eca6b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-42.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-42.c
@@ -15,5 +15,5 @@ void foo (int n)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */
/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c
index bf44547..b44a673 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -98,4 +98,4 @@ main ()
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-53.c b/gcc/testsuite/gcc.dg/vect/slp-53.c
index d8cd5f8..50b3e9d 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-53.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-53.c
@@ -12,4 +12,5 @@ void foo (int * __restrict x, int *y)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-54.c b/gcc/testsuite/gcc.dg/vect/slp-54.c
index ab66b34..57268ab 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-54.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-54.c
@@ -15,4 +15,4 @@ void foo (int * __restrict x, int *y)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } xfail riscv*-*-* } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-55.c b/gcc/testsuite/gcc.dg/vect/slp-55.c
new file mode 100644
index 0000000..0bf65ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-55.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_int_mult } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+void foo (int * __restrict a, int *b, int *c)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[i] + 7;
+ a[2*i+1] = c[i] * 3;
+ }
+}
+
+int bar (int *b)
+{
+ int res = 0;
+ for (int i = 0; i < 1024; ++i)
+ {
+ res += b[2*i] + 7;
+ res += b[2*i+1] * 3;
+ }
+ return res;
+}
+
+void baz (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[2*i] + 7;
+ a[2*i+1] = b[2*i+1] * 3;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-56.c b/gcc/testsuite/gcc.dg/vect/slp-56.c
new file mode 100644
index 0000000..0b985ea
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-56.c
@@ -0,0 +1,51 @@
+#include "tree-vect.h"
+
+/* This is a load-lane / masked-store-lane test that more reliably
+ triggers SLP than SVE's mask_struct_store_*.c */
+
+void __attribute__ ((noipa))
+test4 (int *__restrict dest, int *__restrict src,
+ int *__restrict cond, int bias, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ int value0 = src[i * 4] + bias;
+ int value1 = src[i * 4 + 1] * bias;
+ int value2 = src[i * 4 + 2] + bias;
+ int value3 = src[i * 4 + 3] * bias;
+ if (cond[i])
+ {
+ dest[i * 4] = value0;
+ dest[i * 4 + 1] = value1;
+ dest[i * 4 + 2] = value2;
+ dest[i * 4 + 3] = value3;
+ }
+ }
+}
+
+int dest[16*4];
+int src[16*4];
+int cond[16];
+const int dest_chk[16*4] = {0, 0, 0, 0, 9, 25, 11, 35, 0, 0, 0, 0, 17, 65, 19,
+ 75, 0, 0, 0, 0, 25, 105, 27, 115, 0, 0, 0, 0, 33, 145, 35, 155, 0, 0, 0,
+ 0, 41, 185, 43, 195, 0, 0, 0, 0, 49, 225, 51, 235, 0, 0, 0, 0, 57, 265, 59,
+ 275, 0, 0, 0, 0, 65, 305, 67, 315};
+
+int main()
+{
+ check_vect ();
+#pragma GCC novector
+ for (int i = 0; i < 16; ++i)
+ cond[i] = i & 1;
+#pragma GCC novector
+ for (int i = 0; i < 16 * 4; ++i)
+ src[i] = i;
+ test4 (dest, src, cond, 5, 16);
+#pragma GCC novector
+ for (int i = 0; i < 16 * 4; ++i)
+ if (dest[i] != dest_chk[i])
+ abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target { vect_variable_length && vect_load_lanes } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-1.c b/gcc/testsuite/gcc.dg/vect/slp-cond-1.c
index c76ea5d..16ab0cc 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-cond-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-cond-1.c
@@ -125,5 +125,4 @@ main ()
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { ! vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target { vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c
index 2792b93..07f871c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c
@@ -56,5 +56,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_unpack } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack xfail { vect_variable_length && vect_load_lanes } } } } */
-
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c
index 5c75dc1..0f7b479 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c
@@ -51,5 +51,5 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_unpack } } } */
/* The epilogues are vectorized using partial vectors. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_unpack && { { ! vect_partial_vectors_usage_1 } || s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_unpack && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c
index dbb107f..93b5907 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c
@@ -81,9 +81,8 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-10.c b/gcc/testsuite/gcc.dg/vect/slp-perm-10.c
index 03de4c6..2cce30c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-10.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-10.c
@@ -53,4 +53,4 @@ int main ()
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* SLP fails for variable-length SVE because the load size is greater
than the minimum vector size. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c
index 41fd159..6ac29e7 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c
@@ -55,8 +55,6 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c
index 9ea35ba..d1953054 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c
@@ -68,9 +68,7 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c
index f4bda39..107968f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c
@@ -115,4 +115,4 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_load_lanes && vect_strided5 } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c
index 7128cf4..0dedd4a 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c
@@ -105,9 +105,6 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
-
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
index 5cc6261..000848c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
@@ -106,5 +106,5 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm3_int } } } */
/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
-/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
index df13c37..f15736e 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
@@ -97,8 +97,6 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c
index 029be54..7610524 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c
@@ -61,10 +61,8 @@ int main (int argc, const char* argv[])
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_byte } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } */
/* The epilogues are vectorized using partial vectors. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { vect_partial_vectors_usage_1 && { ! s390_vx } } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_byte && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && { vect_partial_vectors_usage_1 && { ! s390_vx } } } } } } */
/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
index 89400fb..c9468d8 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
@@ -58,7 +58,5 @@ int main (int argc, const char* argv[])
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */
/* We don't try permutes with a group size of 3 for variable-length
vectors. */
-/* { dg-final { scan-tree-dump "permutation requires at least three vectors" "vect" { target { vect_perm_short && { ! vect_perm3_short } } xfail vect_variable_length } } } */
-/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_perm3_short || { vect32 || vect_load_lanes } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short || { vect32 || vect_load_lanes } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
index ac562dc..0d85072 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
@@ -40,5 +40,4 @@ main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { ! vect_hw_misalign } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1fb7bbd..242d5e2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2958,82 +2958,6 @@ start_over:
"unsupported SLP instances\n");
goto again;
}
-
- /* Check whether any load in ALL SLP instances is possibly permuted. */
- slp_tree load_node, slp_root;
- unsigned i, x;
- slp_instance instance;
- bool can_use_lanes = true;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
- {
- slp_root = SLP_INSTANCE_TREE (instance);
- int group_size = SLP_TREE_LANES (slp_root);
- tree vectype = SLP_TREE_VECTYPE (slp_root);
- bool loads_permuted = false;
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- {
- if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
- continue;
- unsigned j;
- stmt_vec_info load_info;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
- if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
- {
- loads_permuted = true;
- break;
- }
- }
-
- /* If the loads and stores can be handled with load/store-lane
- instructions record it and move on to the next instance. */
- if (loads_permuted
- && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
- && vect_store_lanes_supported (vectype, group_size, false)
- != IFN_LAST)
- {
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- if (STMT_VINFO_GROUPED_ACCESS
- (SLP_TREE_REPRESENTATIVE (load_node)))
- {
- stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
- (SLP_TREE_REPRESENTATIVE (load_node));
- /* Use SLP for strided accesses (or if we can't
- load-lanes). */
- if (STMT_VINFO_STRIDED_P (stmt_vinfo)
- || vect_load_lanes_supported
- (STMT_VINFO_VECTYPE (stmt_vinfo),
- DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
- break;
- }
-
- can_use_lanes
- = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
-
- if (can_use_lanes && dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "SLP instance %p can use load/store-lanes\n",
- (void *) instance);
- }
- else
- {
- can_use_lanes = false;
- break;
- }
- }
-
- /* If all SLP instances can use load/store-lanes abort SLP and try again
- with SLP disabled. */
- if (can_use_lanes)
- {
- ok = opt_result::failure_at (vect_location,
- "Built SLP cancelled: can use "
- "load/store-lanes\n");
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Built SLP cancelled: all SLP instances support "
- "load/store-lanes\n");
- goto again;
- }
}
/* Dissolve SLP-only groups. */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 2304cda..5a65a99 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -121,6 +121,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ this->ldst_lanes = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
@@ -3483,7 +3484,8 @@ static bool
vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info, slp_instance_kind kind,
- unsigned max_tree_size, unsigned *limit);
+ unsigned max_tree_size, unsigned *limit,
+ bool force_single_lane = false);
/* Build an interleaving scheme for the store sources RHS_NODES from
SCALAR_STMTS. */
@@ -3678,7 +3680,8 @@ vect_build_slp_instance (vec_info *vinfo,
unsigned max_tree_size, unsigned *limit,
scalar_stmts_to_slp_tree_map_t *bst_map,
/* ??? We need stmt_info for group splitting. */
- stmt_vec_info stmt_info_)
+ stmt_vec_info stmt_info_,
+ bool force_single_lane = false)
{
/* If there's no budget left bail out early. */
if (*limit == 0)
@@ -3707,9 +3710,17 @@ vect_build_slp_instance (vec_info *vinfo,
poly_uint64 max_nunits = 1;
unsigned tree_size = 0;
unsigned i;
- slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
- &max_nunits, matches, limit,
- &tree_size, bst_map);
+
+ slp_tree node = NULL;
+ if (force_single_lane)
+ {
+ matches[0] = true;
+ matches[1] = false;
+ }
+ else
+ node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+ &max_nunits, matches, limit,
+ &tree_size, bst_map);
if (node != NULL)
{
/* Calculate the unrolling factor based on the smallest type. */
@@ -3905,10 +3916,33 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split the RHS into arbitrary pieces of
size >= 1. */
else if (is_a <loop_vec_info> (vinfo)
- && (i > 0 && i < group_size)
- && !vect_slp_prefer_store_lanes_p (vinfo,
- stmt_info, group_size, i))
- {
+ && (group_size != 1 && i < group_size))
+ {
+ /* There are targets that cannot do even/odd interleaving schemes
+ so they absolutely need to use load/store-lanes. For now
+ force single-lane SLP for them - they would be happy with
+ uniform power-of-two lanes (but depending on element size),
+ but even if we can use 'i' as indicator we would need to
+ backtrack when later lanes fail to discover with the same
+ granularity. We cannot turn any of strided or scatter store
+ into store-lanes. */
+ /* ??? If this is not in sync with what get_load_store_type
+ later decides the SLP representation is not good for other
+ store vectorization methods. */
+ bool want_store_lanes
+ = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ && ! STMT_VINFO_STRIDED_P (stmt_info)
+ && compare_step_with_zero (vinfo, stmt_info) > 0
+ && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
+ group_size, 1));
+ if (want_store_lanes || force_single_lane)
+ i = 1;
+
+ /* A fatal discovery fail doesn't always mean single-lane SLP
+ isn't a possibility, so try. */
+ if (i == 0)
+ i = 1;
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
@@ -3942,7 +3976,10 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- end = group_size;
+ if (want_store_lanes || force_single_lane)
+ end = start + 1;
+ else
+ end = group_size;
}
else
{
@@ -3976,7 +4013,31 @@ vect_build_slp_instance (vec_info *vinfo,
}
/* Now we assume we can build the root SLP node from all stores. */
- node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
+ if (want_store_lanes)
+ {
+ /* For store-lanes feed the store node with all RHS nodes
+ in order. */
+ node = vect_create_new_slp_node (scalar_stmts,
+ SLP_TREE_CHILDREN
+ (rhs_nodes[0]).length ());
+ SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ node->ldst_lanes = true;
+ SLP_TREE_CHILDREN (node)
+ .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
+ + rhs_nodes.length () - 1);
+ /* First store value and possibly mask. */
+ SLP_TREE_CHILDREN (node)
+ .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
+ /* Rest of the store values. All mask nodes are the same,
+ this should be guaranteed by dataref group discovery. */
+ for (unsigned j = 1; j < rhs_nodes.length (); ++j)
+ SLP_TREE_CHILDREN (node)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ child->refcnt++;
+ }
+ else
+ node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
while (!rhs_nodes.is_empty ())
vect_free_slp_tree (rhs_nodes.pop ());
@@ -4043,7 +4104,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info,
slp_instance_kind kind,
- unsigned max_tree_size, unsigned *limit)
+ unsigned max_tree_size, unsigned *limit,
+ bool force_single_lane)
{
vec<stmt_vec_info> scalar_stmts;
@@ -4088,7 +4150,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
roots, remain,
max_tree_size, limit, bst_map,
kind == slp_inst_kind_store
- ? stmt_info : NULL);
+ ? stmt_info : NULL, force_single_lane);
/* ??? If this is slp_inst_kind_store and the above succeeded here's
where we should do store group splitting. */
@@ -4184,12 +4246,50 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
lower. */
stmt_vec_info first
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
+ unsigned group_lanes = DR_GROUP_SIZE (first);
+
+ /* Verify if all load permutations can be implemented with a suitably
+ large element load-lanes operation. */
+ unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+ if (STMT_VINFO_STRIDED_P (first)
+ || compare_step_with_zero (loop_vinfo, first) <= 0
+ || exact_log2 (ld_lanes_lanes) == -1
+ /* ??? For now only support the single-lane case as there is
+ missing support on the store-lane side and code generation
+ isn't up to the task yet. */
+ || ld_lanes_lanes != 1
+ || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+ group_lanes / ld_lanes_lanes,
+ false) == IFN_LAST)
+ ld_lanes_lanes = 0;
+ else
+ /* Verify the loads access the same number of lanes aligned to
+ ld_lanes_lanes. */
+ for (slp_tree load : loads)
+ {
+ if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+ if (first % ld_lanes_lanes != 0)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ }
/* Only a power-of-two number of lanes matches interleaving with N levels.
??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
at each step. */
- unsigned group_lanes = DR_GROUP_SIZE (first);
- if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
+ if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
for (slp_tree load : loads)
@@ -4206,7 +4306,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+ && ld_lanes_lanes == 0)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4239,10 +4340,20 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (ld_lanes_lanes != 0)
+ {
+ /* ??? If this is not in sync with what get_load_store_type
+ later decides the SLP representation is not good for other
+ load vectorization methods. */
+ l0->ldst_lanes = true;
+ load->ldst_lanes = true;
+ }
+
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (ld_lanes_lanes != 0
+ || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
break;
/* Try to lower by reducing the group to half its size using an
@@ -4570,6 +4681,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
}
+ /* Check whether we should force some SLP instances to use load/store-lanes
+ and do so by forcing SLP re-discovery with single lanes. We used
+ to cancel SLP when this applied to all instances in a loop but now
+ we decide this per SLP instance. It's important to do this only
+ after SLP pattern recognition. */
+ if (is_a <loop_vec_info> (vinfo))
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+ if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+ && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+ {
+ slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+ int group_size = SLP_TREE_LANES (slp_root);
+ tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+ auto_vec<slp_tree> loads;
+ hash_set<slp_tree> visited;
+ vect_gather_slp_loads (loads, slp_root, visited);
+
+ /* Check whether any load in the SLP instance is possibly
+ permuted. */
+ bool loads_permuted = false;
+ slp_tree load_node;
+ unsigned j;
+ FOR_EACH_VEC_ELT (loads, j, load_node)
+ {
+ if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+ continue;
+ unsigned k;
+ stmt_vec_info load_info;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+ if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+ {
+ loads_permuted = true;
+ break;
+ }
+ }
+
+ /* If the loads and stores can use load/store-lanes force re-discovery
+ with single lanes. */
+ if (loads_permuted
+ && !slp_root->ldst_lanes
+ && vect_store_lanes_supported (vectype, group_size, false)
+ != IFN_LAST)
+ {
+ bool can_use_lanes = true;
+ FOR_EACH_VEC_ELT (loads, j, load_node)
+ if (STMT_VINFO_GROUPED_ACCESS
+ (SLP_TREE_REPRESENTATIVE (load_node)))
+ {
+ stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+ (SLP_TREE_REPRESENTATIVE (load_node));
+ /* Use SLP for strided accesses (or if we can't
+ load-lanes). */
+ if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+ || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+ || vect_load_lanes_supported
+ (STMT_VINFO_VECTYPE (stmt_vinfo),
+ DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+ {
+ can_use_lanes = false;
+ break;
+ }
+ }
+
+ if (can_use_lanes)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP instance %p can use load/store-lanes,"
+ " re-discovering with single-lanes\n",
+ (void *) instance);
+
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+ vect_free_slp_instance (instance);
+ limit = max_tree_size;
+ bool res = vect_analyze_slp_instance (vinfo, bst_map,
+ stmt_info,
+ slp_inst_kind_store,
+ max_tree_size, &limit,
+ true);
+ gcc_assert (res);
+ auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+ LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+ }
+ }
+ }
+
/* When we end up with load permutations that we cannot possibly handle,
like those requiring three vector inputs, lower them using interleaving
like schemes. */
@@ -9877,6 +10076,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
gcc_assert (perm.length () == SLP_TREE_LANES (node));
+ /* Load-lanes permute. This permute only acts as a forwarder to
+ select the correct vector def of the load-lanes load which
+ has the permuted vectors in its vector defs like
+ { v0, w0, r0, v1, w1, r1 ... } for a ld3. */
+ if (node->ldst_lanes)
+ {
+ gcc_assert (children.length () == 1);
+ if (!gsi)
+ /* This is a trivial op always supported. */
+ return 1;
+ slp_tree child = children[0];
+ unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+ / SLP_TREE_LANES (node));
+ unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
+ for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+ {
+ tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
+ node->push_vec_def (def);
+ }
+ return 1;
+ }
+
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 72a29c0..d2282c0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1509,7 +1509,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
unsigned int nvectors;
if (slp_node)
- nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ /* ??? Incorrect for multi-lane lanes. */
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
else
nvectors = vect_get_num_copies (loop_vinfo, vectype);
@@ -1795,7 +1796,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
elements with a known constant step. Return -1 if that step
is negative, 0 if it is zero, and 1 if it is greater than zero. */
-static int
+int
compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
{
dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
@@ -2070,6 +2071,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
is irrelevant for them. */
*alignment_support_scheme = dr_unaligned_supported;
}
+ /* Try using LOAD/STORE_LANES. */
+ else if (slp_node->ldst_lanes
+ && (*lanes_ifn
+ = (vls_type == VLS_LOAD
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p))) != IFN_LAST)
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
else
*memory_access_type = VMAT_CONTIGUOUS;
@@ -8201,6 +8210,16 @@ vectorizable_store (vec_info *vinfo,
&lanes_ifn))
return false;
+ if (slp_node
+ && slp_node->ldst_lanes
+ && memory_access_type != VMAT_LOAD_STORE_LANES)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "discovered store-lane but cannot use it.\n");
+ return false;
+ }
+
if (mask)
{
if (memory_access_type == VMAT_CONTIGUOUS)
@@ -8717,7 +8736,7 @@ vectorizable_store (vec_info *vinfo,
else
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -8774,11 +8793,24 @@ vectorizable_store (vec_info *vinfo,
if (memory_access_type == VMAT_LOAD_STORE_LANES)
{
- gcc_assert (!slp && grouped_store);
+ if (costing_p && slp_node)
+ /* Update all incoming store operand nodes, the general handling
+ above only handles the mask and the first store operand node. */
+ for (slp_tree child : SLP_TREE_CHILDREN (slp_node))
+ if (child != mask_node
+ && !vect_maybe_update_slp_op_vectype (child, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
unsigned inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector stores, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_stores = 0;
+ if (slp)
+ ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
for (j = 0; j < ncopies; j++)
{
gimple *new_stmt;
@@ -8796,7 +8828,7 @@ vectorizable_store (vec_info *vinfo,
op = vect_get_store_rhs (next_stmt_info);
if (costing_p)
update_prologue_cost (&prologue_cost, op);
- else
+ else if (!slp)
{
vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
ncopies, op,
@@ -8811,15 +8843,15 @@ vectorizable_store (vec_info *vinfo,
{
if (mask)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
- mask, &vec_masks,
- mask_vectype);
+ if (slp_node)
+ vect_get_slp_defs (mask_node, &vec_masks);
+ else
+ vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+ mask, &vec_masks,
+ mask_vectype);
vec_mask = vec_masks[0];
}
- /* We should have catched mismatched types earlier. */
- gcc_assert (
- useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info,
aggr_type, NULL, offset, &dummy,
@@ -8831,10 +8863,16 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
/* DR_CHAIN is then used as an input to
vect_permute_store_chain(). */
- for (i = 0; i < group_size; i++)
+ if (!slp)
{
- vec_oprnd = (*gvec_oprnds[i])[j];
- dr_chain[i] = vec_oprnd;
+ /* We should have caught mismatched types earlier. */
+ gcc_assert (
+ useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
+ for (i = 0; i < group_size; i++)
+ {
+ vec_oprnd = (*gvec_oprnds[i])[j];
+ dr_chain[i] = vec_oprnd;
+ }
}
if (mask)
vec_mask = vec_masks[j];
@@ -8844,12 +8882,12 @@ vectorizable_store (vec_info *vinfo,
if (costing_p)
{
- n_adjacent_stores += vec_num;
+ n_adjacent_stores += group_size;
continue;
}
/* Get an array into which we can store the individual vectors. */
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
/* Invalidate the current contents of VEC_ARRAY. This should
become an RTL clobber too, which prevents the vector registers
@@ -8857,9 +8895,19 @@ vectorizable_store (vec_info *vinfo,
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
/* Store the individual vectors into the array. */
- for (i = 0; i < vec_num; i++)
+ for (i = 0; i < group_size; i++)
{
- vec_oprnd = dr_chain[i];
+ if (slp)
+ {
+ slp_tree child;
+ if (i == 0 || !mask_node)
+ child = SLP_TREE_CHILDREN (slp_node)[i];
+ else
+ child = SLP_TREE_CHILDREN (slp_node)[i + 1];
+ vec_oprnd = SLP_TREE_VEC_DEFS (child)[j];
+ }
+ else
+ vec_oprnd = dr_chain[i];
write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
i);
}
@@ -8929,9 +8977,10 @@ vectorizable_store (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- if (j == 0)
+ if (j == 0 && !slp)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (!slp)
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (costing_p)
@@ -10035,6 +10084,16 @@ vectorizable_load (vec_info *vinfo,
&lanes_ifn))
return false;
+ if (slp_node
+ && slp_node->ldst_lanes
+ && memory_access_type != VMAT_LOAD_STORE_LANES)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "discovered load-lane but cannot use it.\n");
+ return false;
+ }
+
if (mask)
{
if (memory_access_type == VMAT_CONTIGUOUS)
@@ -10753,7 +10812,7 @@ vectorizable_load (vec_info *vinfo,
else
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -10777,12 +10836,13 @@ vectorizable_load (vec_info *vinfo,
{
gcc_assert (alignment_support_scheme == dr_aligned
|| alignment_support_scheme == dr_unaligned_supported);
- gcc_assert (grouped_load && !slp);
unsigned int inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector loads, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_loads = 0;
+ if (slp_node)
+ ncopies = slp_node->vec_stmts_size / group_size;
for (j = 0; j < ncopies; j++)
{
if (costing_p)
@@ -10833,7 +10893,7 @@ vectorizable_load (vec_info *vinfo,
if (mask)
vec_mask = vec_masks[j];
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
tree final_mask = NULL_TREE;
tree final_len = NULL_TREE;
@@ -10896,24 +10956,31 @@ vectorizable_load (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
- dr_chain.create (vec_num);
+ if (!slp)
+ dr_chain.create (group_size);
/* Extract each vector into an SSA_NAME. */
- for (i = 0; i < vec_num; i++)
+ for (unsigned i = 0; i < group_size; i++)
{
new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
vec_array, i);
- dr_chain.quick_push (new_temp);
+ if (slp)
+ slp_node->push_vec_def (new_temp);
+ else
+ dr_chain.quick_push (new_temp);
}
- /* Record the mapping between SSA_NAMEs and statements. */
- vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+ if (!slp)
+ /* Record the mapping between SSA_NAMEs and statements. */
+ vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- dr_chain.release ();
+ if (!slp)
+ dr_chain.release ();
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (costing_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index df6c8ad..699ae9e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -222,6 +222,9 @@ struct _slp_tree {
unsigned int lanes;
/* The operation of this node. */
enum tree_code code;
+ /* Whether uses of this load or feeders of this store are suitable
+ for load/store-lanes. */
+ bool ldst_lanes;
int vertex;
@@ -2313,6 +2316,7 @@ extern bool supportable_indirect_convert_operation (code_helper,
tree, tree,
vec<std::pair<tree, tree_code> > *,
tree = NULL_TREE);
+extern int compare_step_with_zero (vec_info *, stmt_vec_info);
extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
enum vect_cost_for_stmt, stmt_vec_info,