aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc')
-rw-r--r--gcc/doc/invoke.texi4
-rw-r--r--gcc/params.opt4
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c13
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c34
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c8
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c13
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c13
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c27
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c13
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c13
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c17
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c36
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-11b.c2
-rw-r--r--gcc/testsuite/lib/target-supports.exp1
-rw-r--r--gcc/tree-vect-slp.cc2137
-rw-r--r--gcc/tree-vectorizer.h2
23 files changed, 1975 insertions, 404 deletions
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 6131bfa..e5eb525 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14619,6 +14619,10 @@ Complex expressions slow the analyzer.
Maximum number of arguments in a PHI supported by TREE if conversion
unless the loop is marked with simd pragma.
+@item vect-max-layout-candidates
+The maximum number of possible vector layouts (such as permutations)
+to consider when optimizing to-be-vectorized code.
+
@item vect-max-version-for-alignment-checks
The maximum number of run-time checks that can be performed when
doing loop versioning for alignment in the vectorizer.
diff --git a/gcc/params.opt b/gcc/params.opt
index 201b5c9..3001566 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1137,6 +1137,10 @@ Whether to use canonical types.
Common Joined UInteger Var(param_vect_epilogues_nomask) Init(1) IntegerRange(0, 1) Param Optimization
Enable loop epilogue vectorization using smaller vector size.
+-param=vect-max-layout-candidates=
+Common Joined UInteger Var(param_vect_max_layout_candidates) Init(32) Param Optimization
+Maximum number of possible vector layouts (such as permutations) to consider when optimizing to-be-vectorized code.
+
-param=vect-max-peeling-for-alignment=
Common Joined UInteger Var(param_vect_max_peeling_for_alignment) Init(-1) IntegerRange(0, 64) Param Optimization
Maximum number of loop peels to enhance alignment of data references in a loop.
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c
new file mode 100644
index 0000000..c1d4ba3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+ a[0] = (b[1] << c[3]) - d[1];
+ a[1] = (b[0] << c[2]) - d[0];
+ a[2] = (b[3] << c[1]) - d[3];
+ a[3] = (b[2] << c[0]) - d[2];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c
new file mode 100644
index 0000000..e2c86f3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-9.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c
new file mode 100644
index 0000000..d9b5349
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[4], c[400], d[400];
+
+void f1()
+{
+ int a0 = a[0] - b[0];
+ int a1 = a[1] + b[1];
+ int a2 = a[2] - b[2];
+ int a3 = a[3] + b[3];
+ int b0 = a0;
+ int b1 = a1;
+ int b2 = a2;
+ int b3 = a3;
+ for (int i = 0; i < 100; ++i)
+ {
+ a0 += c[i * 4 + 1];
+ a1 += c[i * 4 + 0];
+ a2 += c[i * 4 + 3];
+ a3 += c[i * 4 + 2];
+ b0 ^= d[i * 4 + 3];
+ b1 ^= d[i * 4 + 2];
+ b2 ^= d[i * 4 + 1];
+ b3 ^= d[i * 4 + 0];
+ }
+ a[0] = a0 ^ b0;
+ a[1] = a1 ^ b1;
+ a[2] = a2 ^ b2;
+ a[3] = a3 ^ b3;
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 4 "slp1" { target { vect_int && vect_perm } } } } */
+/* { dg-final { scan-tree-dump "duplicating permutation node" "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c
new file mode 100644
index 0000000..3bf48af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-11.c"
+
+/* It would be better to keep the original three permutations. */
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } xfail { *-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "duplicating permutation node" "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } xfail { *-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c
new file mode 100644
index 0000000..9669ade
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+ a[0] = (b[1] << c[3]) - (d[1] >> c[3]);
+ a[1] = (b[0] << c[2]) - (d[0] >> c[2]);
+ a[2] = (b[3] << c[1]) - (d[3] >> c[1]);
+ a[3] = (b[2] << c[0]) - (d[2] >> c[0]);
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c
new file mode 100644
index 0000000..159bb15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-13.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c
new file mode 100644
index 0000000..d87fc1e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+ a[0] = (b[3] << c[3]) - d[0];
+ a[1] = (b[2] << c[2]) - d[2];
+ a[2] = (b[1] << c[1]) - d[4];
+ a[3] = (b[0] << c[0]) - d[6];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c
new file mode 100644
index 0000000..559583a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-15.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c
new file mode 100644
index 0000000..f64a2d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+
+int a[8], b[8];
+
+int f1()
+{
+ a[0] = b[4] + 1;
+ a[1] = b[5] + 1;
+ a[2] = b[6] + 1;
+ a[3] = b[7] + 1;
+ a[4] = b[0] + 1;
+ a[5] = b[1] + 1;
+ a[6] = b[2] + 1;
+ a[7] = b[3] + 1;
+}
+
+unsigned short c[2], d[2];
+void f2() {
+ c[0] += d[1];
+ c[1] += d[0];
+}
+
+typedef int v4si __attribute__((vector_size(16)));
+void f3(v4si x) {
+ a[0] = b[1] + x[1];
+ a[1] = b[0] + x[3];
+}
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c
new file mode 100644
index 0000000..f12290b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-1.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c
new file mode 100644
index 0000000..82c2720
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+ a[0] = (b[3] << c[3]) - d[3];
+ a[1] = (b[2] << c[2]) - d[2];
+ a[2] = (b[1] << c[1]) - d[1];
+ a[3] = (b[0] << c[0]) - d[0];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c
new file mode 100644
index 0000000..45bd6c8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-3.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c
new file mode 100644
index 0000000..b59a159
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4];
+
+void f1()
+{
+ a[0] = b[3] - c[3];
+ a[1] = b[2] + c[2];
+ a[2] = b[1] - c[1];
+ a[3] = b[0] + c[0];
+}
+
+/* { dg-final { scan-tree-dump "absorbing input layouts" "slp2" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c
new file mode 100644
index 0000000..06f5308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-5.c"
+
+/* { dg-final { scan-tree-dump "absorbing input layouts" "slp2" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c
new file mode 100644
index 0000000..7a20c60
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[400];
+
+void f1()
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[0] += b[i * 4 + 3];
+ a[1] += b[i * 4 + 2];
+ a[2] += b[i * 4 + 1];
+ a[3] += b[i * 4 + 0];
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c
new file mode 100644
index 0000000..ef2f0c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-7.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c
new file mode 100644
index 0000000..c841862
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[400], c[400], d[40000];
+
+void f1()
+{
+ int a0 = a[0];
+ int a1 = a[1];
+ int a2 = a[2];
+ int a3 = a[3];
+ for (int i = 0; i < 100; ++i)
+ {
+ a0 ^= c[i * 4 + 0];
+ a1 ^= c[i * 4 + 1];
+ a2 ^= c[i * 4 + 2];
+ a3 ^= c[i * 4 + 3];
+ for (int j = 0; j < 100; ++j)
+ {
+ a0 += d[i * 400 + j * 4 + 1];
+ a1 += d[i * 400 + j * 4 + 0];
+ a2 += d[i * 400 + j * 4 + 3];
+ a3 += d[i * 400 + j * 4 + 2];
+ }
+ b[i * 4 + 0] = a0;
+ b[i * 4 + 1] = a1;
+ b[i * 4 + 2] = a2;
+ b[i * 4 + 3] = a3;
+ }
+ a[0] = a0;
+ a[1] = a1;
+ a[2] = a2;
+ a[3] = a3;
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index c4d9ab0..d0b972f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -44,4 +44,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_strided4 || vect_perm } && vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } xfail vect_load_lanes } } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 2f243cd..3c1913b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6814,6 +6814,7 @@ proc check_effective_target_vect_var_shift { } {
return [check_cached_effective_target_indexed vect_var_shift {
expr {(([istarget i?86-*-*] || [istarget x86_64-*-*])
&& [check_avx2_available])
+ || [istarget aarch64*-*-*]
}}]
}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 64b3379..ab4c6fa 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
+#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
@@ -49,7 +50,20 @@ along with GCC; see the file COPYING3. If not see
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
-
+#include "sreal.h"
+#include "predict.h"
+
+static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
+ load_permutation_t &,
+ const vec<tree> &,
+ gimple_stmt_iterator *,
+ poly_uint64, bool, bool,
+ unsigned *,
+ unsigned * = nullptr,
+ bool = false);
+static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
+ slp_tree, lane_permutation_t &,
+ vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
@@ -304,6 +318,16 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
oprnds_info.release ();
}
+/* Return the execution frequency of NODE (so that a higher value indicates
+ a "more important" node when optimizing for speed). */
+
+static sreal
+vect_slp_node_weight (slp_tree node)
+{
+ stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
+ basic_block bb = gimple_bb (stmt_info->stmt);
+ return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
+}
/* Return true if STMTS contains a pattern statement. */
@@ -3531,29 +3555,465 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
return opt_result::success ();
}
-struct slpg_vertex
+/* Estimates the cost of inserting layout changes into the SLP graph.
+ It can also say that the insertion is impossible. */
+
+struct slpg_layout_cost
+{
+ slpg_layout_cost () = default;
+ slpg_layout_cost (sreal, bool);
+
+ static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
+ bool is_possible () const { return depth != sreal::max (); }
+
+ bool operator== (const slpg_layout_cost &) const;
+ bool operator!= (const slpg_layout_cost &) const;
+
+ bool is_better_than (const slpg_layout_cost &, bool) const;
+
+ void add_parallel_cost (const slpg_layout_cost &);
+ void add_serial_cost (const slpg_layout_cost &);
+ void split (unsigned int);
+
+ /* The longest sequence of layout changes needed during any traversal
+ of the partition dag, weighted by execution frequency.
+
+ This is the most important metric when optimizing for speed, since
+ it helps to ensure that we keep the number of operations on
+ critical paths to a minimum. */
+ sreal depth = 0;
+
+ /* An estimate of the total number of operations needed. It is weighted by
+ execution frequency when optimizing for speed but not when optimizing for
+ size. In order to avoid double-counting, a node with a fanout of N will
+ distribute 1/N of its total cost to each successor.
+
+ This is the most important metric when optimizing for size, since
+ it helps to keep the total number of operations to a minimum, */
+ sreal total = 0;
+};
+
+/* Construct costs for a node with weight WEIGHT. A higher weight
+ indicates more frequent execution. IS_FOR_SIZE is true if we are
+ optimizing for size rather than speed. */
+
+slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
+ : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
{
- slpg_vertex (slp_tree node_)
- : node (node_), perm_in (-1), perm_out (-1) {}
+}
- int get_perm_materialized () const
- { return perm_in != perm_out ? perm_in : 0; }
+bool
+slpg_layout_cost::operator== (const slpg_layout_cost &other) const
+{
+ return depth == other.depth && total == other.total;
+}
+bool
+slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
+{
+ return !operator== (other);
+}
+
+/* Return true if these costs are better than OTHER. IS_FOR_SIZE is
+ true if we are optimizing for size rather than speed. */
+
+bool
+slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
+ bool is_for_size) const
+{
+ if (is_for_size)
+ {
+ if (total != other.total)
+ return total < other.total;
+ return depth < other.depth;
+ }
+ else
+ {
+ if (depth != other.depth)
+ return depth < other.depth;
+ return total < other.total;
+ }
+}
+
+/* Increase the costs to account for something with cost INPUT_COST
+ happening in parallel with the current costs. */
+
+void
+slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
+{
+ depth = std::max (depth, input_cost.depth);
+ total += input_cost.total;
+}
+
+/* Increase the costs to account for something with cost INPUT_COST
+ happening in series with the current costs. */
+
+void
+slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
+{
+ depth += other.depth;
+ total += other.total;
+}
+
+/* Split the total cost among TIMES successors or predecessors. */
+
+void
+slpg_layout_cost::split (unsigned int times)
+{
+ if (times > 1)
+ total /= times;
+}
+
+/* Information about one node in the SLP graph, for use during
+ vect_optimize_slp_pass. */
+
+struct slpg_vertex
+{
+ slpg_vertex (slp_tree node_) : node (node_) {}
+
+ /* The node itself. */
slp_tree node;
- /* The common permutation on the incoming lanes (towards SLP children). */
- int perm_in;
- /* The permutation on the outgoing lanes (towards SLP parents). When
- the node is a materialization point for a permute this differs
- from perm_in (and is then usually zero). Materialization happens
- on the input side. */
- int perm_out;
+
+ /* Which partition the node belongs to, or -1 if none. Nodes outside of
+ partitions are flexible; they can have whichever layout consumers
+ want them to have. */
+ int partition = -1;
+
+ /* The number of nodes that directly use the result of this one
+ (i.e. the number of nodes that count this one as a child). */
+ unsigned int out_degree = 0;
+
+ /* The execution frequency of the node. */
+ sreal weight = 0;
+
+ /* The total execution frequency of all nodes that directly use the
+ result of this one. */
+ sreal out_weight = 0;
};
-/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
+/* Information about one partition of the SLP graph, for use during
+ vect_optimize_slp_pass. */
-static void
-vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
- vec<slpg_vertex> &vertices, vec<int> &leafs)
+struct slpg_partition_info
+{
+ /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
+ of m_partitioned_nodes. */
+ unsigned int node_begin = 0;
+ unsigned int node_end = 0;
+
+ /* Which layout we've chosen to use for this partition, or -1 if
+ we haven't picked one yet. */
+ int layout = -1;
+
+ /* The number of predecessors and successors in the partition dag.
+ The predecessors always have lower partition numbers and the
+ successors always have higher partition numbers.
+
+ Note that the directions of these edges are not necessarily the
+ same as in the data flow graph. For example, if an SCC has separate
+ partitions for an inner loop and an outer loop, the inner loop's
+ partition will have at least two incoming edges from the outer loop's
+ partition: one for a live-in value and one for a live-out value.
+ In data flow terms, one of these edges would also be from the outer loop
+ to the inner loop, but the other would be in the opposite direction. */
+ unsigned int in_degree = 0;
+ unsigned int out_degree = 0;
+};
+
+/* Information about the costs of using a particular layout for a
+ particular partition. It can also say that the combination is
+ impossible. */
+
+struct slpg_partition_layout_costs
+{
+ bool is_possible () const { return internal_cost.is_possible (); }
+ void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
+
+ /* The costs inherited from predecessor partitions. */
+ slpg_layout_cost in_cost;
+
+ /* The inherent cost of the layout within the node itself. For example,
+ this is nonzero for a load if choosing a particular layout would require
+ the load to permute the loaded elements. It is nonzero for a
+ VEC_PERM_EXPR if the permutation cannot be eliminated or converted
+ to full-vector moves. */
+ slpg_layout_cost internal_cost;
+
+ /* The costs inherited from successor partitions. */
+ slpg_layout_cost out_cost;
+};
+
+/* This class tries to optimize the layout of vectors in order to avoid
+ unnecessary shuffling. At the moment, the set of possible layouts are
+ restricted to bijective permutations.
+
+ The goal of the pass depends on whether we're optimizing for size or
+ for speed. When optimizing for size, the goal is to reduce the overall
+ number of layout changes (including layout changes implied by things
+ like load permutations). When optimizing for speed, the goal is to
+ reduce the maximum latency attributable to layout changes on any
+ non-cyclical path through the data flow graph.
+
+ For example, when optimizing a loop nest for speed, we will prefer
+ to make layout changes outside of a loop rather than inside of a loop,
+ and will prefer to make layout changes in parallel rather than serially,
+ even if that increases the overall number of layout changes.
+
+ The high-level procedure is:
+
+ (1) Build a graph in which edges go from uses (parents) to definitions
+ (children).
+
+ (2) Divide the graph into a dag of strongly-connected components (SCCs).
+
+ (3) When optimizing for speed, partition the nodes in each SCC based
+ on their containing cfg loop. When optimizing for size, treat
+ each SCC as a single partition.
+
+ This gives us a dag of partitions. The goal is now to assign a
+ layout to each partition.
+
+ (4) Construct a set of vector layouts that are worth considering.
+ Record which nodes must keep their current layout.
+
+ (5) Perform a forward walk over the partition dag (from loads to stores)
+ accumulating the "forward" cost of using each layout. When visiting
+ each partition, assign a tentative choice of layout to the partition
+ and use that choice when calculating the cost of using a different
+ layout in successor partitions.
+
+ (6) Perform a backward walk over the partition dag (from stores to loads),
+ accumulating the "backward" cost of using each layout. When visiting
+ each partition, make a final choice of layout for that partition based
+ on the accumulated forward costs (from (5)) and backward costs
+ (from (6)).
+
+ (7) Apply the chosen layouts to the SLP graph.
+
+ For example, consider the SLP statements:
+
+ S1: a_1 = load
+ loop:
+ S2: a_2 = PHI<a_1, a_3>
+ S3: b_1 = load
+ S4: a_3 = a_2 + b_1
+ exit:
+ S5: a_4 = PHI<a_3>
+ S6: store a_4
+
+ S2 and S4 form an SCC and are part of the same loop. Every other
+ statement is in a singleton SCC. In this example there is a one-to-one
+ mapping between SCCs and partitions and the partition dag looks like this;
+
+ S1 S3
+ \ /
+ S2+S4
+ |
+ S5
+ |
+ S6
+
+ S2, S3 and S4 will have a higher execution frequency than the other
+ statements, so when optimizing for speed, the goal is to avoid any
+ layout changes:
+
+ - within S3
+ - within S2+S4
+ - on the S3->S2+S4 edge
+
+ For example, if S3 was originally a reversing load, the goal of the
+ pass is to make it an unreversed load and change the layout on the
+ S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
+ on S1->S2+S4 and S5->S6 would also be acceptable.)
+
+ The difference between SCCs and partitions becomes important if we
+ add an outer loop:
+
+ S1: a_1 = ...
+ loop1:
+ S2: a_2 = PHI<a_1, a_6>
+ S3: b_1 = load
+ S4: a_3 = a_2 + b_1
+ loop2:
+ S5: a_4 = PHI<a_3, a_5>
+ S6: c_1 = load
+ S7: a_5 = a_4 + c_1
+ exit2:
+ S8: a_6 = PHI<a_5>
+ S9: store a_6
+ exit1:
+
+ Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
+ for speed, we usually do not want restrictions in the outer loop to "infect"
+ the decision for the inner loop. For example, if an outer-loop node
+ in the SCC contains a statement with a fixed layout, that should not
+ prevent the inner loop from using a different layout. Conversely,
+ the inner loop should not dictate a layout to the outer loop: if the
+ outer loop does a lot of computation, then it may not be efficient to
+ do all of that computation in the inner loop's preferred layout.
+
+ So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
+ and S5+S7 (inner). We also try to arrange partitions so that:
+
+ - the partition for an outer loop comes before the partition for
+ an inner loop
+
+ - if a sibling loop A dominates a sibling loop B, A's partition
+ comes before B's
+
+ This gives the following partition dag for the example above:
+
+ S1 S3
+ \ /
+ S2+S4+S8 S6
+ | \\ /
+ | S5+S7
+ |
+ S9
+
+ There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
+ one for a reversal of the edge S7->S8.
+
+ The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
+ for S2+S4+S8 therefore has to balance the cost of using the outer loop's
+ preferred layout against the cost of changing the layout on entry to the
+ inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
+
+ Although this works well when optimizing for speed, it has the downside
+ when optimizing for size that the choice of layout for S5+S7 is completely
+ independent of S9, which lessens the chance of reducing the overall number
+ of permutations. We therefore do not partition SCCs when optimizing
+ for size.
+
+ To give a concrete example of the difference between optimizing
+ for size and speed, consider:
+
+ a[0] = (b[1] << c[3]) - d[1];
+ a[1] = (b[0] << c[2]) - d[0];
+ a[2] = (b[3] << c[1]) - d[3];
+ a[3] = (b[2] << c[0]) - d[2];
+
+ There are three different layouts here: one for a, one for b and d,
+ and one for c. When optimizing for speed it is better to permute each
+ of b, c and d into the order required by a, since those permutations
+ happen in parallel. But when optimizing for size, it is better to:
+
+ - permute c into the same order as b
+ - do the arithmetic
+ - permute the result into the order required by a
+
+ This gives 2 permutations rather than 3. */
+
+class vect_optimize_slp_pass
+{
+public:
+ vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
+ void run ();
+
+private:
+ /* Graph building. */
+ struct loop *containing_loop (slp_tree);
+ bool is_cfg_latch_edge (graph_edge *);
+ void build_vertices (hash_set<slp_tree> &, slp_tree);
+ void build_vertices ();
+ void build_graph ();
+
+ /* Partitioning. */
+ void create_partitions ();
+ template<typename T> void for_each_partition_edge (unsigned int, T);
+
+ /* Layout selection. */
+ bool is_compatible_layout (slp_tree, unsigned int);
+ int change_layout_cost (slp_tree, unsigned int, unsigned int);
+ slpg_partition_layout_costs &partition_layout_costs (unsigned int,
+ unsigned int);
+ void change_vec_perm_layout (slp_tree, lane_permutation_t &,
+ int, unsigned int);
+ int internal_node_cost (slp_tree, int, unsigned int);
+ void start_choosing_layouts ();
+
+ /* Cost propagation. */
+ slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
+ unsigned int, unsigned int);
+ slpg_layout_cost total_in_cost (unsigned int);
+ slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
+ slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
+ void forward_pass ();
+ void backward_pass ();
+
+ /* Rematerialization. */
+ slp_tree get_result_with_layout (slp_tree, unsigned int);
+ void materialize ();
+
+ /* Clean-up. */
+ void remove_redundant_permutations ();
+
+ void dump ();
+
+ vec_info *m_vinfo;
+
+ /* True if we should optimize the graph for size, false if we should
+ optimize it for speed. (It wouldn't be easy to make this decision
+ more locally.) */
+ bool m_optimize_size;
+
+ /* A graph of all SLP nodes, with edges leading from uses to definitions.
+ In other words, a node's predecessors are its slp_tree parents and
+ a node's successors are its slp_tree children. */
+ graph *m_slpg = nullptr;
+
+ /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
+ auto_vec<slpg_vertex> m_vertices;
+
+ /* The list of all leaves of M_SLPG. such as external definitions, constants,
+ and loads. */
+ auto_vec<int> m_leafs;
+
+ /* This array has one entry for every vector layout that we're considering.
+ Element 0 is null and indicates "no change". Other entries describe
+ permutations that are inherent in the current graph and that we would
+ like to reverse if possible.
+
+ For example, a permutation { 1, 2, 3, 0 } means that something has
+ effectively been permuted in that way, such as a load group
+ { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
+ We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
+ in order to put things "back" in order. */
+ auto_vec<vec<unsigned> > m_perms;
+
+ /* A partitioning of the nodes for which a layout must be chosen.
+ Each partition represents an <SCC, cfg loop> pair; that is,
+ nodes in different SCCs belong to different partitions, and nodes
+ within an SCC can be further partitioned according to a containing
+ cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
+
+ - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
+ from leaves (such as loads) to roots (such as stores).
+
+ - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
+ auto_vec<slpg_partition_info> m_partitions;
+
+ /* The list of all nodes for which a layout must be chosen. Nodes for
+ partition P come before the nodes for partition P+1. Nodes within a
+ partition are in reverse postorder. */
+ auto_vec<unsigned int> m_partitioned_nodes;
+
+ /* Index P * num-layouts + L contains the cost of using layout L
+ for partition P. */
+ auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
+
+ /* Index N * num-layouts + L, if nonnull, is a node that provides the
+ original output of node N adjusted to have layout L. */
+ auto_vec<slp_tree> m_node_layouts;
+};
+
+/* Fill the vertices and leafs vector with all nodes in the SLP graph.
+ Also record whether we should optimize anything for speed rather
+ than size. */
+
+void
+vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
+ slp_tree node)
{
unsigned i;
slp_tree child;
@@ -3561,8 +4021,15 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
if (visited.add (node))
return;
- node->vertex = vertices.length ();
- vertices.safe_push (slpg_vertex (node));
+ if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
+ {
+ basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
+ if (optimize_bb_for_speed_p (bb))
+ m_optimize_size = false;
+ }
+
+ node->vertex = m_vertices.length ();
+ m_vertices.safe_push (slpg_vertex (node));
bool leaf = true;
bool force_leaf = false;
@@ -3570,7 +4037,7 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
if (child)
{
leaf = false;
- vect_slp_build_vertices (visited, child, vertices, leafs);
+ build_vertices (visited, child);
}
else
force_leaf = true;
@@ -3580,21 +4047,19 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
and inductions. Force those SLP PHIs to act as leafs to make them
backwards reachable. */
if (leaf || force_leaf)
- leafs.safe_push (node->vertex);
+ m_leafs.safe_push (node->vertex);
}
/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
-static void
-vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
- vec<int> &leafs)
+void
+vect_optimize_slp_pass::build_vertices ()
{
hash_set<slp_tree> visited;
unsigned i;
slp_instance instance;
- FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
- vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
- leafs);
+ FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
+ build_vertices (visited, SLP_INSTANCE_TREE (instance));
}
/* Apply (reverse) bijectite PERM to VEC. */
@@ -3625,75 +4090,457 @@ vect_slp_permute (vec<unsigned> perm,
}
}
-/* Return whether permutations PERM_A and PERM_B as recorded in the
- PERMS vector are equal. */
+/* Return the cfg loop that contains NODE. */
-static bool
-vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
- int perm_a, int perm_b)
+struct loop *
+vect_optimize_slp_pass::containing_loop (slp_tree node)
{
- return (perm_a == perm_b
- || (perm_a != -1 && perm_b != -1
- && perms[perm_a].length () == perms[perm_b].length ()
- && memcmp (&perms[perm_a][0], &perms[perm_b][0],
- sizeof (unsigned) * perms[perm_a].length ()) == 0));
+ stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+ if (!rep)
+ return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
+ return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
}
-/* Optimize the SLP graph of VINFO. */
+/* Return true if UD (an edge from a use to a definition) is associated
+ with a loop latch edge in the cfg. */
-void
-vect_optimize_slp (vec_info *vinfo)
+bool
+vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
{
- if (vinfo->slp_instances.is_empty ())
- return;
+ slp_tree use = m_vertices[ud->src].node;
+ slp_tree def = m_vertices[ud->dest].node;
+ if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
+ || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
+ return false;
- slp_tree node;
- unsigned i;
- auto_vec<slpg_vertex> vertices;
- auto_vec<int> leafs;
- vect_slp_build_vertices (vinfo, vertices, leafs);
+ stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
+ return (is_a<gphi *> (use_rep->stmt)
+ && bb_loop_header_p (gimple_bb (use_rep->stmt))
+ && containing_loop (def) == containing_loop (use));
+}
+
+/* Build the graph. Mark edges that correspond to cfg loop latch edges with
+ a nonnull data field. */
+
+void
+vect_optimize_slp_pass::build_graph ()
+{
+ m_optimize_size = true;
+ build_vertices ();
- struct graph *slpg = new_graph (vertices.length ());
- for (slpg_vertex &v : vertices)
+ m_slpg = new_graph (m_vertices.length ());
+ for (slpg_vertex &v : m_vertices)
for (slp_tree child : SLP_TREE_CHILDREN (v.node))
if (child)
- add_edge (slpg, v.node->vertex, child->vertex);
+ {
+ graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
+ if (is_cfg_latch_edge (ud))
+ ud->data = this;
+ }
+}
+
+/* Return true if E corresponds to a loop latch edge in the cfg. */
- /* Compute (reverse) postorder on the inverted graph. */
- auto_vec<int> ipo;
- graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
+static bool
+skip_cfg_latch_edges (graph_edge *e)
+{
+ return e->data;
+}
- auto_vec<vec<unsigned> > perms;
- perms.safe_push (vNULL); /* zero is no permute */
+/* Create the node partitions. */
- /* Produce initial permutations. */
- for (i = 0; i < leafs.length (); ++i)
+void
+vect_optimize_slp_pass::create_partitions ()
+{
+ /* Calculate a postorder of the graph, ignoring edges that correspond
+ to natural latch edges in the cfg. Reading the vector from the end
+ to the beginning gives the reverse postorder. */
+ auto_vec<int> initial_rpo;
+ graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
+ false, NULL, skip_cfg_latch_edges);
+ gcc_assert (initial_rpo.length () == m_vertices.length ());
+
+ /* Calculate the strongly connected components of the graph. */
+ auto_vec<int> scc_grouping;
+ unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
+
+ /* Create a new index order in which all nodes from the same SCC are
+ consecutive. Use scc_pos to record the index of the first node in
+ each SCC. */
+ auto_vec<unsigned int> scc_pos (num_sccs);
+ int last_component = -1;
+ unsigned int node_count = 0;
+ for (unsigned int node_i : scc_grouping)
+ {
+ if (last_component != m_slpg->vertices[node_i].component)
+ {
+ last_component = m_slpg->vertices[node_i].component;
+ gcc_assert (last_component == int (scc_pos.length ()));
+ scc_pos.quick_push (node_count);
+ }
+ node_count += 1;
+ }
+ gcc_assert (node_count == initial_rpo.length ()
+ && last_component + 1 == int (num_sccs));
+
+ /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
+ inside each SCC following the RPO we calculated above. The fact that
+ we ignored natural latch edges when calculating the RPO should ensure
+ that, for natural loop nests:
+
+ - the first node that we encounter in a cfg loop is the loop header phi
+ - the loop header phis are in dominance order
+
+ Arranging for this is an optimization (see below) rather than a
+ correctness issue. Unnatural loops with a tangled mess of backedges
+ will still work correctly, but might give poorer results.
+
+ Also update scc_pos so that it gives 1 + the index of the last node
+ in the SCC. */
+ m_partitioned_nodes.safe_grow (node_count);
+ for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
+ {
+ unsigned int node_i = initial_rpo[old_i];
+ unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
+ m_partitioned_nodes[new_i] = node_i;
+ }
+
+ /* When optimizing for speed, partition each SCC based on the containing
+ cfg loop. The order we constructed above should ensure that, for natural
+ cfg loops, we'll create sub-SCC partitions for outer loops before
+ the corresponding sub-SCC partitions for inner loops. Similarly,
+ when one sibling loop A dominates another sibling loop B, we should
+ create a sub-SCC partition for A before a sub-SCC partition for B.
+
+ As above, nothing depends for correctness on whether this achieves
+ a natural nesting, but we should get better results when it does. */
+ m_partitions.reserve (m_vertices.length ());
+ unsigned int next_partition_i = 0;
+ hash_map<struct loop *, int> loop_partitions;
+ unsigned int rpo_begin = 0;
+ unsigned int num_partitioned_nodes = 0;
+ for (unsigned int rpo_end : scc_pos)
+ {
+ loop_partitions.empty ();
+ unsigned int partition_i = next_partition_i;
+ for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
+ {
+ /* Handle externals and constants optimistically throughout.
+ But treat existing vectors as fixed since we do not handle
+ permuting them. */
+ unsigned int node_i = m_partitioned_nodes[rpo_i];
+ auto &vertex = m_vertices[node_i];
+ if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
+ && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
+ || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
+ vertex.partition = -1;
+ else
+ {
+ bool existed;
+ if (m_optimize_size)
+ existed = next_partition_i > partition_i;
+ else
+ {
+ struct loop *loop = containing_loop (vertex.node);
+ auto &entry = loop_partitions.get_or_insert (loop, &existed);
+ if (!existed)
+ entry = next_partition_i;
+ partition_i = entry;
+ }
+ if (!existed)
+ {
+ m_partitions.quick_push (slpg_partition_info ());
+ next_partition_i += 1;
+ }
+ vertex.partition = partition_i;
+ num_partitioned_nodes += 1;
+ m_partitions[partition_i].node_end += 1;
+ }
+ }
+ rpo_begin = rpo_end;
+ }
+
+ /* Assign ranges of consecutive node indices to each partition,
+ in partition order. Start with node_end being the same as
+ node_begin so that the next loop can use it as a counter. */
+ unsigned int node_begin = 0;
+ for (auto &partition : m_partitions)
{
- int idx = leafs[i];
- slp_tree node = vertices[idx].node;
+ partition.node_begin = node_begin;
+ node_begin += partition.node_end;
+ partition.node_end = partition.node_begin;
+ }
+ gcc_assert (node_begin == num_partitioned_nodes);
- /* Handle externals and constants optimistically throughout the
- iteration. But treat existing vectors as fixed since we
- do not handle permuting them below. */
- if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
- && !SLP_TREE_VEC_DEFS (node).exists ())
- || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
- continue;
+ /* Finally build the list of nodes in partition order. */
+ m_partitioned_nodes.truncate (num_partitioned_nodes);
+ for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
+ {
+ int partition_i = m_vertices[node_i].partition;
+ if (partition_i >= 0)
+ {
+ unsigned int order_i = m_partitions[partition_i].node_end++;
+ m_partitioned_nodes[order_i] = node_i;
+ }
+ }
+}
+
+/* Look for edges from earlier partitions into node NODE_I and edges from
+ node NODE_I into later partitions. Call:
+
+ FN (ud, other_node_i)
+
+ for each such use-to-def edge ud, where other_node_i is the node at the
+ other end of the edge. */
+
+template<typename T>
+void
+vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
+{
+ int partition_i = m_vertices[node_i].partition;
+ for (graph_edge *pred = m_slpg->vertices[node_i].pred;
+ pred; pred = pred->pred_next)
+ {
+ int src_partition_i = m_vertices[pred->src].partition;
+ if (src_partition_i >= 0 && src_partition_i != partition_i)
+ fn (pred, pred->src);
+ }
+ for (graph_edge *succ = m_slpg->vertices[node_i].succ;
+ succ; succ = succ->succ_next)
+ {
+ int dest_partition_i = m_vertices[succ->dest].partition;
+ if (dest_partition_i >= 0 && dest_partition_i != partition_i)
+ fn (succ, succ->dest);
+ }
+}
+
+/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
+ that NODE would operate on. This test is independent of NODE's actual
+ operation. */
+
+bool
+vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
+ unsigned int layout_i)
+{
+ if (layout_i == 0)
+ return true;
+
+ /* SLP_TREE_LANES is zero for existing vectors, but those only support
+ layout 0 anyway. */
+ if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
+ return false;
+
+ return true;
+}
+
+/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
+ to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
+ layouts is incompatible with NODE or if the change is not possible for
+ some other reason.
+
+ The properties taken from NODE include the number of lanes and the
+ vector type. The actual operation doesn't matter. */
+
+int
+vect_optimize_slp_pass::change_layout_cost (slp_tree node,
+ unsigned int from_layout_i,
+ unsigned int to_layout_i)
+{
+ if (!is_compatible_layout (node, from_layout_i)
+ || !is_compatible_layout (node, to_layout_i))
+ return -1;
+
+ if (from_layout_i == to_layout_i)
+ return 0;
+
+ auto_vec<slp_tree, 1> children (1);
+ children.quick_push (node);
+ auto_lane_permutation_t perm (SLP_TREE_LANES (node));
+ if (from_layout_i > 0)
+ for (unsigned int i : m_perms[from_layout_i])
+ perm.quick_push ({ 0, i });
+ else
+ for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
+ perm.quick_push ({ 0, i });
+ if (to_layout_i > 0)
+ vect_slp_permute (m_perms[to_layout_i], perm, true);
+ auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
+ children, false);
+ if (count >= 0)
+ return MAX (count, 1);
+
+ /* ??? In principle we could try changing via layout 0, giving two
+ layout changes rather than 1. Doing that would require
+ corresponding support in get_result_with_layout. */
+ return -1;
+}
+
+/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
+
+inline slpg_partition_layout_costs &
+vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
+ unsigned int layout_i)
+{
+ return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
+}
+
+/* Change PERM in one of two ways:
+
+ - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
+ chosen for child I of NODE.
+
+ - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
+
+ In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
+
+void
+vect_optimize_slp_pass::
+change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
+ int in_layout_i, unsigned int out_layout_i)
+{
+ for (auto &entry : perm)
+ {
+ int this_in_layout_i = in_layout_i;
+ if (this_in_layout_i < 0)
+ {
+ slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
+ unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
+ this_in_layout_i = m_partitions[in_partition_i].layout;
+ }
+ if (this_in_layout_i > 0)
+ entry.second = m_perms[this_in_layout_i][entry.second];
+ }
+ if (out_layout_i > 0)
+ vect_slp_permute (m_perms[out_layout_i], perm, true);
+}
- /* Leafs do not change across iterations. Note leafs also double
- as entries to the reverse graph. */
- if (!slpg->vertices[idx].succ)
+/* Check whether the target allows NODE to be rearranged so that the node's
+ output has layout OUT_LAYOUT_I. Return the cost of the change if so,
+ in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
+
+ If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
+ NODE can adapt to the layout changes that have (perhaps provisionally)
+ been chosen for NODE's children, so that no extra permutations are
+ needed on either the input or the output of NODE.
+
+ If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
+ that all inputs will be forced into layout IN_LAYOUT_I beforehand.
+
+ IN_LAYOUT_I has no meaning for other types of node.
+
+ Keeping the node as-is is always valid. If the target doesn't appear to
+ support the node as-is then layout 0 has a high and arbitrary cost instead
+ of being invalid. On the one hand, this ensures that every node has at
+ least one valid layout, avoiding what would otherwise be an awkward
+ special case. On the other, it still encourages the pass to change
+ an invalid pre-existing layout choice into a valid one. */
+
+int
+vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
+ unsigned int out_layout_i)
+{
+ const int fallback_cost = 100;
+
+ if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ {
+ auto_lane_permutation_t tmp_perm;
+ tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
+
+ /* Check that the child nodes support the chosen layout. Checking
+ the first child is enough, since any second child would have the
+ same shape. */
+ if (in_layout_i > 0
+ && !is_compatible_layout (SLP_TREE_CHILDREN (node)[0], in_layout_i))
+ return -1;
+
+ change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
+ int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
+ node, tmp_perm,
+ SLP_TREE_CHILDREN (node),
+ false);
+ if (count < 0)
{
- vertices[idx].perm_in = 0;
- vertices[idx].perm_out = 0;
+ if (in_layout_i == 0 && out_layout_i == 0)
+ return fallback_cost;
+ return -1;
}
+ /* We currently have no way of telling whether the new layout is cheaper
+ or more expensive than the old one. But at least in principle,
+ it should be worth making zero permutations (whole-vector shuffles)
+ cheaper than real permutations, in case the pass is able to remove
+ the latter. */
+ return count == 0 ? 0 : 1;
+ }
+
+ stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+ if (rep
+ && STMT_VINFO_DATA_REF (rep)
+ && DR_IS_READ (STMT_VINFO_DATA_REF (rep)))
+ {
+ auto_load_permutation_t tmp_perm;
+ tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
+ if (out_layout_i > 0)
+ vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
+
+ poly_uint64 vf = 1;
+ if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
+ vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned int n_perms;
+ if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
+ nullptr, vf, true, false, &n_perms))
+ {
+ if (out_layout_i == 0)
+ return fallback_cost;
+ return -1;
+ }
+
+ /* See the comment above the corresponding VEC_PERM_EXPR handling. */
+ return n_perms == 0 ? 0 : 1;
+ }
+
+ return 0;
+}
+
+/* Decide which element layouts we should consider using. Calculate the
+ weights associated with inserting layout changes on partition edges.
+ Also mark partitions that cannot change layout, by setting their
+ layout to zero. */
+
+void
+vect_optimize_slp_pass::start_choosing_layouts ()
+{
+ /* Used to assign unique permutation indices. */
+ using perm_hash = unbounded_hashmap_traits<
+ vec_free_hash_base<int_hash_base<unsigned>>,
+ int_hash<int, -1, -2>
+ >;
+ hash_map<vec<unsigned>, int, perm_hash> layout_ids;
+
+ /* Layout 0 is "no change". */
+ m_perms.safe_push (vNULL);
+
+ /* Create layouts from existing permutations. */
+ for (unsigned int node_i : m_leafs)
+ {
+ auto &vertex = m_vertices[node_i];
+ if (vertex.partition < 0)
+ continue;
+
+ /* Leafs also double as entries to the reverse graph. Allow the
+ layout of those to be changed. */
+ auto &partition = m_partitions[vertex.partition];
+ if (!m_slpg->vertices[node_i].succ)
+ partition.layout = 0;
+
/* Loads are the only thing generating permutes. */
+ slp_tree node = vertex.node;
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
continue;
- /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
- node unpermuted, record this permute. */
+ /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
+ unpermuted, record a layout that reverses this permutation. */
+ gcc_assert (partition.layout == 0);
stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
continue;
@@ -3708,13 +4555,18 @@ vect_optimize_slp (vec_info *vinfo)
if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
any_permute = true;
}
- /* If there's no permute no need to split one out. */
- if (!any_permute)
- continue;
/* If the span doesn't match we'd disrupt VF computation, avoid
that for now. */
if (imax - imin + 1 != SLP_TREE_LANES (node))
continue;
+ /* If there's no permute no need to split one out. In this case
+ we can consider turning the load into a permuted load, if that
+ turns out to be cheaper than alternatives. */
+ if (!any_permute)
+ {
+ partition.layout = -1;
+ continue;
+ }
/* For now only handle true permutes, like
vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
@@ -3735,407 +4587,742 @@ vect_optimize_slp (vec_info *vinfo)
perm.safe_grow (SLP_TREE_LANES (node), true);
for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
- perms.safe_push (perm);
- vertices[idx].perm_in = perms.length () - 1;
- vertices[idx].perm_out = perms.length () - 1;
+
+ if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
+ {
+ /* Continue to use existing layouts, but don't add any more. */
+ int *entry = layout_ids.get (perm);
+ partition.layout = entry ? *entry : 0;
+ perm.release ();
+ }
+ else
+ {
+ bool existed;
+ int &layout_i = layout_ids.get_or_insert (perm, &existed);
+ if (existed)
+ perm.release ();
+ else
+ {
+ layout_i = m_perms.length ();
+ m_perms.safe_push (perm);
+ }
+ partition.layout = layout_i;
+ }
}
- /* In addition to the above we have to mark outgoing permutes facing
- non-reduction graph entries that are not represented as to be
- materialized. */
- for (slp_instance instance : vinfo->slp_instances)
+ /* Initially assume that every layout is possible and has zero cost
+ in every partition. */
+ m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
+ * m_perms.length ());
+
+ /* We have to mark outgoing permutations facing non-reduction graph
+ entries that are not represented as to be materialized. */
+ for (slp_instance instance : m_vinfo->slp_instances)
if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
{
- /* Just setting perm_out isn't enough for the propagation to
- pick this up. */
- vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
- vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
+ unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
+ m_partitions[m_vertices[node_i].partition].layout = 0;
}
- /* Propagate permutes along the graph and compute materialization points. */
- bool changed;
- bool do_materialization = false;
- unsigned iteration = 0;
- do
+ /* Check which layouts each node and partition can handle. Calculate the
+ weights associated with inserting layout changes on edges. */
+ for (unsigned int node_i : m_partitioned_nodes)
{
- changed = false;
- ++iteration;
+ auto &vertex = m_vertices[node_i];
+ auto &partition = m_partitions[vertex.partition];
+ slp_tree node = vertex.node;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "SLP optimize iteration %d\n", iteration);
+ if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
+ {
+ vertex.weight = vect_slp_node_weight (node);
+
+ /* We do not handle stores with a permutation, so all
+ incoming permutations must have been materialized. */
+ if (STMT_VINFO_DATA_REF (rep)
+ && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
+ /* ??? We're forcing materialization in place
+ of the child here, we'd need special handling
+ in materialization to leave layout -1 here. */
+ partition.layout = 0;
+
+ /* We cannot change the layout of an operation that is
+ not independent on lanes. Note this is an explicit
+ negative list since that's much shorter than the respective
+ positive one but it's critical to keep maintaining it. */
+ if (is_gimple_call (STMT_VINFO_STMT (rep)))
+ switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
+ {
+ case CFN_COMPLEX_ADD_ROT90:
+ case CFN_COMPLEX_ADD_ROT270:
+ case CFN_COMPLEX_MUL:
+ case CFN_COMPLEX_MUL_CONJ:
+ case CFN_VEC_ADDSUB:
+ case CFN_VEC_FMADDSUB:
+ case CFN_VEC_FMSUBADD:
+ partition.layout = 0;
+ default:;
+ }
+ }
- for (i = vertices.length (); i > 0 ; --i)
+ auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
{
- int idx = ipo[i-1];
- slp_tree node = vertices[idx].node;
+ auto &other_vertex = m_vertices[other_node_i];
- /* Handle externals and constants optimistically throughout the
- iteration. */
- if (SLP_TREE_DEF_TYPE (node) == vect_external_def
- || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
+ /* Count the number of edges from earlier partitions and the number
+ of edges to later partitions. */
+ if (other_vertex.partition < vertex.partition)
+ partition.in_degree += 1;
+ else
+ partition.out_degree += 1;
+
+ /* If the current node uses the result of OTHER_NODE_I, accumulate
+ the effects of that. */
+ if (ud->src == int (node_i))
+ {
+ other_vertex.out_weight += vertex.weight;
+ other_vertex.out_degree += 1;
+ }
+ };
+ for_each_partition_edge (node_i, process_edge);
+ }
+}
+
+/* Return the incoming costs for node NODE_I, assuming that each input keeps
+ its current (provisional) choice of layout. The inputs do not necessarily
+ have the same layout as each other. */
+
+slpg_layout_cost
+vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
+{
+ auto &vertex = m_vertices[node_i];
+ slpg_layout_cost cost;
+ auto add_cost = [&](graph_edge *, unsigned int other_node_i)
+ {
+ auto &other_vertex = m_vertices[other_node_i];
+ if (other_vertex.partition < vertex.partition)
+ {
+ auto &other_partition = m_partitions[other_vertex.partition];
+ auto &other_costs = partition_layout_costs (other_vertex.partition,
+ other_partition.layout);
+ slpg_layout_cost this_cost = other_costs.in_cost;
+ this_cost.add_serial_cost (other_costs.internal_cost);
+ this_cost.split (other_partition.out_degree);
+ cost.add_parallel_cost (this_cost);
+ }
+ };
+ for_each_partition_edge (node_i, add_cost);
+ return cost;
+}
+
+/* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
+ and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
+ slpg_layout_cost::impossible () if the change isn't possible. */
+
+slpg_layout_cost
+vect_optimize_slp_pass::
+edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
+ unsigned int layout2_i)
+{
+ auto &def_vertex = m_vertices[ud->dest];
+ auto &use_vertex = m_vertices[ud->src];
+ auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
+ auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
+ auto factor = change_layout_cost (def_vertex.node, def_layout_i,
+ use_layout_i);
+ if (factor < 0)
+ return slpg_layout_cost::impossible ();
+
+ /* We have a choice of putting the layout change at the site of the
+ definition or at the site of the use. Prefer the former when
+ optimizing for size or when the execution frequency of the
+ definition is no greater than the combined execution frequencies of
+ the uses. When putting the layout change at the site of the definition,
+ divvy up the cost among all consumers. */
+ if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
+ {
+ slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
+ cost.split (def_vertex.out_degree);
+ return cost;
+ }
+ return { use_vertex.weight * factor, m_optimize_size };
+}
+
+/* UD represents a use-def link between FROM_NODE_I and a node in a later
+ partition; FROM_NODE_I could be the definition node or the use node.
+ The node at the other end of the link wants to use layout TO_LAYOUT_I.
+ Return the cost of any necessary fix-ups on edge UD, or return
+ slpg_layout_cost::impossible () if the change isn't possible.
+
+ At this point, FROM_NODE_I's partition has chosen the cheapest
+ layout based on the information available so far, but this choice
+ is only provisional. */
+
+slpg_layout_cost
+vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
+ unsigned int to_layout_i)
+{
+ auto &from_vertex = m_vertices[from_node_i];
+ unsigned int from_partition_i = from_vertex.partition;
+ slpg_partition_info &from_partition = m_partitions[from_partition_i];
+ gcc_assert (from_partition.layout >= 0);
+
+ /* First calculate the cost on the assumption that FROM_PARTITION sticks
+ with its current layout preference. */
+ slpg_layout_cost cost = slpg_layout_cost::impossible ();
+ auto edge_cost = edge_layout_cost (ud, from_node_i,
+ from_partition.layout, to_layout_i);
+ if (edge_cost.is_possible ())
+ {
+ auto &from_costs = partition_layout_costs (from_partition_i,
+ from_partition.layout);
+ cost = from_costs.in_cost;
+ cost.add_serial_cost (from_costs.internal_cost);
+ cost.split (from_partition.out_degree);
+ cost.add_serial_cost (edge_cost);
+ }
+
+ /* Take the minimum of that cost and the cost that applies if
+ FROM_PARTITION instead switches to TO_LAYOUT_I. */
+ auto &direct_layout_costs = partition_layout_costs (from_partition_i,
+ to_layout_i);
+ if (direct_layout_costs.is_possible ())
+ {
+ slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
+ direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
+ direct_cost.split (from_partition.out_degree);
+ if (!cost.is_possible ()
+ || direct_cost.is_better_than (cost, m_optimize_size))
+ cost = direct_cost;
+ }
+
+ return cost;
+}
+
+/* UD represents a use-def link between TO_NODE_I and a node in an earlier
+ partition; TO_NODE_I could be the definition node or the use node.
+ The node at the other end of the link wants to use layout FROM_LAYOUT_I;
+ return the cost of any necessary fix-ups on edge UD, or
+ slpg_layout_cost::impossible () if the choice cannot be made.
+
+ At this point, TO_NODE_I's partition has a fixed choice of layout. */
+
+slpg_layout_cost
+vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
+ unsigned int from_layout_i)
+{
+ auto &to_vertex = m_vertices[to_node_i];
+ unsigned int to_partition_i = to_vertex.partition;
+ slpg_partition_info &to_partition = m_partitions[to_partition_i];
+ gcc_assert (to_partition.layout >= 0);
+
+ /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
+ adjusted for this input having layout FROM_LAYOUT_I. Assume that
+ any other inputs keep their current choice of layout. */
+ auto &to_costs = partition_layout_costs (to_partition_i,
+ to_partition.layout);
+ if (ud->src == int (to_node_i)
+ && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
+ {
+ auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
+ auto old_layout = from_partition.layout;
+ from_partition.layout = from_layout_i;
+ int factor = internal_node_cost (to_vertex.node, -1,
+ to_partition.layout);
+ from_partition.layout = old_layout;
+ if (factor >= 0)
+ {
+ slpg_layout_cost cost = to_costs.out_cost;
+ cost.add_serial_cost ({ to_vertex.weight * factor,
+ m_optimize_size });
+ cost.split (to_partition.in_degree);
+ return cost;
+ }
+ }
+
+ /* Compute the cost if we insert any necessary layout change on edge UD. */
+ auto edge_cost = edge_layout_cost (ud, to_node_i,
+ to_partition.layout, from_layout_i);
+ if (edge_cost.is_possible ())
+ {
+ slpg_layout_cost cost = to_costs.out_cost;
+ cost.add_serial_cost (to_costs.internal_cost);
+ cost.split (to_partition.in_degree);
+ cost.add_serial_cost (edge_cost);
+ return cost;
+ }
+
+ return slpg_layout_cost::impossible ();
+}
+
+/* Make a forward pass through the partitions, accumulating input costs.
+ Make a tentative (provisional) choice of layout for each partition,
+ ensuring that this choice still allows later partitions to keep
+ their original layout. */
+
+void
+vect_optimize_slp_pass::forward_pass ()
+{
+ for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+ ++partition_i)
+ {
+ auto &partition = m_partitions[partition_i];
+
+ /* If the partition consists of a single VEC_PERM_EXPR, precompute
+ the incoming cost that would apply if every predecessor partition
+ keeps its current layout. This is used within the loop below. */
+ slpg_layout_cost in_cost;
+ slp_tree single_node = nullptr;
+ if (partition.node_end == partition.node_begin + 1)
+ {
+ unsigned int node_i = m_partitioned_nodes[partition.node_begin];
+ single_node = m_vertices[node_i].node;
+ if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+ in_cost = total_in_cost (node_i);
+ }
+
+ /* Go through the possible layouts. Decide which ones are valid
+ for this partition and record which of the valid layouts has
+ the lowest cost. */
+ unsigned int min_layout_i = 0;
+ slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
+ for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+ {
+ auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+ if (!layout_costs.is_possible ())
continue;
- /* We still eventually have failed backedge SLP nodes in the
- graph, those are only cancelled when analyzing operations.
- Simply treat them as transparent ops, propagating permutes
- through them. */
- if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
+ /* If the recorded layout is already 0 then the layout cannot
+ change. */
+ if (partition.layout == 0 && layout_i != 0)
{
- /* We do not handle stores with a permutation, so all
- incoming permutes must have been materialized. */
- stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
- if (STMT_VINFO_DATA_REF (rep)
- && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
- {
- /* ??? We're forcing materialization in place
- of the child here, we'd need special handling
- in materialization to leave perm_in -1 here. */
- vertices[idx].perm_in = 0;
- vertices[idx].perm_out = 0;
- }
- /* We cannot move a permute across an operation that is
- not independent on lanes. Note this is an explicit
- negative list since that's much shorter than the respective
- positive one but it's critical to keep maintaining it. */
- if (is_gimple_call (STMT_VINFO_STMT (rep)))
- switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
- {
- case CFN_COMPLEX_ADD_ROT90:
- case CFN_COMPLEX_ADD_ROT270:
- case CFN_COMPLEX_MUL:
- case CFN_COMPLEX_MUL_CONJ:
- case CFN_VEC_ADDSUB:
- case CFN_VEC_FMADDSUB:
- case CFN_VEC_FMSUBADD:
- vertices[idx].perm_in = 0;
- vertices[idx].perm_out = 0;
- default:;
- }
+ layout_costs.mark_impossible ();
+ continue;
}
- if (!slpg->vertices[idx].succ)
- /* Pick up pre-computed leaf values. */
- ;
- else
+ bool is_possible = true;
+ for (unsigned int order_i = partition.node_begin;
+ order_i < partition.node_end; ++order_i)
{
- bool any_succ_perm_out_m1 = false;
- int perm_in = vertices[idx].perm_in;
- for (graph_edge *succ = slpg->vertices[idx].succ;
- succ; succ = succ->succ_next)
+ unsigned int node_i = m_partitioned_nodes[order_i];
+ auto &vertex = m_vertices[node_i];
+
+ /* Reject the layout if it is individually incompatible
+ with any node in the partition. */
+ if (!is_compatible_layout (vertex.node, layout_i))
{
- int succ_idx = succ->dest;
- int succ_perm = vertices[succ_idx].perm_out;
- /* Handle unvisited (and constant) nodes optimistically. */
- /* ??? But for constants once we want to handle
- non-bijective permutes we have to verify the permute,
- when unifying lanes, will not unify different constants.
- For example see gcc.dg/vect/bb-slp-14.c for a case
- that would break. */
- if (succ_perm == -1)
- {
- /* When we handled a non-leaf optimistically, note
- that so we can adjust its outgoing permute below. */
- slp_tree succ_node = vertices[succ_idx].node;
- if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
- && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
- any_succ_perm_out_m1 = true;
- continue;
- }
- if (perm_in == -1)
- perm_in = succ_perm;
- else if (succ_perm == 0
- || !vect_slp_perms_eq (perms, perm_in, succ_perm))
- {
- perm_in = 0;
- break;
- }
+ is_possible = false;
+ break;
}
- /* Adjust any incoming permutes we treated optimistically. */
- if (perm_in != -1 && any_succ_perm_out_m1)
+ auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
{
- for (graph_edge *succ = slpg->vertices[idx].succ;
- succ; succ = succ->succ_next)
+ auto &other_vertex = m_vertices[other_node_i];
+ if (other_vertex.partition < vertex.partition)
{
- slp_tree succ_node = vertices[succ->dest].node;
- if (vertices[succ->dest].perm_out == -1
- && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
- && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
- {
- vertices[succ->dest].perm_out = perm_in;
- /* And ensure this propagates. */
- if (vertices[succ->dest].perm_in == -1)
- vertices[succ->dest].perm_in = perm_in;
- }
+ /* Accumulate the incoming costs from earlier
+ partitions, plus the cost of any layout changes
+ on UD itself. */
+ auto cost = forward_cost (ud, other_node_i, layout_i);
+ if (!cost.is_possible ())
+ is_possible = false;
+ else
+ layout_costs.in_cost.add_parallel_cost (cost);
}
- changed = true;
+ else
+ /* Reject the layout if it would make layout 0 impossible
+ for later partitions. This amounts to testing that the
+ target supports reversing the layout change on edges
+ to later partitions.
+
+ In principle, it might be possible to push a layout
+ change all the way down a graph, so that it never
+ needs to be reversed and so that the target doesn't
+ need to support the reverse operation. But it would
+ be awkward to bail out if we hit a partition that
+ does not support the new layout, especially since
+ we are not dealing with a lattice. */
+ is_possible &= edge_layout_cost (ud, other_node_i, 0,
+ layout_i).is_possible ();
+ };
+ for_each_partition_edge (node_i, add_cost);
+
+ /* Accumulate the cost of using LAYOUT_I within NODE,
+ both for the inputs and the outputs. */
+ int factor = internal_node_cost (vertex.node, layout_i,
+ layout_i);
+ if (factor < 0)
+ {
+ is_possible = false;
+ break;
}
+ else if (factor)
+ layout_costs.internal_cost.add_serial_cost
+ ({ vertex.weight * factor, m_optimize_size });
+ }
+ if (!is_possible)
+ {
+ layout_costs.mark_impossible ();
+ continue;
+ }
- if (!vect_slp_perms_eq (perms, perm_in,
- vertices[idx].perm_in))
+ /* Combine the incoming and partition-internal costs. */
+ slpg_layout_cost combined_cost = layout_costs.in_cost;
+ combined_cost.add_serial_cost (layout_costs.internal_cost);
+
+ /* If this partition consists of a single VEC_PERM_EXPR, see
+ if the VEC_PERM_EXPR can be changed to support output layout
+ LAYOUT_I while keeping all the provisional choices of input
+ layout. */
+ if (single_node
+ && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+ {
+ int factor = internal_node_cost (single_node, -1, layout_i);
+ if (factor >= 0)
{
- /* Make sure we eventually converge. */
- gcc_checking_assert (vertices[idx].perm_in == -1
- || perm_in == 0);
- vertices[idx].perm_in = perm_in;
-
- /* While we can handle VEC_PERM nodes as transparent
- pass-through they can be a cheap materialization
- point as well. In addition they can act as source
- of a random permutation as well.
- The following ensures that former materialization
- points that now have zero incoming permutes no
- longer appear as such and that former "any" permutes
- get pass-through. We keep VEC_PERM nodes optimistic
- as "any" outgoing permute though. */
- if (vertices[idx].perm_out != 0
- && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
- vertices[idx].perm_out = perm_in;
- changed = true;
+ auto weight = m_vertices[single_node->vertex].weight;
+ slpg_layout_cost internal_cost
+ = { weight * factor, m_optimize_size };
+
+ slpg_layout_cost alt_cost = in_cost;
+ alt_cost.add_serial_cost (internal_cost);
+ if (alt_cost.is_better_than (combined_cost, m_optimize_size))
+ {
+ combined_cost = alt_cost;
+ layout_costs.in_cost = in_cost;
+ layout_costs.internal_cost = internal_cost;
+ }
}
}
- /* Elide pruning at materialization points in the first
- iteration phase. */
- if (!do_materialization)
- continue;
+ /* Record the layout with the lowest cost. Prefer layout 0 in
+ the event of a tie between it and another layout. */
+ if (!min_layout_cost.is_possible ()
+ || combined_cost.is_better_than (min_layout_cost,
+ m_optimize_size))
+ {
+ min_layout_i = layout_i;
+ min_layout_cost = combined_cost;
+ }
+ }
+
+ /* This loop's handling of earlier partitions should ensure that
+ choosing the original layout for the current partition is no
+ less valid than it was in the original graph, even with the
+ provisional layout choices for those earlier partitions. */
+ gcc_assert (min_layout_cost.is_possible ());
+ partition.layout = min_layout_i;
+ }
+}
+
+/* Make a backward pass through the partitions, accumulating output costs.
+ Make a final choice of layout for each partition. */
+
+void
+vect_optimize_slp_pass::backward_pass ()
+{
+ for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
+ {
+ auto &partition = m_partitions[partition_i];
- int perm = vertices[idx].perm_out;
- if (perm == 0 || perm == -1)
+ unsigned int min_layout_i = 0;
+ slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
+ for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+ {
+ auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+ if (!layout_costs.is_possible ())
continue;
- /* Decide on permute materialization. Look whether there's
- a use (pred) edge that is permuted differently than us.
- In that case mark ourselves so the permutation is applied. */
- bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
- if (all_preds_permuted)
- for (graph_edge *pred = slpg->vertices[idx].pred;
- pred; pred = pred->pred_next)
- {
- int pred_perm = vertices[pred->src].perm_in;
- gcc_checking_assert (pred_perm != -1);
- if (!vect_slp_perms_eq (perms, perm, pred_perm))
- {
- all_preds_permuted = false;
- break;
- }
- }
- if (!all_preds_permuted)
+ /* Accumulate the costs from successor partitions. */
+ bool is_possible = true;
+ for (unsigned int order_i = partition.node_begin;
+ order_i < partition.node_end; ++order_i)
{
- vertices[idx].perm_out = 0;
- changed = true;
+ unsigned int node_i = m_partitioned_nodes[order_i];
+ auto &vertex = m_vertices[node_i];
+ auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
+ {
+ auto &other_vertex = m_vertices[other_node_i];
+ auto &other_partition = m_partitions[other_vertex.partition];
+ if (other_vertex.partition > vertex.partition)
+ {
+ /* Accumulate the incoming costs from later
+ partitions, plus the cost of any layout changes
+ on UD itself. */
+ auto cost = backward_cost (ud, other_node_i, layout_i);
+ if (!cost.is_possible ())
+ is_possible = false;
+ else
+ layout_costs.out_cost.add_parallel_cost (cost);
+ }
+ else
+ /* Make sure that earlier partitions can (if necessary
+ or beneficial) keep the layout that they chose in
+ the forward pass. This ensures that there is at
+ least one valid choice of layout. */
+ is_possible &= edge_layout_cost (ud, other_node_i,
+ other_partition.layout,
+ layout_i).is_possible ();
+ };
+ for_each_partition_edge (node_i, add_cost);
+ }
+ if (!is_possible)
+ {
+ layout_costs.mark_impossible ();
+ continue;
+ }
+
+ /* Locally combine the costs from the forward and backward passes.
+ (This combined cost is not passed on, since that would lead
+ to double counting.) */
+ slpg_layout_cost combined_cost = layout_costs.in_cost;
+ combined_cost.add_serial_cost (layout_costs.internal_cost);
+ combined_cost.add_serial_cost (layout_costs.out_cost);
+
+ /* Record the layout with the lowest cost. Prefer layout 0 in
+ the event of a tie between it and another layout. */
+ if (!min_layout_cost.is_possible ()
+ || combined_cost.is_better_than (min_layout_cost,
+ m_optimize_size))
+ {
+ min_layout_i = layout_i;
+ min_layout_cost = combined_cost;
}
}
- /* If the initial propagation converged, switch on materialization
- and re-propagate. */
- if (!changed && !do_materialization)
+ gcc_assert (min_layout_cost.is_possible ());
+ partition.layout = min_layout_i;
+ }
+}
+
+/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
+ NODE already has the layout that was selected for its partition. */
+
+slp_tree
+vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
+ unsigned int to_layout_i)
+{
+ unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
+ slp_tree result = m_node_layouts[result_i];
+ if (result)
+ return result;
+
+ if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+ || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+ {
+ /* If the vector is uniform or unchanged, there's nothing to do. */
+ if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
+ result = node;
+ else
{
- do_materialization = true;
- changed = true;
+ auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
+	  vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
+	  result = vect_create_new_slp_node (scalar_ops);
}
}
- while (changed);
- statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
-
- /* Materialize. */
- for (i = 0; i < vertices.length (); ++i)
+ else
{
- int perm_in = vertices[i].perm_in;
- slp_tree node = vertices[i].node;
+ unsigned int partition_i = m_vertices[node->vertex].partition;
+ unsigned int from_layout_i = m_partitions[partition_i].layout;
+ if (from_layout_i == to_layout_i)
+ return node;
- /* First permute invariant/external original successors, we handle
- those optimistically during propagation and duplicate them if
- they are used with different permutations. */
- unsigned j;
- slp_tree child;
- if (perm_in > 0)
- FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
- {
- if (!child
- || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
- && SLP_TREE_DEF_TYPE (child) != vect_external_def))
- continue;
+ /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
+ permutation instead of a serial one. Leave the new permutation
+ in TMP_PERM on success. */
+ auto_lane_permutation_t tmp_perm;
+ unsigned int num_inputs = 1;
+ if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+ {
+ tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
+ if (from_layout_i != 0)
+ vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
+ if (to_layout_i != 0)
+ vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
+ if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
+ tmp_perm,
+ SLP_TREE_CHILDREN (node),
+ false) >= 0)
+ num_inputs = SLP_TREE_CHILDREN (node).length ();
+ else
+ tmp_perm.truncate (0);
+ }
- /* If the vector is uniform there's nothing to do. */
- if (vect_slp_tree_uniform_p (child))
- continue;
+ if (dump_enabled_p ())
+ {
+ if (tmp_perm.length () > 0)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "duplicating permutation node %p with"
+ " layout %d\n",
+ node, to_layout_i);
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "inserting permutation node in place of %p\n",
+ node);
+ }
- /* We can end up sharing some externals via two_operator
- handling. Be prepared to unshare those. */
- if (child->refcnt != 1)
- {
- gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
- SLP_TREE_CHILDREN (node)[j] = child
- = vect_create_new_slp_node
- (SLP_TREE_SCALAR_OPS (child).copy ());
- }
- vect_slp_permute (perms[perm_in],
- SLP_TREE_SCALAR_OPS (child), true);
- }
+ unsigned int num_lanes = SLP_TREE_LANES (node);
+ result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
+ if (SLP_TREE_SCALAR_STMTS (node).length ())
+ {
+ auto &stmts = SLP_TREE_SCALAR_STMTS (result);
+	  stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
+ if (from_layout_i != 0)
+ vect_slp_permute (m_perms[from_layout_i], stmts, false);
+ if (to_layout_i != 0)
+ vect_slp_permute (m_perms[to_layout_i], stmts, true);
+ }
+ SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
+ SLP_TREE_LANES (result) = num_lanes;
+ SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
+ result->vertex = -1;
+
+ auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
+ if (tmp_perm.length ())
+ {
+ lane_perm.safe_splice (tmp_perm);
+ SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
+ }
+ else
+ {
+ lane_perm.create (num_lanes);
+ for (unsigned j = 0; j < num_lanes; ++j)
+ lane_perm.quick_push ({ 0, j });
+ if (from_layout_i != 0)
+ vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
+ if (to_layout_i != 0)
+ vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
+ SLP_TREE_CHILDREN (result).safe_push (node);
+ }
+ for (slp_tree child : SLP_TREE_CHILDREN (result))
+ child->refcnt++;
+ }
+ m_node_layouts[result_i] = result;
+ return result;
+}
+
+/* Apply the chosen vector layouts to the SLP graph. */
+void
+vect_optimize_slp_pass::materialize ()
+{
+ /* We no longer need the costs, so avoid having two O(N * P) arrays
+ live at the same time. */
+ m_partition_layout_costs.release ();
+ m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
+
+ auto_sbitmap fully_folded (m_vertices.length ());
+ bitmap_clear (fully_folded);
+ for (unsigned int node_i : m_partitioned_nodes)
+ {
+ auto &vertex = m_vertices[node_i];
+ slp_tree node = vertex.node;
+ int layout_i = m_partitions[vertex.partition].layout;
+ gcc_assert (layout_i >= 0);
+
+ /* Rearrange the scalar statements to match the chosen layout. */
+ if (layout_i > 0)
+ vect_slp_permute (m_perms[layout_i],
+ SLP_TREE_SCALAR_STMTS (node), true);
+
+ /* Update load and lane permutations. */
if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
{
- /* Apply the common permutes to the input vectors. */
- if (perm_in > 0)
+ /* First try to absorb the input vector layouts. If that fails,
+ force the inputs to have layout LAYOUT_I too. We checked that
+ that was possible before deciding to use nonzero output layouts.
+ (Note that at this stage we don't really have any guarantee that
+ the target supports the original VEC_PERM_EXPR.) */
+ auto &perm = SLP_TREE_LANE_PERMUTATION (node);
+ auto_lane_permutation_t tmp_perm;
+ tmp_perm.safe_splice (perm);
+ change_vec_perm_layout (node, tmp_perm, -1, layout_i);
+ if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
+ tmp_perm,
+ SLP_TREE_CHILDREN (node),
+ false) >= 0)
{
- /* If the node is already a permute node we can apply
- the permutation to the lane selection, effectively
- materializing it on the incoming vectors. */
- if (dump_enabled_p ())
+ if (dump_enabled_p ()
+ && !std::equal (tmp_perm.begin (), tmp_perm.end (),
+ perm.begin ()))
dump_printf_loc (MSG_NOTE, vect_location,
- "simplifying permute node %p\n",
- node);
- for (unsigned k = 0;
- k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
- SLP_TREE_LANE_PERMUTATION (node)[k].second
- = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
- }
- /* Apply the anticipated output permute to the permute and
- stmt vectors. */
- int perm_out = vertices[i].perm_out;
- if (perm_out > 0)
- {
- vect_slp_permute (perms[perm_out],
- SLP_TREE_SCALAR_STMTS (node), true);
- vect_slp_permute (perms[perm_out],
- SLP_TREE_LANE_PERMUTATION (node), true);
+ "absorbing input layouts into %p\n", node);
+ std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
+ bitmap_set_bit (fully_folded, node_i);
}
- }
- else if (vertices[i].get_perm_materialized () != 0)
- {
- if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
- /* For loads simply drop the permutation, the load permutation
- already performs the desired permutation. */
- ;
- else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
- gcc_unreachable ();
else
{
+ /* Not MSG_MISSED because it would make no sense to users. */
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
- "inserting permute node in place of %p\n",
+ "failed to absorb input layouts into %p\n",
node);
-
- /* Make a copy of NODE and in-place change it to a
- VEC_PERM node to permute the lanes of the copy. */
- slp_tree copy = new _slp_tree;
- SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
- SLP_TREE_CHILDREN (node) = vNULL;
- SLP_TREE_SCALAR_STMTS (copy)
- = SLP_TREE_SCALAR_STMTS (node).copy ();
- vect_slp_permute (perms[perm_in],
- SLP_TREE_SCALAR_STMTS (copy), true);
- gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
- SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
- gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
- SLP_TREE_LANE_PERMUTATION (copy)
- = SLP_TREE_LANE_PERMUTATION (node);
- SLP_TREE_LANE_PERMUTATION (node) = vNULL;
- SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
- copy->refcnt = 1;
- copy->max_nunits = node->max_nunits;
- SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
- SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
- SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
-
- /* Now turn NODE into a VEC_PERM. */
- SLP_TREE_CHILDREN (node).safe_push (copy);
- SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
- for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
- SLP_TREE_LANE_PERMUTATION (node)
- .quick_push (std::make_pair (0, perms[perm_in][j]));
- SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+ change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
}
}
- else if (perm_in > 0) /* perm_in == perm_out */
+ else
{
- /* Apply the reverse permutation to our stmts. */
- vect_slp_permute (perms[perm_in],
- SLP_TREE_SCALAR_STMTS (node), true);
- /* And to the lane/load permutation, which we can simply
- make regular by design. */
- if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
- {
- gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
- /* ??? When we handle non-bijective permutes the idea
- is that we can force the load-permutation to be
- { min, min + 1, min + 2, ... max }. But then the
- scalar defs might no longer match the lane content
- which means wrong-code with live lane vectorization.
- So we possibly have to have NULL entries for those. */
- vect_slp_permute (perms[perm_in],
- SLP_TREE_LOAD_PERMUTATION (node), true);
- }
- else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
- gcc_unreachable ();
+ gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
+ auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
+ if (layout_i > 0)
+ /* ??? When we handle non-bijective permutes the idea
+ is that we can force the load-permutation to be
+ { min, min + 1, min + 2, ... max }. But then the
+ scalar defs might no longer match the lane content
+ which means wrong-code with live lane vectorization.
+ So we possibly have to have NULL entries for those. */
+ vect_slp_permute (m_perms[layout_i], load_perm, true);
}
}
- /* Elide any permutations at BB reduction roots. */
- if (is_a <bb_vec_info> (vinfo))
+ /* Do this before any nodes disappear, since it involves a walk
+ over the leaves. */
+ remove_redundant_permutations ();
+
+ /* Replace each child with a correctly laid-out version. */
+ for (unsigned int node_i : m_partitioned_nodes)
{
- for (slp_instance instance : vinfo->slp_instances)
+ /* Skip nodes that have already been handled above. */
+ if (bitmap_bit_p (fully_folded, node_i))
+ continue;
+
+ auto &vertex = m_vertices[node_i];
+ int in_layout_i = m_partitions[vertex.partition].layout;
+ gcc_assert (in_layout_i >= 0);
+
+ unsigned j;
+ slp_tree child;
+ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
{
- if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
+ if (!child)
continue;
- slp_tree old = SLP_INSTANCE_TREE (instance);
- if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
- && SLP_TREE_CHILDREN (old).length () == 1)
- {
- slp_tree child = SLP_TREE_CHILDREN (old)[0];
- if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
- {
- /* Preserve the special VEC_PERM we use to shield existing
- vector defs from the rest. But make it a no-op. */
- unsigned i = 0;
- for (std::pair<unsigned, unsigned> &p
- : SLP_TREE_LANE_PERMUTATION (old))
- p.second = i++;
- }
- else
- {
- SLP_INSTANCE_TREE (instance) = child;
- SLP_TREE_REF_COUNT (child)++;
- vect_free_slp_tree (old);
- }
- }
- else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
- && SLP_TREE_REF_COUNT (old) == 1
- && vertices[old->vertex].get_perm_materialized () != 0)
+
+ slp_tree new_child = get_result_with_layout (child, in_layout_i);
+ if (new_child != child)
{
- /* ??? For loads the situation is more complex since
- we can't modify the permute in place in case the
- node is used multiple times. In fact for loads this
- should be somehow handled in the propagation engine. */
- /* Apply the reverse permutation to our stmts. */
- int perm = vertices[old->vertex].get_perm_materialized ();
- vect_slp_permute (perms[perm],
- SLP_TREE_SCALAR_STMTS (old), true);
- vect_slp_permute (perms[perm],
- SLP_TREE_LOAD_PERMUTATION (old), true);
+ vect_free_slp_tree (child);
+ SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
+ new_child->refcnt += 1;
}
}
}
+}
- /* Free the perms vector used for propagation. */
- while (!perms.is_empty ())
- perms.pop ().release ();
- free_graph (slpg);
-
+/* Elide load permutations that are not necessary. Such permutations might
+ be pre-existing, rather than created by the layout optimizations. */
- /* Now elide load permutations that are not necessary. */
- for (i = 0; i < leafs.length (); ++i)
+void
+vect_optimize_slp_pass::remove_redundant_permutations ()
+{
+ for (unsigned int node_i : m_leafs)
{
- node = vertices[leafs[i]].node;
+ slp_tree node = m_vertices[node_i].node;
if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
continue;
/* In basic block vectorization we allow any subchain of an interleaving
chain.
FORNOW: not in loop SLP because of realignment complications. */
- if (is_a <bb_vec_info> (vinfo))
+ if (is_a <bb_vec_info> (m_vinfo))
{
bool subchain_p = true;
stmt_vec_info next_load_info = NULL;
@@ -4160,6 +5347,7 @@ vect_optimize_slp (vec_info *vinfo)
}
else
{
+ loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
stmt_vec_info load_info;
bool this_load_permuted = false;
unsigned j;
@@ -4175,8 +5363,7 @@ vect_optimize_slp (vec_info *vinfo)
/* The load requires permutation when unrolling exposes
a gap either because the group is larger than the SLP
group-size or because there is a gap between the groups. */
- && (known_eq (LOOP_VINFO_VECT_FACTOR
- (as_a <loop_vec_info> (vinfo)), 1U)
+ && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
|| ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
&& DR_GROUP_GAP (first_stmt_info) == 0)))
{
@@ -4187,6 +5374,150 @@ vect_optimize_slp (vec_info *vinfo)
}
}
+/* Print the partition graph and layout information to the dump file. */
+
+void
+vect_optimize_slp_pass::dump ()
+{
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP optimize permutations:\n");
+ for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
+ const char *sep = "";
+ for (unsigned int idx : m_perms[layout_i])
+ {
+ dump_printf (MSG_NOTE, "%s%d", sep, idx);
+ sep = ", ";
+ }
+ dump_printf (MSG_NOTE, " }\n");
+ }
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP optimize partitions:\n");
+ for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+ ++partition_i)
+ {
+ auto &partition = m_partitions[partition_i];
+ dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " partition %d (layout %d):\n",
+ partition_i, partition.layout);
+ dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
+ for (unsigned int order_i = partition.node_begin;
+ order_i < partition.node_end; ++order_i)
+ {
+ auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
+ dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
+ (void *) vertex.node);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " weight: %f\n",
+ vertex.weight.to_double ());
+ if (vertex.out_degree)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " out weight: %f (degree %d)\n",
+ vertex.out_weight.to_double (),
+ vertex.out_degree);
+ if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " op: VEC_PERM_EXPR\n");
+ else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " op template: %G", rep->stmt);
+ }
+ dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
+ for (unsigned int order_i = partition.node_begin;
+ order_i < partition.node_end; ++order_i)
+ {
+ unsigned int node_i = m_partitioned_nodes[order_i];
+ auto &vertex = m_vertices[node_i];
+ auto print_edge = [&](graph_edge *, unsigned int other_node_i)
+ {
+ auto &other_vertex = m_vertices[other_node_i];
+ if (other_vertex.partition < vertex.partition)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " - %p [%d] --> %p\n",
+ other_vertex.node, other_vertex.partition,
+ vertex.node);
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " - %p --> [%d] %p\n",
+ vertex.node, other_vertex.partition,
+ other_vertex.node);
+ };
+ for_each_partition_edge (node_i, print_edge);
+ }
+
+ for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+ {
+ auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+ if (layout_costs.is_possible ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " layout %d:%s\n", layout_i,
+ partition.layout == int (layout_i)
+ ? " (*)" : "");
+ slpg_layout_cost combined_cost = layout_costs.in_cost;
+ combined_cost.add_serial_cost (layout_costs.internal_cost);
+ combined_cost.add_serial_cost (layout_costs.out_cost);
+#define TEMPLATE "{depth: %f, total: %f}"
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " " TEMPLATE "\n", layout_i,
+ layout_costs.in_cost.depth.to_double (),
+ layout_costs.in_cost.total.to_double ());
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " + " TEMPLATE "\n",
+ layout_costs.internal_cost.depth.to_double (),
+ layout_costs.internal_cost.total.to_double ());
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " + " TEMPLATE "\n",
+ layout_costs.out_cost.depth.to_double (),
+ layout_costs.out_cost.total.to_double ());
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " = " TEMPLATE "\n",
+ combined_cost.depth.to_double (),
+ combined_cost.total.to_double ());
+#undef TEMPLATE
+ }
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " layout %d: rejected\n", layout_i);
+ }
+ }
+}
+
+/* Main entry point for the SLP graph optimization pass. */
+
+void
+vect_optimize_slp_pass::run ()
+{
+ build_graph ();
+ create_partitions ();
+ start_choosing_layouts ();
+ if (m_perms.length () > 1)
+ {
+ forward_pass ();
+ backward_pass ();
+ if (dump_enabled_p ())
+ dump ();
+ materialize ();
+ while (!m_perms.is_empty ())
+ m_perms.pop ().release ();
+ }
+ else
+ remove_redundant_permutations ();
+ free_graph (m_slpg);
+}
+
+/* Optimize the SLP graph of VINFO. */
+
+void
+vect_optimize_slp (vec_info *vinfo)
+{
+ if (vinfo->slp_instances.is_empty ())
+ return;
+ vect_optimize_slp_pass (vinfo).run ();
+}
+
/* Gather loads reachable from the individual SLP graph entries. */
void
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e..a2b0afb 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -154,7 +154,9 @@ struct vect_scalar_ops_slice_hash : typed_noop_remove<vect_scalar_ops_slice>
SLP
************************************************************************/
typedef vec<std::pair<unsigned, unsigned> > lane_permutation_t;
+typedef auto_vec<std::pair<unsigned, unsigned>, 16> auto_lane_permutation_t;
typedef vec<unsigned> load_permutation_t;
+typedef auto_vec<unsigned, 16> auto_load_permutation_t;
/* A computation tree of an SLP instance. Each node corresponds to a group of
stmts to be packed in a SIMD stmt. */