23 files changed, 1975 insertions, 404 deletions
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 6131bfa..e5eb525 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14619,6 +14619,10 @@ Complex expressions slow the analyzer.
 Maximum number of arguments in a PHI supported by TREE if conversion
 unless the loop is marked with simd pragma.
 
+@item vect-max-layout-candidates
+The maximum number of possible vector layouts (such as permutations)
+to consider when optimizing to-be-vectorized code.
+
 @item vect-max-version-for-alignment-checks
 The maximum number of run-time checks that can be performed when
 doing loop versioning for alignment in the vectorizer.
diff --git a/gcc/params.opt b/gcc/params.opt
index 201b5c9..3001566 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1137,6 +1137,10 @@ Whether to use canonical types.
 Common Joined UInteger Var(param_vect_epilogues_nomask) Init(1) IntegerRange(0, 1) Param Optimization
 Enable loop epilogue vectorization using smaller vector size.
 
+-param=vect-max-layout-candidates=
+Common Joined UInteger Var(param_vect_max_layout_candidates) Init(32) Param Optimization
+Maximum number of possible vector layouts (such as permutations) to consider when optimizing to-be-vectorized code.
+
 -param=vect-max-peeling-for-alignment=
 Common Joined UInteger Var(param_vect_max_peeling_for_alignment) Init(-1) IntegerRange(0, 64) Param Optimization
 Maximum number of loop peels to enhance alignment of data references in a loop.
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c
new file mode 100644
index 0000000..c1d4ba3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+  a[0] = (b[1] << c[3]) - d[1];
+  a[1] = (b[0] << c[2]) - d[0];
+  a[2] = (b[3] << c[1]) - d[3];
+  a[3] = (b[2] << c[0]) - d[2];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c
new file mode 100644
index 0000000..e2c86f3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-10.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-9.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c
new file mode 100644
index 0000000..d9b5349
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-11.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[4], c[400], d[400];
+
+void f1()
+{
+  int a0 = a[0] - b[0];
+  int a1 = a[1] + b[1];
+  int a2 = a[2] - b[2];
+  int a3 = a[3] + b[3];
+  int b0 = a0;
+  int b1 = a1;
+  int b2 = a2;
+  int b3 = a3;
+  for (int i = 0; i < 100; ++i)
+    {
+      a0 += c[i * 4 + 1];
+      a1 += c[i * 4 + 0];
+      a2 += c[i * 4 + 3];
+      a3 += c[i * 4 + 2];
+      b0 ^= d[i * 4 + 3];
+      b1 ^= d[i * 4 + 2];
+      b2 ^= d[i * 4 + 1];
+      b3 ^= d[i * 4 + 0];
+    }
+  a[0] = a0 ^ b0;
+  a[1] = a1 ^ b1;
+  a[2] = a2 ^ b2;
+  a[3] = a3 ^ b3;
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 4 "slp1" { target { vect_int && vect_perm } } } } */
+/* { dg-final { scan-tree-dump "duplicating permutation node" "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c
new file mode 100644
index 0000000..3bf48af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-12.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-11.c"
+
+/* It would be better to keep the original three permutations.  */
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } xfail { *-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "duplicating permutation node" "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } xfail { *-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c
new file mode 100644
index 0000000..9669ade
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-13.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+  a[0] = (b[1] << c[3]) - (d[1] >> c[3]);
+  a[1] = (b[0] << c[2]) - (d[0] >> c[2]);
+  a[2] = (b[3] << c[1]) - (d[3] >> c[1]);
+  a[3] = (b[2] << c[0]) - (d[2] >> c[0]);
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 3 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c
new file mode 100644
index 0000000..159bb15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-14.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-13.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c
new file mode 100644
index 0000000..d87fc1e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-15.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+  a[0] = (b[3] << c[3]) - d[0];
+  a[1] = (b[2] << c[2]) - d[2];
+  a[2] = (b[1] << c[1]) - d[4];
+  a[3] = (b[0] << c[0]) - d[6];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c
new file mode 100644
index 0000000..559583a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-16.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-15.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c
new file mode 100644
index 0000000..f64a2d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-17.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+
+int a[8], b[8];
+
+int f1()
+{
+  a[0] = b[4] + 1;
+  a[1] = b[5] + 1;
+  a[2] = b[6] + 1;
+  a[3] = b[7] + 1;
+  a[4] = b[0] + 1;
+  a[5] = b[1] + 1;
+  a[6] = b[2] + 1;
+  a[7] = b[3] + 1;
+}
+
+unsigned short c[2], d[2];
+void f2() {
+  c[0] += d[1];
+  c[1] += d[0];
+}
+
+typedef int v4si __attribute__((vector_size(16)));
+void f3(v4si x) {
+  a[0] = b[1] + x[1];
+  a[1] = b[0] + x[3];
+}
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c
new file mode 100644
index 0000000..f12290b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-1.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c
new file mode 100644
index 0000000..82c2720
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4], d[4];
+
+void f1()
+{
+  a[0] = (b[3] << c[3]) - d[3];
+  a[1] = (b[2] << c[2]) - d[2];
+  a[2] = (b[1] << c[1]) - d[1];
+  a[3] = (b[0] << c[0]) - d[0];
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c
new file mode 100644
index 0000000..45bd6c8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-3.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp2" { target { vect_var_shift && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c
new file mode 100644
index 0000000..b59a159
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+
+int a[4], b[4], c[4];
+
+void f1()
+{
+  a[0] = b[3] - c[3];
+  a[1] = b[2] + c[2];
+  a[2] = b[1] - c[1];
+  a[3] = b[0] + c[0];
+}
+
+/* { dg-final { scan-tree-dump "absorbing input layouts" "slp2" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c
new file mode 100644
index 0000000..06f5308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-6.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os" } */
+
+#include "bb-slp-layout-5.c"
+
+/* { dg-final { scan-tree-dump "absorbing input layouts" "slp2" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c
new file mode 100644
index 0000000..7a20c60
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-7.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[400];
+
+void f1()
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[0] += b[i * 4 + 3];
+      a[1] += b[i * 4 + 2];
+      a[2] += b[i * 4 + 1];
+      a[3] += b[i * 4 + 0];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c
new file mode 100644
index 0000000..ef2f0c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-8.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Os -fno-tree-loop-vectorize" } */
+
+#include "bb-slp-layout-7.c"
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 1 "slp1" { target { vect_int && { vect_perm && vect_hw_misalign } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c
new file mode 100644
index 0000000..c841862
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-9.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+
+int a[4], b[400], c[400], d[40000];
+
+void f1()
+{
+  int a0 = a[0];
+  int a1 = a[1];
+  int a2 = a[2];
+  int a3 = a[3];
+  for (int i = 0; i < 100; ++i)
+    {
+      a0 ^= c[i * 4 + 0];
+      a1 ^= c[i * 4 + 1];
+      a2 ^= c[i * 4 + 2];
+      a3 ^= c[i * 4 + 3];
+      for (int j = 0; j < 100; ++j)
+	{
+	  a0 += d[i * 400 + j * 4 + 1];
+	  a1 += d[i * 400 + j * 4 + 0];
+	  a2 += d[i * 400 + j * 4 + 3];
+	  a3 += d[i * 400 + j * 4 + 2];
+	}
+      b[i * 4 + 0] = a0;
+      b[i * 4 + 1] = a1;
+      b[i * 4 + 2] = a2;
+      b[i * 4 + 3] = a3;
+    }
+  a[0] = a0;
+  a[1] = a1;
+  a[2] = a2;
+  a[3] = a3;
+}
+
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp1" { target { vect_int && vect_perm } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index c4d9ab0..d0b972f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -44,4 +44,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_strided4 || vect_perm } && vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } xfail vect_load_lanes } } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 2f243cd..3c1913b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6814,6 +6814,7 @@ proc check_effective_target_vect_var_shift { } {
     return [check_cached_effective_target_indexed vect_var_shift {
       expr {(([istarget i?86-*-*] || [istarget x86_64-*-*])
 	     && [check_avx2_available])
+	    || [istarget aarch64*-*-*]
       }}]
 }
 
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 64b3379..ab4c6fa 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -20,6 +20,7 @@ along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
 #include "config.h"
+#define INCLUDE_ALGORITHM
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
@@ -49,7 +50,20 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-eh.h"
 #include "tree-cfg.h"
 #include "alloc-pool.h"
-
+#include "sreal.h"
+#include "predict.h"
+
+static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
+					    load_permutation_t &,
+					    const vec<tree> &,
+					    gimple_stmt_iterator *,
+					    poly_uint64, bool, bool,
+					    unsigned *,
+					    unsigned * = nullptr,
+					    bool = false);
+static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
+					   slp_tree, lane_permutation_t &,
+					   vec<slp_tree> &, bool);
 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
 					  slp_tree, stmt_vector_for_cost *);
 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
@@ -304,6 +318,16 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
   oprnds_info.release ();
 }
 
+/* Return the execution frequency of NODE (so that a higher value indicates
+   a "more important" node when optimizing for speed).  */
+
+static sreal
+vect_slp_node_weight (slp_tree node)
+{
+  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
+}
 
 /* Return true if STMTS contains a pattern statement.  */
 
@@ -3531,29 +3555,465 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   return opt_result::success ();
 }
 
-struct slpg_vertex
+/* Estimates the cost of inserting layout changes into the SLP graph.
+   It can also say that the insertion is impossible.  */
+
+struct slpg_layout_cost
+{
+  slpg_layout_cost () = default;
+  slpg_layout_cost (sreal, bool);
+
+  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
+  bool is_possible () const { return depth != sreal::max (); }
+
+  bool operator== (const slpg_layout_cost &) const;
+  bool operator!= (const slpg_layout_cost &) const;
+
+  bool is_better_than (const slpg_layout_cost &, bool) const;
+
+  void add_parallel_cost (const slpg_layout_cost &);
+  void add_serial_cost (const slpg_layout_cost &);
+  void split (unsigned int);
+
+  /* The longest sequence of layout changes needed during any traversal
+     of the partition dag, weighted by execution frequency.
+
+     This is the most important metric when optimizing for speed, since
+     it helps to ensure that we keep the number of operations on
+     critical paths to a minimum.  */
+  sreal depth = 0;
+
+  /* An estimate of the total number of operations needed.  It is weighted by
+     execution frequency when optimizing for speed but not when optimizing for
+     size.  In order to avoid double-counting, a node with a fanout of N will
+     distribute 1/N of its total cost to each successor.
+
+     This is the most important metric when optimizing for size, since
+     it helps to keep the total number of operations to a minimum,  */
+  sreal total = 0;
+};
+
+/* Construct costs for a node with weight WEIGHT.  A higher weight
+   indicates more frequent execution.  IS_FOR_SIZE is true if we are
+   optimizing for size rather than speed.  */
+
+slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
+  : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
 {
-  slpg_vertex (slp_tree node_)
-    : node (node_), perm_in (-1), perm_out (-1) {}
+}
 
-  int get_perm_materialized () const
-    { return perm_in != perm_out ? perm_in : 0; }
+bool
+slpg_layout_cost::operator== (const slpg_layout_cost &other) const
+{
+  return depth == other.depth && total == other.total;
+}
 
+bool
+slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
+{
+  return !operator== (other);
+}
+
+/* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
+   true if we are optimizing for size rather than speed.  */
+
+bool
+slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
+				  bool is_for_size) const
+{
+  if (is_for_size)
+    {
+      if (total != other.total)
+	return total < other.total;
+      return depth < other.depth;
+    }
+  else
+    {
+      if (depth != other.depth)
+	return depth < other.depth;
+      return total < other.total;
+    }
+}
+
+/* Increase the costs to account for something with cost INPUT_COST
+   happening in parallel with the current costs.  */
+
+void
+slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
+{
+  depth = std::max (depth, input_cost.depth);
+  total += input_cost.total;
+}
+
+/* Increase the costs to account for something with cost INPUT_COST
+   happening in series with the current costs.  */
+
+void
+slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
+{
+  depth += other.depth;
+  total += other.total;
+}
+
+/* Split the total cost among TIMES successors or predecessors.  */
+
+void
+slpg_layout_cost::split (unsigned int times)
+{
+  if (times > 1)
+    total /= times;
+}
+
+/* Information about one node in the SLP graph, for use during
+   vect_optimize_slp_pass.  */
+
+struct slpg_vertex
+{
+  slpg_vertex (slp_tree node_) : node (node_) {}
+
+  /* The node itself.  */
   slp_tree node;
-  /* The common permutation on the incoming lanes (towards SLP children).  */
-  int perm_in;
-  /* The permutation on the outgoing lanes (towards SLP parents).  When
-     the node is a materialization point for a permute this differs
-     from perm_in (and is then usually zero).  Materialization happens
-     on the input side.  */
-  int perm_out;
+
+  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
+     partitions are flexible; they can have whichever layout consumers
+     want them to have.  */
+  int partition = -1;
+
+  /* The number of nodes that directly use the result of this one
+     (i.e. the number of nodes that count this one as a child).  */
+  unsigned int out_degree = 0;
+
+  /* The execution frequency of the node.  */
+  sreal weight = 0;
+
+  /* The total execution frequency of all nodes that directly use the
+     result of this one.  */
+  sreal out_weight = 0;
 };
 
-/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
+/* Information about one partition of the SLP graph, for use during
+   vect_optimize_slp_pass.  */
 
-static void
-vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
-			 vec<slpg_vertex> &vertices, vec<int> &leafs)
+struct slpg_partition_info
+{
+  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
+     of m_partitioned_nodes.  */
+  unsigned int node_begin = 0;
+  unsigned int node_end = 0;
+
+  /* Which layout we've chosen to use for this partition, or -1 if
+     we haven't picked one yet.  */
+  int layout = -1;
+
+  /* The number of predecessors and successors in the partition dag.
+     The predecessors always have lower partition numbers and the
+     successors always have higher partition numbers.
+
+     Note that the directions of these edges are not necessarily the
+     same as in the data flow graph.  For example, if an SCC has separate
+     partitions for an inner loop and an outer loop, the inner loop's
+     partition will have at least two incoming edges from the outer loop's
+     partition: one for a live-in value and one for a live-out value.
+     In data flow terms, one of these edges would also be from the outer loop
+     to the inner loop, but the other would be in the opposite direction.  */
+  unsigned int in_degree = 0;
+  unsigned int out_degree = 0;
+};
+
+/* Information about the costs of using a particular layout for a
+   particular partition.  It can also say that the combination is
+   impossible.  */
+
+struct slpg_partition_layout_costs
+{
+  bool is_possible () const { return internal_cost.is_possible (); }
+  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
+
+  /* The costs inherited from predecessor partitions.  */
+  slpg_layout_cost in_cost;
+
+  /* The inherent cost of the layout within the node itself.  For example,
+     this is nonzero for a load if choosing a particular layout would require
+     the load to permute the loaded elements.  It is nonzero for a
+     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
+     to full-vector moves.  */
+  slpg_layout_cost internal_cost;
+
+  /* The costs inherited from successor partitions.  */
+  slpg_layout_cost out_cost;
+};
+
+/* This class tries to optimize the layout of vectors in order to avoid
+   unnecessary shuffling.  At the moment, the set of possible layouts are
+   restricted to bijective permutations.
+
+   The goal of the pass depends on whether we're optimizing for size or
+   for speed.  When optimizing for size, the goal is to reduce the overall
+   number of layout changes (including layout changes implied by things
+   like load permutations).  When optimizing for speed, the goal is to
+   reduce the maximum latency attributable to layout changes on any
+   non-cyclical path through the data flow graph.
+
+   For example, when optimizing a loop nest for speed, we will prefer
+   to make layout changes outside of a loop rather than inside of a loop,
+   and will prefer to make layout changes in parallel rather than serially,
+   even if that increases the overall number of layout changes.
+
+   The high-level procedure is:
+
+   (1) Build a graph in which edges go from uses (parents) to definitions
+       (children).
+
+   (2) Divide the graph into a dag of strongly-connected components (SCCs).
+
+   (3) When optimizing for speed, partition the nodes in each SCC based
+       on their containing cfg loop.  When optimizing for size, treat
+       each SCC as a single partition.
+
+       This gives us a dag of partitions.  The goal is now to assign a
+       layout to each partition.
+
+   (4) Construct a set of vector layouts that are worth considering.
+       Record which nodes must keep their current layout.
+
+   (5) Perform a forward walk over the partition dag (from loads to stores)
+       accumulating the "forward" cost of using each layout.  When visiting
+       each partition, assign a tentative choice of layout to the partition
+       and use that choice when calculating the cost of using a different
+       layout in successor partitions.
+
+   (6) Perform a backward walk over the partition dag (from stores to loads),
+       accumulating the "backward" cost of using each layout.  When visiting
+       each partition, make a final choice of layout for that partition based
+       on the accumulated forward costs (from (5)) and backward costs
+       (from (6)).
+
+   (7) Apply the chosen layouts to the SLP graph.
+
+   For example, consider the SLP statements:
+
+   S1:      a_1 = load
+       loop:
+   S2:      a_2 = PHI<a_1, a_3>
+   S3:      b_1 = load
+   S4:      a_3 = a_2 + b_1
+       exit:
+   S5:      a_4 = PHI<a_3>
+   S6:      store a_4
+
+   S2 and S4 form an SCC and are part of the same loop.  Every other
+   statement is in a singleton SCC.  In this example there is a one-to-one
+   mapping between SCCs and partitions and the partition dag looks like this;
+
+	S1     S3
+	 \     /
+	  S2+S4
+	    |
+	   S5
+	    |
+	   S6
+
+   S2, S3 and S4 will have a higher execution frequency than the other
+   statements, so when optimizing for speed, the goal is to avoid any
+   layout changes:
+
+   - within S3
+   - within S2+S4
+   - on the S3->S2+S4 edge
+
+   For example, if S3 was originally a reversing load, the goal of the
+   pass is to make it an unreversed load and change the layout on the
+   S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
+   on S1->S2+S4 and S5->S6 would also be acceptable.)
+
+   The difference between SCCs and partitions becomes important if we
+   add an outer loop:
+
+   S1:      a_1 = ...
+       loop1:
+   S2:      a_2 = PHI<a_1, a_6>
+   S3:      b_1 = load
+   S4:      a_3 = a_2 + b_1
+       loop2:
+   S5:      a_4 = PHI<a_3, a_5>
+   S6:      c_1 = load
+   S7:      a_5 = a_4 + c_1
+       exit2:
+   S8:      a_6 = PHI<a_5>
+   S9:      store a_6
+       exit1:
+
+   Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
+   for speed, we usually do not want restrictions in the outer loop to "infect"
+   the decision for the inner loop.  For example, if an outer-loop node
+   in the SCC contains a statement with a fixed layout, that should not
+   prevent the inner loop from using a different layout.  Conversely,
+   the inner loop should not dictate a layout to the outer loop: if the
+   outer loop does a lot of computation, then it may not be efficient to
+   do all of that computation in the inner loop's preferred layout.
+
+   So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
+   and S5+S7 (inner).  We also try to arrange partitions so that:
+
+   - the partition for an outer loop comes before the partition for
+     an inner loop
+
+   - if a sibling loop A dominates a sibling loop B, A's partition
+     comes before B's
+
+   This gives the following partition dag for the example above:
+
+	S1        S3
+	 \        /
+	  S2+S4+S8   S6
+	   |   \\    /
+	   |    S5+S7
+	   |
+	  S9
+
+   There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
+   one for a reversal of the edge S7->S8.
+
+   The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
+   for S2+S4+S8 therefore has to balance the cost of using the outer loop's
+   preferred layout against the cost of changing the layout on entry to the
+   inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
+
+   Although this works well when optimizing for speed, it has the downside
+   when optimizing for size that the choice of layout for S5+S7 is completely
+   independent of S9, which lessens the chance of reducing the overall number
+   of permutations.  We therefore do not partition SCCs when optimizing
+   for size.
+
+   To give a concrete example of the difference between optimizing
+   for size and speed, consider:
+
+   a[0] = (b[1] << c[3]) - d[1];
+   a[1] = (b[0] << c[2]) - d[0];
+   a[2] = (b[3] << c[1]) - d[3];
+   a[3] = (b[2] << c[0]) - d[2];
+
+   There are three different layouts here: one for a, one for b and d,
+   and one for c.  When optimizing for speed it is better to permute each
+   of b, c and d into the order required by a, since those permutations
+   happen in parallel.  But when optimizing for size, it is better to:
+
+   - permute c into the same order as b
+   - do the arithmetic
+   - permute the result into the order required by a
+
+   This gives 2 permutations rather than 3.  */
+
+class vect_optimize_slp_pass
+{
+public:
+  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
+  void run ();
+
+private:
+  /* Graph building.  */
+  struct loop *containing_loop (slp_tree);
+  bool is_cfg_latch_edge (graph_edge *);
+  void build_vertices (hash_set<slp_tree> &, slp_tree);
+  void build_vertices ();
+  void build_graph ();
+
+  /* Partitioning.  */
+  void create_partitions ();
+  template<typename T> void for_each_partition_edge (unsigned int, T);
+
+  /* Layout selection.  */
+  bool is_compatible_layout (slp_tree, unsigned int);
+  int change_layout_cost (slp_tree, unsigned int, unsigned int);
+  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
+						       unsigned int);
+  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
+			       int, unsigned int);
+  int internal_node_cost (slp_tree, int, unsigned int);
+  void start_choosing_layouts ();
+
+  /* Cost propagation.  */
+  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
+				     unsigned int, unsigned int);
+  slpg_layout_cost total_in_cost (unsigned int);
+  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
+  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
+  void forward_pass ();
+  void backward_pass ();
+
+  /* Rematerialization.  */
+  slp_tree get_result_with_layout (slp_tree, unsigned int);
+  void materialize ();
+
+  /* Clean-up.  */
+  void remove_redundant_permutations ();
+
+  void dump ();
+
+  vec_info *m_vinfo;
+
+  /* True if we should optimize the graph for size, false if we should
+     optimize it for speed.  (It wouldn't be easy to make this decision
+     more locally.)  */
+  bool m_optimize_size;
+
+  /* A graph of all SLP nodes, with edges leading from uses to definitions.
+     In other words, a node's predecessors are its slp_tree parents and
+     a node's successors are its slp_tree children.  */
+  graph *m_slpg = nullptr;
+
+  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
+  auto_vec<slpg_vertex> m_vertices;
+
+  /* The list of all leaves of M_SLPG. such as external definitions, constants,
+     and loads.  */
+  auto_vec<int> m_leafs;
+
+  /* This array has one entry for every vector layout that we're considering.
+     Element 0 is null and indicates "no change".  Other entries describe
+     permutations that are inherent in the current graph and that we would
+     like to reverse if possible.
+
+     For example, a permutation { 1, 2, 3, 0 } means that something has
+     effectively been permuted in that way, such as a load group
+     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
+     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
+     in order to put things "back" in order.  */
+  auto_vec<vec<unsigned> > m_perms;
+
+  /* A partitioning of the nodes for which a layout must be chosen.
+     Each partition represents an <SCC, cfg loop> pair; that is,
+     nodes in different SCCs belong to different partitions, and nodes
+     within an SCC can be further partitioned according to a containing
+     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
+
+     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
+       from leaves (such as loads) to roots (such as stores).
+
+     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
+  auto_vec<slpg_partition_info> m_partitions;
+
+  /* The list of all nodes for which a layout must be chosen.  Nodes for
+     partition P come before the nodes for partition P+1.  Nodes within a
+     partition are in reverse postorder.  */
+  auto_vec<unsigned int> m_partitioned_nodes;
+
+  /* Index P * num-layouts + L contains the cost of using layout L
+     for partition P.  */
+  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
+
+  /* Index N * num-layouts + L, if nonnull, is a node that provides the
+     original output of node N adjusted to have layout L.  */
+  auto_vec<slp_tree> m_node_layouts;
+};
+
+/* Fill the vertices and leafs vector with all nodes in the SLP graph.
+   Also record whether we should optimize anything for speed rather
+   than size.  */
+
+void
+vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
+					slp_tree node)
 {
   unsigned i;
   slp_tree child;
@@ -3561,8 +4021,15 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
   if (visited.add (node))
     return;
 
-  node->vertex = vertices.length ();
-  vertices.safe_push (slpg_vertex (node));
+  if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
+    {
+      basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
+      if (optimize_bb_for_speed_p (bb))
+	m_optimize_size = false;
+    }
+
+  node->vertex = m_vertices.length ();
+  m_vertices.safe_push (slpg_vertex (node));
 
   bool leaf = true;
   bool force_leaf = false;
@@ -3570,7 +4037,7 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
     if (child)
       {
 	leaf = false;
-	vect_slp_build_vertices (visited, child, vertices, leafs);
+	build_vertices (visited, child);
       }
     else
       force_leaf = true;
@@ -3580,21 +4047,19 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
      and inductions.  Force those SLP PHIs to act as leafs to make them
      backwards reachable.  */
   if (leaf || force_leaf)
-    leafs.safe_push (node->vertex);
+    m_leafs.safe_push (node->vertex);
 }
 
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
 
-static void
-vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
-			 vec<int> &leafs)
+void
+vect_optimize_slp_pass::build_vertices ()
 {
   hash_set<slp_tree> visited;
   unsigned i;
   slp_instance instance;
-  FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
-    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
-			     leafs);
+  FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
+    build_vertices (visited, SLP_INSTANCE_TREE (instance));
 }
 
 /* Apply (reverse) bijectite PERM to VEC.  */
@@ -3625,75 +4090,457 @@ vect_slp_permute (vec<unsigned> perm,
     }
 }
 
-/* Return whether permutations PERM_A and PERM_B as recorded in the
-   PERMS vector are equal.  */
+/* Return the cfg loop that contains NODE.  */
 
-static bool
-vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
-		   int perm_a, int perm_b)
+struct loop *
+vect_optimize_slp_pass::containing_loop (slp_tree node)
 {
-  return (perm_a == perm_b
-	  || (perm_a != -1 && perm_b != -1
-	      && perms[perm_a].length () == perms[perm_b].length ()
-	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
-			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
+  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+  if (!rep)
+    return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
+  return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
 }
 
-/* Optimize the SLP graph of VINFO.  */
+/* Return true if UD (an edge from a use to a definition) is associated
+   with a loop latch edge in the cfg.  */
 
-void
-vect_optimize_slp (vec_info *vinfo)
+bool
+vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
 {
-  if (vinfo->slp_instances.is_empty ())
-    return;
+  slp_tree use = m_vertices[ud->src].node;
+  slp_tree def = m_vertices[ud->dest].node;
+  if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
+      || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
+    return false;
 
-  slp_tree node;
-  unsigned i;
-  auto_vec<slpg_vertex> vertices;
-  auto_vec<int> leafs;
-  vect_slp_build_vertices (vinfo, vertices, leafs);
+  stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
+  return (is_a<gphi *> (use_rep->stmt)
+	  && bb_loop_header_p (gimple_bb (use_rep->stmt))
+	  && containing_loop (def) == containing_loop (use));
+}
+
+/* Build the graph.  Mark edges that correspond to cfg loop latch edges with
+   a nonnull data field.  */
+
+void
+vect_optimize_slp_pass::build_graph ()
+{
+  m_optimize_size = true;
+  build_vertices ();
 
-  struct graph *slpg = new_graph (vertices.length ());
-  for (slpg_vertex &v : vertices)
+  m_slpg = new_graph (m_vertices.length ());
+  for (slpg_vertex &v : m_vertices)
     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
       if (child)
-	add_edge (slpg, v.node->vertex, child->vertex);
+	{
+	  graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
+	  if (is_cfg_latch_edge (ud))
+	    ud->data = this;
+	}
+}
+
+/* Return true if E corresponds to a loop latch edge in the cfg.  */
 
-  /* Compute (reverse) postorder on the inverted graph.  */
-  auto_vec<int> ipo;
-  graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
+static bool
+skip_cfg_latch_edges (graph_edge *e)
+{
+  return e->data;
+}
 
-  auto_vec<vec<unsigned> > perms;
-  perms.safe_push (vNULL); /* zero is no permute */
+/* Create the node partitions.  */
 
-  /* Produce initial permutations.  */
-  for (i = 0; i < leafs.length (); ++i)
+void
+vect_optimize_slp_pass::create_partitions ()
+{
+  /* Calculate a postorder of the graph, ignoring edges that correspond
+     to natural latch edges in the cfg.  Reading the vector from the end
+     to the beginning gives the reverse postorder.  */
+  auto_vec<int> initial_rpo;
+  graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
+	       false, NULL, skip_cfg_latch_edges);
+  gcc_assert (initial_rpo.length () == m_vertices.length ());
+
+  /* Calculate the strongly connected components of the graph.  */
+  auto_vec<int> scc_grouping;
+  unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
+
+  /* Create a new index order in which all nodes from the same SCC are
+     consecutive.  Use scc_pos to record the index of the first node in
+     each SCC.  */
+  auto_vec<unsigned int> scc_pos (num_sccs);
+  int last_component = -1;
+  unsigned int node_count = 0;
+  for (unsigned int node_i : scc_grouping)
+    {
+      if (last_component != m_slpg->vertices[node_i].component)
+	{
+	  last_component = m_slpg->vertices[node_i].component;
+	  gcc_assert (last_component == int (scc_pos.length ()));
+	  scc_pos.quick_push (node_count);
+	}
+      node_count += 1;
+    }
+  gcc_assert (node_count == initial_rpo.length ()
+	      && last_component + 1 == int (num_sccs));
+
+  /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
+     inside each SCC following the RPO we calculated above.  The fact that
+     we ignored natural latch edges when calculating the RPO should ensure
+     that, for natural loop nests:
+
+     - the first node that we encounter in a cfg loop is the loop header phi
+     - the loop header phis are in dominance order
+
+     Arranging for this is an optimization (see below) rather than a
+     correctness issue.  Unnatural loops with a tangled mess of backedges
+     will still work correctly, but might give poorer results.
+
+     Also update scc_pos so that it gives 1 + the index of the last node
+     in the SCC.  */
+  m_partitioned_nodes.safe_grow (node_count);
+  for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
+    {
+      unsigned int node_i = initial_rpo[old_i];
+      unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
+      m_partitioned_nodes[new_i] = node_i;
+    }
+
+  /* When optimizing for speed, partition each SCC based on the containing
+     cfg loop. The order we constructed above should ensure that, for natural
+     cfg loops, we'll create sub-SCC partitions for outer loops before
+     the corresponding sub-SCC partitions for inner loops.  Similarly,
+     when one sibling loop A dominates another sibling loop B, we should
+     create a sub-SCC partition for A before a sub-SCC partition for B.
+
+     As above, nothing depends for correctness on whether this achieves
+     a natural nesting, but we should get better results when it does.  */
+  m_partitions.reserve (m_vertices.length ());
+  unsigned int next_partition_i = 0;
+  hash_map<struct loop *, int> loop_partitions;
+  unsigned int rpo_begin = 0;
+  unsigned int num_partitioned_nodes = 0;
+  for (unsigned int rpo_end : scc_pos)
+    {
+      loop_partitions.empty ();
+      unsigned int partition_i = next_partition_i;
+      for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
+	{
+	  /* Handle externals and constants optimistically throughout.
+	     But treat existing vectors as fixed since we do not handle
+	     permuting them.  */
+	  unsigned int node_i = m_partitioned_nodes[rpo_i];
+	  auto &vertex = m_vertices[node_i];
+	  if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
+	       && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
+	      || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
+	    vertex.partition = -1;
+	  else
+	    {
+	      bool existed;
+	      if (m_optimize_size)
+		existed = next_partition_i > partition_i;
+	      else
+		{
+		  struct loop *loop = containing_loop (vertex.node);
+		  auto &entry = loop_partitions.get_or_insert (loop, &existed);
+		  if (!existed)
+		    entry = next_partition_i;
+		  partition_i = entry;
+		}
+	      if (!existed)
+		{
+		  m_partitions.quick_push (slpg_partition_info ());
+		  next_partition_i += 1;
+		}
+	      vertex.partition = partition_i;
+	      num_partitioned_nodes += 1;
+	      m_partitions[partition_i].node_end += 1;
+	    }
+	}
+      rpo_begin = rpo_end;
+    }
+
+  /* Assign ranges of consecutive node indices to each partition,
+     in partition order.  Start with node_end being the same as
+     node_begin so that the next loop can use it as a counter.  */
+  unsigned int node_begin = 0;
+  for (auto &partition : m_partitions)
     {
-      int idx = leafs[i];
-      slp_tree node = vertices[idx].node;
+      partition.node_begin = node_begin;
+      node_begin += partition.node_end;
+      partition.node_end = partition.node_begin;
+    }
+  gcc_assert (node_begin == num_partitioned_nodes);
 
-      /* Handle externals and constants optimistically throughout the
-	 iteration.  But treat existing vectors as fixed since we
-	 do not handle permuting them below.  */
-      if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
-	   && !SLP_TREE_VEC_DEFS (node).exists ())
-	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
-	continue;
+  /* Finally build the list of nodes in partition order.  */
+  m_partitioned_nodes.truncate (num_partitioned_nodes);
+  for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
+    {
+      int partition_i = m_vertices[node_i].partition;
+      if (partition_i >= 0)
+	{
+	  unsigned int order_i = m_partitions[partition_i].node_end++;
+	  m_partitioned_nodes[order_i] = node_i;
+	}
+    }
+}
+
+/* Look for edges from earlier partitions into node NODE_I and edges from
+   node NODE_I into later partitions.  Call:
+
+      FN (ud, other_node_i)
+
+   for each such use-to-def edge ud, where other_node_i is the node at the
+   other end of the edge.  */
+
+template<typename T>
+void
+vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
+{
+  int partition_i = m_vertices[node_i].partition;
+  for (graph_edge *pred = m_slpg->vertices[node_i].pred;
+       pred; pred = pred->pred_next)
+    {
+      int src_partition_i = m_vertices[pred->src].partition;
+      if (src_partition_i >= 0 && src_partition_i != partition_i)
+	fn (pred, pred->src);
+    }
+  for (graph_edge *succ = m_slpg->vertices[node_i].succ;
+       succ; succ = succ->succ_next)
+    {
+      int dest_partition_i = m_vertices[succ->dest].partition;
+      if (dest_partition_i >= 0 && dest_partition_i != partition_i)
+	fn (succ, succ->dest);
+    }
+}
+
+/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
+   that NODE would operate on.  This test is independent of NODE's actual
+   operation.  */
+
+bool
+vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
+					      unsigned int layout_i)
+{
+  if (layout_i == 0)
+    return true;
+
+  /* SLP_TREE_LANES is zero for existing vectors, but those only support
+     layout 0 anyway.  */
+  if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
+    return false;
+
+  return true;
+}
+
+/* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
+   to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
+   layouts is incompatible with NODE or if the change is not possible for
+   some other reason.
+
+   The properties taken from NODE include the number of lanes and the
+   vector type.  The actual operation doesn't matter.  */
+
+int
+vect_optimize_slp_pass::change_layout_cost (slp_tree node,
+					    unsigned int from_layout_i,
+					    unsigned int to_layout_i)
+{
+  if (!is_compatible_layout (node, from_layout_i)
+      || !is_compatible_layout (node, to_layout_i))
+    return -1;
+
+  if (from_layout_i == to_layout_i)
+    return 0;
+
+  auto_vec<slp_tree, 1> children (1);
+  children.quick_push (node);
+  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
+  if (from_layout_i > 0)
+    for (unsigned int i : m_perms[from_layout_i])
+      perm.quick_push ({ 0, i });
+  else
+    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
+      perm.quick_push ({ 0, i });
+  if (to_layout_i > 0)
+    vect_slp_permute (m_perms[to_layout_i], perm, true);
+  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
+					       children, false);
+  if (count >= 0)
+    return MAX (count, 1);
+
+  /* ??? In principle we could try changing via layout 0, giving two
+     layout changes rather than 1.  Doing that would require
+     corresponding support in get_result_with_layout.  */
+  return -1;
+}
+
+/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
+
+inline slpg_partition_layout_costs &
+vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
+						unsigned int layout_i)
+{
+  return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
+}
+
+/* Change PERM in one of two ways:
+
+   - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
+     chosen for child I of NODE.
+
+   - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
+
+   In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
+
+void
+vect_optimize_slp_pass::
+change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
+			int in_layout_i, unsigned int out_layout_i)
+{
+  for (auto &entry : perm)
+    {
+      int this_in_layout_i = in_layout_i;
+      if (this_in_layout_i < 0)
+	{
+	  slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
+	  unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
+	  this_in_layout_i = m_partitions[in_partition_i].layout;
+	}
+      if (this_in_layout_i > 0)
+	entry.second = m_perms[this_in_layout_i][entry.second];
+    }
+  if (out_layout_i > 0)
+    vect_slp_permute (m_perms[out_layout_i], perm, true);
+}
 
-      /* Leafs do not change across iterations.  Note leafs also double
-	 as entries to the reverse graph.  */
-      if (!slpg->vertices[idx].succ)
+/* Check whether the target allows NODE to be rearranged so that the node's
+   output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
+   in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
+
+   If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
+   NODE can adapt to the layout changes that have (perhaps provisionally)
+   been chosen for NODE's children, so that no extra permutations are
+   needed on either the input or the output of NODE.
+
+   If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
+   that all inputs will be forced into layout IN_LAYOUT_I beforehand.
+
+   IN_LAYOUT_I has no meaning for other types of node.
+
+   Keeping the node as-is is always valid.  If the target doesn't appear to
+   support the node as-is then layout 0 has a high and arbitrary cost instead
+   of being invalid.  On the one hand, this ensures that every node has at
+   least one valid layout, avoiding what would otherwise be an awkward
+   special case.  On the other, it still encourages the pass to change
+   an invalid pre-existing layout choice into a valid one.  */
+
+int
+vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
+					    unsigned int out_layout_i)
+{
+  const int fallback_cost = 100;
+
+  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+    {
+      auto_lane_permutation_t tmp_perm;
+      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
+
+      /* Check that the child nodes support the chosen layout.  Checking
+	 the first child is enough, since any second child would have the
+	 same shape.  */
+      if (in_layout_i > 0
+	  && !is_compatible_layout (SLP_TREE_CHILDREN (node)[0], in_layout_i))
+	return -1;
+
+      change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
+      int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
+						  node, tmp_perm,
+						  SLP_TREE_CHILDREN (node),
+						  false);
+      if (count < 0)
 	{
-	  vertices[idx].perm_in = 0;
-	  vertices[idx].perm_out = 0;
+	  if (in_layout_i == 0 && out_layout_i == 0)
+	    return fallback_cost;
+	  return -1;
 	}
 
+      /* We currently have no way of telling whether the new layout is cheaper
+	 or more expensive than the old one.  But at least in principle,
+	 it should be worth making zero permutations (whole-vector shuffles)
+	 cheaper than real permutations, in case the pass is able to remove
+	 the latter.  */
+      return count == 0 ? 0 : 1;
+    }
+
+  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+  if (rep
+      && STMT_VINFO_DATA_REF (rep)
+      && DR_IS_READ (STMT_VINFO_DATA_REF (rep)))
+    {
+      auto_load_permutation_t tmp_perm;
+      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
+      if (out_layout_i > 0)
+	vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
+
+      poly_uint64 vf = 1;
+      if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
+	vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      unsigned int n_perms;
+      if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
+					   nullptr, vf, true, false, &n_perms))
+	{
+	  if (out_layout_i == 0)
+	    return fallback_cost;
+	  return -1;
+	}
+
+      /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
+      return n_perms == 0 ? 0 : 1;
+    }
+
+  return 0;
+}
+
+/* Decide which element layouts we should consider using.  Calculate the
+   weights associated with inserting layout changes on partition edges.
+   Also mark partitions that cannot change layout, by setting their
+   layout to zero.  */
+
+void
+vect_optimize_slp_pass::start_choosing_layouts ()
+{
+  /* Used to assign unique permutation indices.  */
+  using perm_hash = unbounded_hashmap_traits<
+    vec_free_hash_base<int_hash_base<unsigned>>,
+    int_hash<int, -1, -2>
+  >;
+  hash_map<vec<unsigned>, int, perm_hash> layout_ids;
+
+  /* Layout 0 is "no change".  */
+  m_perms.safe_push (vNULL);
+
+  /* Create layouts from existing permutations.  */
+  for (unsigned int node_i : m_leafs)
+    {
+      auto &vertex = m_vertices[node_i];
+      if (vertex.partition < 0)
+	continue;
+
+      /* Leafs also double as entries to the reverse graph.  Allow the
+	 layout of those to be changed.  */
+      auto &partition = m_partitions[vertex.partition];
+      if (!m_slpg->vertices[node_i].succ)
+	partition.layout = 0;
+
       /* Loads are the only thing generating permutes.  */
+      slp_tree node = vertex.node;
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
 	continue;
 
-      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
-	 node unpermuted, record this permute.  */
+      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
+	 unpermuted, record a layout that reverses this permutation.  */
+      gcc_assert (partition.layout == 0);
       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
       if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
 	continue;
@@ -3708,13 +4555,18 @@ vect_optimize_slp (vec_info *vinfo)
 	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
 	    any_permute = true;
 	}
-      /* If there's no permute no need to split one out.  */
-      if (!any_permute)
-	continue;
       /* If the span doesn't match we'd disrupt VF computation, avoid
 	 that for now.  */
       if (imax - imin + 1 != SLP_TREE_LANES (node))
 	continue;
+      /* If there's no permute no need to split one out.  In this case
+	 we can consider turning the load into a permuted load, if that
+	 turns out to be cheaper than alternatives.  */
+      if (!any_permute)
+	{
+	  partition.layout = -1;
+	  continue;
+	}
 
       /* For now only handle true permutes, like
 	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
@@ -3735,407 +4587,742 @@ vect_optimize_slp (vec_info *vinfo)
       perm.safe_grow (SLP_TREE_LANES (node), true);
       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
 	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
-      perms.safe_push (perm);
-      vertices[idx].perm_in = perms.length () - 1;
-      vertices[idx].perm_out = perms.length () - 1;
+
+      if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
+	{
+	  /* Continue to use existing layouts, but don't add any more.  */
+	  int *entry = layout_ids.get (perm);
+	  partition.layout = entry ? *entry : 0;
+	  perm.release ();
+	}
+      else
+	{
+	  bool existed;
+	  int &layout_i = layout_ids.get_or_insert (perm, &existed);
+	  if (existed)
+	    perm.release ();
+	  else
+	    {
+	      layout_i = m_perms.length ();
+	      m_perms.safe_push (perm);
+	    }
+	  partition.layout = layout_i;
+	}
     }
 
-  /* In addition to the above we have to mark outgoing permutes facing
-     non-reduction graph entries that are not represented as to be
-     materialized.  */
-  for (slp_instance instance : vinfo->slp_instances)
+  /* Initially assume that every layout is possible and has zero cost
+     in every partition.  */
+  m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
+					      * m_perms.length ());
+
+  /* We have to mark outgoing permutations facing non-reduction graph
+     entries that are not represented as to be materialized.  */
+  for (slp_instance instance : m_vinfo->slp_instances)
     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
       {
-	/* Just setting perm_out isn't enough for the propagation to
-	   pick this up.  */
-	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
-	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
+	unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
+	m_partitions[m_vertices[node_i].partition].layout = 0;
       }
 
-  /* Propagate permutes along the graph and compute materialization points.  */
-  bool changed;
-  bool do_materialization = false;
-  unsigned iteration = 0;
-  do
+  /* Check which layouts each node and partition can handle.  Calculate the
+     weights associated with inserting layout changes on edges.  */
+  for (unsigned int node_i : m_partitioned_nodes)
     {
-      changed = false;
-      ++iteration;
+      auto &vertex = m_vertices[node_i];
+      auto &partition = m_partitions[vertex.partition];
+      slp_tree node = vertex.node;
 
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "SLP optimize iteration %d\n", iteration);
+      if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
+	{
+	  vertex.weight = vect_slp_node_weight (node);
+
+	  /* We do not handle stores with a permutation, so all
+	     incoming permutations must have been materialized.  */
+	  if (STMT_VINFO_DATA_REF (rep)
+	      && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
+	    /* ???  We're forcing materialization in place
+	       of the child here, we'd need special handling
+	       in materialization to leave layout -1 here.  */
+	    partition.layout = 0;
+
+	  /* We cannot change the layout of an operation that is
+	     not independent on lanes.  Note this is an explicit
+	     negative list since that's much shorter than the respective
+	     positive one but it's critical to keep maintaining it.  */
+	  if (is_gimple_call (STMT_VINFO_STMT (rep)))
+	    switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
+	      {
+	      case CFN_COMPLEX_ADD_ROT90:
+	      case CFN_COMPLEX_ADD_ROT270:
+	      case CFN_COMPLEX_MUL:
+	      case CFN_COMPLEX_MUL_CONJ:
+	      case CFN_VEC_ADDSUB:
+	      case CFN_VEC_FMADDSUB:
+	      case CFN_VEC_FMSUBADD:
+		partition.layout = 0;
+	      default:;
+	      }
+	}
 
-      for (i = vertices.length (); i > 0 ; --i)
+      auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
 	{
-	  int idx = ipo[i-1];
-	  slp_tree node = vertices[idx].node;
+	  auto &other_vertex = m_vertices[other_node_i];
 
-	  /* Handle externals and constants optimistically throughout the
-	     iteration.  */
-	  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
-	      || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
+	  /* Count the number of edges from earlier partitions and the number
+	     of edges to later partitions.  */
+	  if (other_vertex.partition < vertex.partition)
+	    partition.in_degree += 1;
+	  else
+	    partition.out_degree += 1;
+
+	  /* If the current node uses the result of OTHER_NODE_I, accumulate
+	     the effects of that.  */
+	  if (ud->src == int (node_i))
+	    {
+	      other_vertex.out_weight += vertex.weight;
+	      other_vertex.out_degree += 1;
+	    }
+	};
+      for_each_partition_edge (node_i, process_edge);
+    }
+}
+
+/* Return the incoming costs for node NODE_I, assuming that each input keeps
+   its current (provisional) choice of layout.  The inputs do not necessarily
+   have the same layout as each other.  */
+
+slpg_layout_cost
+vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
+{
+  auto &vertex = m_vertices[node_i];
+  slpg_layout_cost cost;
+  auto add_cost = [&](graph_edge *, unsigned int other_node_i)
+    {
+      auto &other_vertex = m_vertices[other_node_i];
+      if (other_vertex.partition < vertex.partition)
+	{
+	  auto &other_partition = m_partitions[other_vertex.partition];
+	  auto &other_costs = partition_layout_costs (other_vertex.partition,
+						      other_partition.layout);
+	  slpg_layout_cost this_cost = other_costs.in_cost;
+	  this_cost.add_serial_cost (other_costs.internal_cost);
+	  this_cost.split (other_partition.out_degree);
+	  cost.add_parallel_cost (this_cost);
+	}
+    };
+  for_each_partition_edge (node_i, add_cost);
+  return cost;
+}
+
+/* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
+   and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
+   slpg_layout_cost::impossible () if the change isn't possible.  */
+
+slpg_layout_cost
+vect_optimize_slp_pass::
+edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
+		  unsigned int layout2_i)
+{
+  auto &def_vertex = m_vertices[ud->dest];
+  auto &use_vertex = m_vertices[ud->src];
+  auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
+  auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
+  auto factor = change_layout_cost (def_vertex.node, def_layout_i,
+				    use_layout_i);
+  if (factor < 0)
+    return slpg_layout_cost::impossible ();
+
+  /* We have a choice of putting the layout change at the site of the
+     definition or at the site of the use.  Prefer the former when
+     optimizing for size or when the execution frequency of the
+     definition is no greater than the combined execution frequencies of
+     the uses.  When putting the layout change at the site of the definition,
+     divvy up the cost among all consumers.  */
+  if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
+    {
+      slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
+      cost.split (def_vertex.out_degree);
+      return cost;
+    }
+  return { use_vertex.weight * factor, m_optimize_size };
+}
+
+/* UD represents a use-def link between FROM_NODE_I and a node in a later
+   partition; FROM_NODE_I could be the definition node or the use node.
+   The node at the other end of the link wants to use layout TO_LAYOUT_I.
+   Return the cost of any necessary fix-ups on edge UD, or return
+   slpg_layout_cost::impossible () if the change isn't possible.
+
+   At this point, FROM_NODE_I's partition has chosen the cheapest
+   layout based on the information available so far, but this choice
+   is only provisional.  */
+
+slpg_layout_cost
+vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
+				      unsigned int to_layout_i)
+{
+  auto &from_vertex = m_vertices[from_node_i];
+  unsigned int from_partition_i = from_vertex.partition;
+  slpg_partition_info &from_partition = m_partitions[from_partition_i];
+  gcc_assert (from_partition.layout >= 0);
+
+  /* First calculate the cost on the assumption that FROM_PARTITION sticks
+     with its current layout preference.  */
+  slpg_layout_cost cost = slpg_layout_cost::impossible ();
+  auto edge_cost = edge_layout_cost (ud, from_node_i,
+				     from_partition.layout, to_layout_i);
+  if (edge_cost.is_possible ())
+    {
+      auto &from_costs = partition_layout_costs (from_partition_i,
+						 from_partition.layout);
+      cost = from_costs.in_cost;
+      cost.add_serial_cost (from_costs.internal_cost);
+      cost.split (from_partition.out_degree);
+      cost.add_serial_cost (edge_cost);
+    }
+
+  /* Take the minimum of that cost and the cost that applies if
+     FROM_PARTITION instead switches to TO_LAYOUT_I.  */
+  auto &direct_layout_costs = partition_layout_costs (from_partition_i,
+						      to_layout_i);
+  if (direct_layout_costs.is_possible ())
+    {
+      slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
+      direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
+      direct_cost.split (from_partition.out_degree);
+      if (!cost.is_possible ()
+	  || direct_cost.is_better_than (cost, m_optimize_size))
+	cost = direct_cost;
+    }
+
+  return cost;
+}
+
+/* UD represents a use-def link between TO_NODE_I and a node in an earlier
+   partition; TO_NODE_I could be the definition node or the use node.
+   The node at the other end of the link wants to use layout FROM_LAYOUT_I;
+   return the cost of any necessary fix-ups on edge UD, or
+   slpg_layout_cost::impossible () if the choice cannot be made.
+
+   At this point, TO_NODE_I's partition has a fixed choice of layout.  */
+
+slpg_layout_cost
+vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
+				       unsigned int from_layout_i)
+{
+  auto &to_vertex = m_vertices[to_node_i];
+  unsigned int to_partition_i = to_vertex.partition;
+  slpg_partition_info &to_partition = m_partitions[to_partition_i];
+  gcc_assert (to_partition.layout >= 0);
+
+  /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
+     adjusted for this input having layout FROM_LAYOUT_I.  Assume that
+     any other inputs keep their current choice of layout.  */
+  auto &to_costs = partition_layout_costs (to_partition_i,
+					   to_partition.layout);
+  if (ud->src == int (to_node_i)
+      && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
+    {
+      auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
+      auto old_layout = from_partition.layout;
+      from_partition.layout = from_layout_i;
+      int factor = internal_node_cost (to_vertex.node, -1,
+				       to_partition.layout);
+      from_partition.layout = old_layout;
+      if (factor >= 0)
+	{
+	  slpg_layout_cost cost = to_costs.out_cost;
+	  cost.add_serial_cost ({ to_vertex.weight * factor,
+				  m_optimize_size });
+	  cost.split (to_partition.in_degree);
+	  return cost;
+	}
+    }
+
+  /* Compute the cost if we insert any necessary layout change on edge UD.  */
+  auto edge_cost = edge_layout_cost (ud, to_node_i,
+				     to_partition.layout, from_layout_i);
+  if (edge_cost.is_possible ())
+    {
+      slpg_layout_cost cost = to_costs.out_cost;
+      cost.add_serial_cost (to_costs.internal_cost);
+      cost.split (to_partition.in_degree);
+      cost.add_serial_cost (edge_cost);
+      return cost;
+    }
+
+  return slpg_layout_cost::impossible ();
+}
+
+/* Make a forward pass through the partitions, accumulating input costs.
+   Make a tentative (provisional) choice of layout for each partition,
+   ensuring that this choice still allows later partitions to keep
+   their original layout.  */
+
+void
+vect_optimize_slp_pass::forward_pass ()
+{
+  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+       ++partition_i)
+    {
+      auto &partition = m_partitions[partition_i];
+
+      /* If the partition consists of a single VEC_PERM_EXPR, precompute
+	 the incoming cost that would apply if every predecessor partition
+	 keeps its current layout.  This is used within the loop below.  */
+      slpg_layout_cost in_cost;
+      slp_tree single_node = nullptr;
+      if (partition.node_end == partition.node_begin + 1)
+	{
+	  unsigned int node_i = m_partitioned_nodes[partition.node_begin];
+	  single_node = m_vertices[node_i].node;
+	  if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+	    in_cost = total_in_cost (node_i);
+	}
+
+      /* Go through the possible layouts.  Decide which ones are valid
+	 for this partition and record which of the valid layouts has
+	 the lowest cost.  */
+      unsigned int min_layout_i = 0;
+      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
+      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+	{
+	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+	  if (!layout_costs.is_possible ())
 	    continue;
 
-	  /* We still eventually have failed backedge SLP nodes in the
-	     graph, those are only cancelled when analyzing operations.
-	     Simply treat them as transparent ops, propagating permutes
-	     through them.  */
-	  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
+	  /* If the recorded layout is already 0 then the layout cannot
+	     change.  */
+	  if (partition.layout == 0 && layout_i != 0)
 	    {
-	      /* We do not handle stores with a permutation, so all
-		 incoming permutes must have been materialized.  */
-	      stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
-	      if (STMT_VINFO_DATA_REF (rep)
-		  && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
-		{
-		  /* ???  We're forcing materialization in place
-		     of the child here, we'd need special handling
-		     in materialization to leave perm_in -1 here.  */
-		  vertices[idx].perm_in = 0;
-		  vertices[idx].perm_out = 0;
-		}
-	      /* We cannot move a permute across an operation that is
-		 not independent on lanes.  Note this is an explicit
-		 negative list since that's much shorter than the respective
-		 positive one but it's critical to keep maintaining it.  */
-	      if (is_gimple_call (STMT_VINFO_STMT (rep)))
-		switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
-		  {
-		  case CFN_COMPLEX_ADD_ROT90:
-		  case CFN_COMPLEX_ADD_ROT270:
-		  case CFN_COMPLEX_MUL:
-		  case CFN_COMPLEX_MUL_CONJ:
-		  case CFN_VEC_ADDSUB:
-		  case CFN_VEC_FMADDSUB:
-		  case CFN_VEC_FMSUBADD:
-		    vertices[idx].perm_in = 0;
-		    vertices[idx].perm_out = 0;
-		  default:;
-		  }
+	      layout_costs.mark_impossible ();
+	      continue;
 	    }
 
-	  if (!slpg->vertices[idx].succ)
-	    /* Pick up pre-computed leaf values.  */
-	    ;
-	  else
+	  bool is_possible = true;
+	  for (unsigned int order_i = partition.node_begin;
+	       order_i < partition.node_end; ++order_i)
 	    {
-	      bool any_succ_perm_out_m1 = false;
-	      int perm_in = vertices[idx].perm_in;
-	      for (graph_edge *succ = slpg->vertices[idx].succ;
-		   succ; succ = succ->succ_next)
+	      unsigned int node_i = m_partitioned_nodes[order_i];
+	      auto &vertex = m_vertices[node_i];
+
+	      /* Reject the layout if it is individually incompatible
+		 with any node in the partition.  */
+	      if (!is_compatible_layout (vertex.node, layout_i))
 		{
-		  int succ_idx = succ->dest;
-		  int succ_perm = vertices[succ_idx].perm_out;
-		  /* Handle unvisited (and constant) nodes optimistically.  */
-		  /* ???  But for constants once we want to handle
-		     non-bijective permutes we have to verify the permute,
-		     when unifying lanes, will not unify different constants.
-		     For example see gcc.dg/vect/bb-slp-14.c for a case
-		     that would break.  */
-		  if (succ_perm == -1)
-		    {
-		      /* When we handled a non-leaf optimistically, note
-			 that so we can adjust its outgoing permute below.  */
-		      slp_tree succ_node = vertices[succ_idx].node;
-		      if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
-			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
-			any_succ_perm_out_m1 = true;
-		      continue;
-		    }
-		  if (perm_in == -1)
-		    perm_in = succ_perm;
-		  else if (succ_perm == 0
-			   || !vect_slp_perms_eq (perms, perm_in, succ_perm))
-		    {
-		      perm_in = 0;
-		      break;
-		    }
+		  is_possible = false;
+		  break;
 		}
 
-	      /* Adjust any incoming permutes we treated optimistically.  */
-	      if (perm_in != -1 && any_succ_perm_out_m1)
+	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
 		{
-		  for (graph_edge *succ = slpg->vertices[idx].succ;
-		       succ; succ = succ->succ_next)
+		  auto &other_vertex = m_vertices[other_node_i];
+		  if (other_vertex.partition < vertex.partition)
 		    {
-		      slp_tree succ_node = vertices[succ->dest].node;
-		      if (vertices[succ->dest].perm_out == -1
-			  && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
-			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
-			{
-			  vertices[succ->dest].perm_out = perm_in;
-			  /* And ensure this propagates.  */
-			  if (vertices[succ->dest].perm_in == -1)
-			    vertices[succ->dest].perm_in = perm_in;
-			}
+		      /* Accumulate the incoming costs from earlier
+			 partitions, plus the cost of any layout changes
+			 on UD itself.  */
+		      auto cost = forward_cost (ud, other_node_i, layout_i);
+		      if (!cost.is_possible ())
+			is_possible = false;
+		      else
+			layout_costs.in_cost.add_parallel_cost (cost);
 		    }
-		  changed = true;
+		  else
+		    /* Reject the layout if it would make layout 0 impossible
+		       for later partitions.  This amounts to testing that the
+		       target supports reversing the layout change on edges
+		       to later partitions.
+
+		       In principle, it might be possible to push a layout
+		       change all the way down a graph, so that it never
+		       needs to be reversed and so that the target doesn't
+		       need to support the reverse operation.  But it would
+		       be awkward to bail out if we hit a partition that
+		       does not support the new layout, especially since
+		       we are not dealing with a lattice.  */
+		    is_possible &= edge_layout_cost (ud, other_node_i, 0,
+						     layout_i).is_possible ();
+		};
+	      for_each_partition_edge (node_i, add_cost);
+
+	      /* Accumulate the cost of using LAYOUT_I within NODE,
+		 both for the inputs and the outputs.  */
+	      int factor = internal_node_cost (vertex.node, layout_i,
+					       layout_i);
+	      if (factor < 0)
+		{
+		  is_possible = false;
+		  break;
 		}
+	      else if (factor)
+		layout_costs.internal_cost.add_serial_cost
+		  ({ vertex.weight * factor, m_optimize_size });
+	    }
+	  if (!is_possible)
+	    {
+	      layout_costs.mark_impossible ();
+	      continue;
+	    }
 
-	      if (!vect_slp_perms_eq (perms, perm_in,
-				      vertices[idx].perm_in))
+	  /* Combine the incoming and partition-internal costs.  */
+	  slpg_layout_cost combined_cost = layout_costs.in_cost;
+	  combined_cost.add_serial_cost (layout_costs.internal_cost);
+
+	  /* If this partition consists of a single VEC_PERM_EXPR, see
+	     if the VEC_PERM_EXPR can be changed to support output layout
+	     LAYOUT_I while keeping all the provisional choices of input
+	     layout.  */
+	  if (single_node
+	      && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
+	    {
+	      int factor = internal_node_cost (single_node, -1, layout_i);
+	      if (factor >= 0)
 		{
-		  /* Make sure we eventually converge.  */
-		  gcc_checking_assert (vertices[idx].perm_in == -1
-				       || perm_in == 0);
-		  vertices[idx].perm_in = perm_in;
-
-		  /* While we can handle VEC_PERM nodes as transparent
-		     pass-through they can be a cheap materialization
-		     point as well.  In addition they can act as source
-		     of a random permutation as well.
-		     The following ensures that former materialization
-		     points that now have zero incoming permutes no
-		     longer appear as such and that former "any" permutes
-		     get pass-through.  We keep VEC_PERM nodes optimistic
-		     as "any" outgoing permute though.  */
-		  if (vertices[idx].perm_out != 0
-		      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
-		    vertices[idx].perm_out = perm_in;
-		  changed = true;
+		  auto weight = m_vertices[single_node->vertex].weight;
+		  slpg_layout_cost internal_cost
+		    = { weight * factor, m_optimize_size };
+
+		  slpg_layout_cost alt_cost = in_cost;
+		  alt_cost.add_serial_cost (internal_cost);
+		  if (alt_cost.is_better_than (combined_cost, m_optimize_size))
+		    {
+		      combined_cost = alt_cost;
+		      layout_costs.in_cost = in_cost;
+		      layout_costs.internal_cost = internal_cost;
+		    }
 		}
 	    }
 
-	  /* Elide pruning at materialization points in the first
-	     iteration phase.  */
-	  if (!do_materialization)
-	    continue;
+	  /* Record the layout with the lowest cost.  Prefer layout 0 in
+	     the event of a tie between it and another layout.  */
+	  if (!min_layout_cost.is_possible ()
+	      || combined_cost.is_better_than (min_layout_cost,
+					       m_optimize_size))
+	    {
+	      min_layout_i = layout_i;
+	      min_layout_cost = combined_cost;
+	    }
+	}
+
+      /* This loop's handling of earlier partitions should ensure that
+	 choosing the original layout for the current partition is no
+	 less valid than it was in the original graph, even with the
+	 provisional layout choices for those earlier partitions.  */
+      gcc_assert (min_layout_cost.is_possible ());
+      partition.layout = min_layout_i;
+    }
+}
+
+/* Make a backward pass through the partitions, accumulating output costs.
+   Make a final choice of layout for each partition.  */
+
+void
+vect_optimize_slp_pass::backward_pass ()
+{
+  for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
+    {
+      auto &partition = m_partitions[partition_i];
 
-	  int perm = vertices[idx].perm_out;
-	  if (perm == 0 || perm == -1)
+      unsigned int min_layout_i = 0;
+      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
+      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+	{
+	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+	  if (!layout_costs.is_possible ())
 	    continue;
 
-	  /* Decide on permute materialization.  Look whether there's
-	     a use (pred) edge that is permuted differently than us.
-	     In that case mark ourselves so the permutation is applied.  */
-	  bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
-	  if (all_preds_permuted)
-	    for (graph_edge *pred = slpg->vertices[idx].pred;
-		 pred; pred = pred->pred_next)
-	      {
-		int pred_perm = vertices[pred->src].perm_in;
-		gcc_checking_assert (pred_perm != -1);
-		if (!vect_slp_perms_eq (perms, perm, pred_perm))
-		  {
-		    all_preds_permuted = false;
-		    break;
-		  }
-	      }
-	  if (!all_preds_permuted)
+	  /* Accumulate the costs from successor partitions.  */
+	  bool is_possible = true;
+	  for (unsigned int order_i = partition.node_begin;
+	       order_i < partition.node_end; ++order_i)
 	    {
-	      vertices[idx].perm_out = 0;
-	      changed = true;
+	      unsigned int node_i = m_partitioned_nodes[order_i];
+	      auto &vertex = m_vertices[node_i];
+	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
+		{
+		  auto &other_vertex = m_vertices[other_node_i];
+		  auto &other_partition = m_partitions[other_vertex.partition];
+		  if (other_vertex.partition > vertex.partition)
+		    {
+		      /* Accumulate the incoming costs from later
+			 partitions, plus the cost of any layout changes
+			 on UD itself.  */
+		      auto cost = backward_cost (ud, other_node_i, layout_i);
+		      if (!cost.is_possible ())
+			is_possible = false;
+		      else
+			layout_costs.out_cost.add_parallel_cost (cost);
+		    }
+		  else
+		    /* Make sure that earlier partitions can (if necessary
+		       or beneficial) keep the layout that they chose in
+		       the forward pass.  This ensures that there is at
+		       least one valid choice of layout.  */
+		    is_possible &= edge_layout_cost (ud, other_node_i,
+						     other_partition.layout,
+						     layout_i).is_possible ();
+		};
+	      for_each_partition_edge (node_i, add_cost);
+	    }
+	  if (!is_possible)
+	    {
+	      layout_costs.mark_impossible ();
+	      continue;
+	    }
+
+	  /* Locally combine the costs from the forward and backward passes.
+	     (This combined cost is not passed on, since that would lead
+	     to double counting.)  */
+	  slpg_layout_cost combined_cost = layout_costs.in_cost;
+	  combined_cost.add_serial_cost (layout_costs.internal_cost);
+	  combined_cost.add_serial_cost (layout_costs.out_cost);
+
+	  /* Record the layout with the lowest cost.  Prefer layout 0 in
+	     the event of a tie between it and another layout.  */
+	  if (!min_layout_cost.is_possible ()
+	      || combined_cost.is_better_than (min_layout_cost,
+					       m_optimize_size))
+	    {
+	      min_layout_i = layout_i;
+	      min_layout_cost = combined_cost;
 	    }
 	}
 
-      /* If the initial propagation converged, switch on materialization
-	 and re-propagate.  */
-      if (!changed && !do_materialization)
+      gcc_assert (min_layout_cost.is_possible ());
+      partition.layout = min_layout_i;
+    }
+}
+
+/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
+   NODE already has the layout that was selected for its partition.  */
+
+slp_tree
+vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
+						unsigned int to_layout_i)
+{
+  unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
+  slp_tree result = m_node_layouts[result_i];
+  if (result)
+    return result;
+
+  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+    {
+      /* If the vector is uniform or unchanged, there's nothing to do.  */
+      if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
+	result = node;
+      else
 	{
-	  do_materialization = true;
-	  changed = true;
+	  auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
+	  result = vect_create_new_slp_node (scalar_ops);
+	  vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
 	}
     }
-  while (changed);
-  statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
-
-  /* Materialize.  */
-  for (i = 0; i < vertices.length (); ++i)
+  else
     {
-      int perm_in = vertices[i].perm_in;
-      slp_tree node = vertices[i].node;
+      unsigned int partition_i = m_vertices[node->vertex].partition;
+      unsigned int from_layout_i = m_partitions[partition_i].layout;
+      if (from_layout_i == to_layout_i)
+	return node;
 
-      /* First permute invariant/external original successors, we handle
-	 those optimistically during propagation and duplicate them if
-	 they are used with different permutations.  */
-      unsigned j;
-      slp_tree child;
-      if (perm_in > 0)
-	FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-	  {
-	    if (!child
-		|| (SLP_TREE_DEF_TYPE (child) != vect_constant_def
-		    && SLP_TREE_DEF_TYPE (child) != vect_external_def))
-	      continue;
+      /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
+	 permutation instead of a serial one.  Leave the new permutation
+	 in TMP_PERM on success.  */
+      auto_lane_permutation_t tmp_perm;
+      unsigned int num_inputs = 1;
+      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+	{
+	  tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
+	  if (from_layout_i != 0)
+	    vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
+	  if (to_layout_i != 0)
+	    vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
+	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
+					      tmp_perm,
+					      SLP_TREE_CHILDREN (node),
+					      false) >= 0)
+	    num_inputs = SLP_TREE_CHILDREN (node).length ();
+	  else
+	    tmp_perm.truncate (0);
+	}
 
-	    /* If the vector is uniform there's nothing to do.  */
-	    if (vect_slp_tree_uniform_p (child))
-	      continue;
+      if (dump_enabled_p ())
+	{
+	  if (tmp_perm.length () > 0)
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "duplicating permutation node %p with"
+			     " layout %d\n",
+			     node, to_layout_i);
+	  else
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "inserting permutation node in place of %p\n",
+			     node);
+	}
 
-	    /* We can end up sharing some externals via two_operator
-	       handling.  Be prepared to unshare those.  */
-	    if (child->refcnt != 1)
-	      {
-		gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
-		SLP_TREE_CHILDREN (node)[j] = child
-		  = vect_create_new_slp_node
-		      (SLP_TREE_SCALAR_OPS (child).copy ());
-	      }
-	    vect_slp_permute (perms[perm_in],
-			      SLP_TREE_SCALAR_OPS (child), true);
-	  }
+      unsigned int num_lanes = SLP_TREE_LANES (node);
+      result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
+      if (SLP_TREE_SCALAR_STMTS (node).length ())
+	{
+	  auto &stmts = SLP_TREE_SCALAR_STMTS (result);
+	  stmts.safe_splice (SLP_TREE_SCALAR_STMTS (result));
+	  if (from_layout_i != 0)
+	    vect_slp_permute (m_perms[from_layout_i], stmts, false);
+	  if (to_layout_i != 0)
+	    vect_slp_permute (m_perms[to_layout_i], stmts, true);
+	}
+      SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
+      SLP_TREE_LANES (result) = num_lanes;
+      SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
+      result->vertex = -1;
+
+      auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
+      if (tmp_perm.length ())
+	{
+	  lane_perm.safe_splice (tmp_perm);
+	  SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
+	}
+      else
+	{
+	  lane_perm.create (num_lanes);
+	  for (unsigned j = 0; j < num_lanes; ++j)
+	    lane_perm.quick_push ({ 0, j });
+	  if (from_layout_i != 0)
+	    vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
+	  if (to_layout_i != 0)
+	    vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
+	  SLP_TREE_CHILDREN (result).safe_push (node);
+	}
+      for (slp_tree child : SLP_TREE_CHILDREN (result))
+	child->refcnt++;
+    }
+  m_node_layouts[result_i] = result;
+  return result;
+}
+
+/* Apply the chosen vector layouts to the SLP graph.  */
 
+void
+vect_optimize_slp_pass::materialize ()
+{
+  /* We no longer need the costs, so avoid having two O(N * P) arrays
+     live at the same time.  */
+  m_partition_layout_costs.release ();
+  m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
+
+  auto_sbitmap fully_folded (m_vertices.length ());
+  bitmap_clear (fully_folded);
+  for (unsigned int node_i : m_partitioned_nodes)
+    {
+      auto &vertex = m_vertices[node_i];
+      slp_tree node = vertex.node;
+      int layout_i = m_partitions[vertex.partition].layout;
+      gcc_assert (layout_i >= 0);
+
+      /* Rearrange the scalar statements to match the chosen layout.  */
+      if (layout_i > 0)
+	vect_slp_permute (m_perms[layout_i],
+			  SLP_TREE_SCALAR_STMTS (node), true);
+
+      /* Update load and lane permutations.  */
       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
 	{
-	  /* Apply the common permutes to the input vectors.  */
-	  if (perm_in > 0)
+	  /* First try to absorb the input vector layouts.  If that fails,
+	     force the inputs to have layout LAYOUT_I too.  We checked that
+	     that was possible before deciding to use nonzero output layouts.
+	     (Note that at this stage we don't really have any guarantee that
+	     the target supports the original VEC_PERM_EXPR.)  */
+	  auto &perm = SLP_TREE_LANE_PERMUTATION (node);
+	  auto_lane_permutation_t tmp_perm;
+	  tmp_perm.safe_splice (perm);
+	  change_vec_perm_layout (node, tmp_perm, -1, layout_i);
+	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
+					      tmp_perm,
+					      SLP_TREE_CHILDREN (node),
+					      false) >= 0)
 	    {
-	      /* If the node is already a permute node we can apply
-		 the permutation to the lane selection, effectively
-		 materializing it on the incoming vectors.  */
-	      if (dump_enabled_p ())
+	      if (dump_enabled_p ()
+		  && !std::equal (tmp_perm.begin (), tmp_perm.end (),
+				  perm.begin ()))
 		dump_printf_loc (MSG_NOTE, vect_location,
-				 "simplifying permute node %p\n",
-				 node);
-	      for (unsigned k = 0;
-		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
-		SLP_TREE_LANE_PERMUTATION (node)[k].second
-		  = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
-	    }
-	  /* Apply the anticipated output permute to the permute and
-	     stmt vectors.  */
-	  int perm_out = vertices[i].perm_out;
-	  if (perm_out > 0)
-	    {
-	      vect_slp_permute (perms[perm_out],
-				SLP_TREE_SCALAR_STMTS (node), true);
-	      vect_slp_permute (perms[perm_out],
-				SLP_TREE_LANE_PERMUTATION (node), true);
+				 "absorbing input layouts into %p\n", node);
+	      std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
+	      bitmap_set_bit (fully_folded, node_i);
 	    }
-	}
-      else if (vertices[i].get_perm_materialized () != 0)
-	{
-	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-	    /* For loads simply drop the permutation, the load permutation
-	       already performs the desired permutation.  */
-	    ;
-	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
-	    gcc_unreachable ();
 	  else
 	    {
+	      /* Not MSG_MISSED because it would make no sense to users.  */
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location,
-				 "inserting permute node in place of %p\n",
+				 "failed to absorb input layouts into %p\n",
 				 node);
-
-	      /* Make a copy of NODE and in-place change it to a
-		 VEC_PERM node to permute the lanes of the copy.  */
-	      slp_tree copy = new _slp_tree;
-	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
-	      SLP_TREE_CHILDREN (node) = vNULL;
-	      SLP_TREE_SCALAR_STMTS (copy)
-		= SLP_TREE_SCALAR_STMTS (node).copy ();
-	      vect_slp_permute (perms[perm_in],
-				SLP_TREE_SCALAR_STMTS (copy), true);
-	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
-	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
-	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
-	      SLP_TREE_LANE_PERMUTATION (copy)
-		= SLP_TREE_LANE_PERMUTATION (node);
-	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
-	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
-	      copy->refcnt = 1;
-	      copy->max_nunits = node->max_nunits;
-	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
-	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
-	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
-
-	      /* Now turn NODE into a VEC_PERM.  */
-	      SLP_TREE_CHILDREN (node).safe_push (copy);
-	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
-	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-		SLP_TREE_LANE_PERMUTATION (node)
-		  .quick_push (std::make_pair (0, perms[perm_in][j]));
-	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+	      change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
 	    }
 	}
-      else if (perm_in > 0) /* perm_in == perm_out */
+      else
 	{
-	  /* Apply the reverse permutation to our stmts.  */
-	  vect_slp_permute (perms[perm_in],
-			    SLP_TREE_SCALAR_STMTS (node), true);
-	  /* And to the lane/load permutation, which we can simply
-	     make regular by design.  */
-	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-	    {
-	      gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
-	      /* ???  When we handle non-bijective permutes the idea
-		 is that we can force the load-permutation to be
-		 { min, min + 1, min + 2, ... max }.  But then the
-		 scalar defs might no longer match the lane content
-		 which means wrong-code with live lane vectorization.
-		 So we possibly have to have NULL entries for those.  */
-	      vect_slp_permute (perms[perm_in],
-				SLP_TREE_LOAD_PERMUTATION (node), true);
-	    }
-	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
-	    gcc_unreachable ();
+	  gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
+	  auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
+	  if (layout_i > 0)
+	    /* ???  When we handle non-bijective permutes the idea
+	       is that we can force the load-permutation to be
+	       { min, min + 1, min + 2, ... max }.  But then the
+	       scalar defs might no longer match the lane content
+	       which means wrong-code with live lane vectorization.
+	       So we possibly have to have NULL entries for those.  */
+	    vect_slp_permute (m_perms[layout_i], load_perm, true);
 	}
     }
 
-  /* Elide any permutations at BB reduction roots.  */
-  if (is_a <bb_vec_info> (vinfo))
+  /* Do this before any nodes disappear, since it involves a walk
+     over the leaves.  */
+  remove_redundant_permutations ();
+
+  /* Replace each child with a correctly laid-out version.  */
+  for (unsigned int node_i : m_partitioned_nodes)
     {
-      for (slp_instance instance : vinfo->slp_instances)
+      /* Skip nodes that have already been handled above.  */
+      if (bitmap_bit_p (fully_folded, node_i))
+	continue;
+
+      auto &vertex = m_vertices[node_i];
+      int in_layout_i = m_partitions[vertex.partition].layout;
+      gcc_assert (in_layout_i >= 0);
+
+      unsigned j;
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
 	{
-	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
+	  if (!child)
 	    continue;
-	  slp_tree old = SLP_INSTANCE_TREE (instance);
-	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
-	      && SLP_TREE_CHILDREN (old).length () == 1)
-	    {
-	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
-	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
-		{
-		  /* Preserve the special VEC_PERM we use to shield existing
-		     vector defs from the rest.  But make it a no-op.  */
-		  unsigned i = 0;
-		  for (std::pair<unsigned, unsigned> &p
-		       : SLP_TREE_LANE_PERMUTATION (old))
-		    p.second = i++;
-		}
-	      else
-		{
-		  SLP_INSTANCE_TREE (instance) = child;
-		  SLP_TREE_REF_COUNT (child)++;
-		  vect_free_slp_tree (old);
-		}
-	    }
-	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
-		   && SLP_TREE_REF_COUNT (old) == 1
-		   && vertices[old->vertex].get_perm_materialized () != 0)
+
+	  slp_tree new_child = get_result_with_layout (child, in_layout_i);
+	  if (new_child != child)
 	    {
-	      /* ???  For loads the situation is more complex since
-		 we can't modify the permute in place in case the
-		 node is used multiple times.  In fact for loads this
-		 should be somehow handled in the propagation engine.  */
-	      /* Apply the reverse permutation to our stmts.  */
-	      int perm = vertices[old->vertex].get_perm_materialized ();
-	      vect_slp_permute (perms[perm],
-				SLP_TREE_SCALAR_STMTS (old), true);
-	      vect_slp_permute (perms[perm],
-				SLP_TREE_LOAD_PERMUTATION (old), true);
+	      vect_free_slp_tree (child);
+	      SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
+	      new_child->refcnt += 1;
 	    }
 	}
     }
+}
 
-  /* Free the perms vector used for propagation.  */
-  while (!perms.is_empty ())
-    perms.pop ().release ();
-  free_graph (slpg);
-
+/* Elide load permutations that are not necessary.  Such permutations might
+   be pre-existing, rather than created by the layout optimizations.  */
 
-  /* Now elide load permutations that are not necessary.  */
-  for (i = 0; i < leafs.length (); ++i)
+void
+vect_optimize_slp_pass::remove_redundant_permutations ()
+{
+  for (unsigned int node_i : m_leafs)
     {
-      node = vertices[leafs[i]].node;
+      slp_tree node = m_vertices[node_i].node;
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
 	continue;
 
       /* In basic block vectorization we allow any subchain of an interleaving
 	 chain.
 	 FORNOW: not in loop SLP because of realignment complications.  */
-      if (is_a <bb_vec_info> (vinfo))
+      if (is_a <bb_vec_info> (m_vinfo))
 	{
 	  bool subchain_p = true;
 	  stmt_vec_info next_load_info = NULL;
@@ -4160,6 +5347,7 @@ vect_optimize_slp (vec_info *vinfo)
 	}
       else
 	{
+	  loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
 	  stmt_vec_info load_info;
 	  bool this_load_permuted = false;
 	  unsigned j;
@@ -4175,8 +5363,7 @@ vect_optimize_slp (vec_info *vinfo)
 	      /* The load requires permutation when unrolling exposes
 		 a gap either because the group is larger than the SLP
 		 group-size or because there is a gap between the groups.  */
-	      && (known_eq (LOOP_VINFO_VECT_FACTOR
-			      (as_a <loop_vec_info> (vinfo)), 1U)
+	      && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
 		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
 		      && DR_GROUP_GAP (first_stmt_info) == 0)))
 	    {
@@ -4187,6 +5374,150 @@ vect_optimize_slp (vec_info *vinfo)
     }
 }
 
+/* Print the partition graph and layout information to the dump file.  */
+
+void
+vect_optimize_slp_pass::dump ()
+{
+  dump_printf_loc (MSG_NOTE, vect_location,
+		   "SLP optimize permutations:\n");
+  for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
+    {
+      dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
+      const char *sep = "";
+      for (unsigned int idx : m_perms[layout_i])
+	{
+	  dump_printf (MSG_NOTE, "%s%d", sep, idx);
+	  sep = ", ";
+	}
+      dump_printf (MSG_NOTE, " }\n");
+    }
+  dump_printf_loc (MSG_NOTE, vect_location,
+		   "SLP optimize partitions:\n");
+  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+       ++partition_i)
+    {
+      auto &partition = m_partitions[partition_i];
+      dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "  partition %d (layout %d):\n",
+		       partition_i, partition.layout);
+      dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
+      for (unsigned int order_i = partition.node_begin;
+	   order_i < partition.node_end; ++order_i)
+	{
+	  auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
+	  dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
+			   (void *) vertex.node);
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "          weight: %f\n",
+			   vertex.weight.to_double ());
+	  if (vertex.out_degree)
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "          out weight: %f (degree %d)\n",
+			     vertex.out_weight.to_double (),
+			     vertex.out_degree);
+	  if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "          op: VEC_PERM_EXPR\n");
+	  else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "          op template: %G", rep->stmt);
+	}
+      dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
+      for (unsigned int order_i = partition.node_begin;
+	   order_i < partition.node_end; ++order_i)
+	{
+	  unsigned int node_i = m_partitioned_nodes[order_i];
+	  auto &vertex = m_vertices[node_i];
+	  auto print_edge = [&](graph_edge *, unsigned int other_node_i)
+	    {
+	      auto &other_vertex = m_vertices[other_node_i];
+	      if (other_vertex.partition < vertex.partition)
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "      - %p [%d] --> %p\n",
+				 other_vertex.node, other_vertex.partition,
+				 vertex.node);
+	      else
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "      - %p --> [%d] %p\n",
+				 vertex.node, other_vertex.partition,
+				 other_vertex.node);
+	    };
+	  for_each_partition_edge (node_i, print_edge);
+	}
+
+      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
+	{
+	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
+	  if (layout_costs.is_possible ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "    layout %d:%s\n", layout_i,
+			       partition.layout == int (layout_i)
+			       ? " (*)" : "");
+	      slpg_layout_cost combined_cost = layout_costs.in_cost;
+	      combined_cost.add_serial_cost (layout_costs.internal_cost);
+	      combined_cost.add_serial_cost (layout_costs.out_cost);
+#define TEMPLATE "{depth: %f, total: %f}"
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "        " TEMPLATE "\n", layout_i,
+			       layout_costs.in_cost.depth.to_double (),
+			       layout_costs.in_cost.total.to_double ());
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "      + " TEMPLATE "\n",
+			       layout_costs.internal_cost.depth.to_double (),
+			       layout_costs.internal_cost.total.to_double ());
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "      + " TEMPLATE "\n",
+			       layout_costs.out_cost.depth.to_double (),
+			       layout_costs.out_cost.total.to_double ());
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "      = " TEMPLATE "\n",
+			       combined_cost.depth.to_double (),
+			       combined_cost.total.to_double ());
+#undef TEMPLATE
+	    }
+	  else
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "    layout %d: rejected\n", layout_i);
+	}
+    }
+}
+
+/* Main entry point for the SLP graph optimization pass.  */
+
+void
+vect_optimize_slp_pass::run ()
+{
+  build_graph ();
+  create_partitions ();
+  start_choosing_layouts ();
+  if (m_perms.length () > 1)
+    {
+      forward_pass ();
+      backward_pass ();
+      if (dump_enabled_p ())
+	dump ();
+      materialize ();
+      while (!m_perms.is_empty ())
+	m_perms.pop ().release ();
+    }
+  else
+    remove_redundant_permutations ();
+  free_graph (m_slpg);
+}
+
+/* Optimize the SLP graph of VINFO.  */
+
+void
+vect_optimize_slp (vec_info *vinfo)
+{
+  if (vinfo->slp_instances.is_empty ())
+    return;
+  vect_optimize_slp_pass (vinfo).run ();
+}
+
 /* Gather loads reachable from the individual SLP graph entries.  */
 
 void
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e..a2b0afb 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -154,7 +154,9 @@ struct vect_scalar_ops_slice_hash : typed_noop_remove<vect_scalar_ops_slice>
   SLP
  ************************************************************************/
 typedef vec<std::pair<unsigned, unsigned> > lane_permutation_t;
+typedef auto_vec<std::pair<unsigned, unsigned>, 16> auto_lane_permutation_t;
 typedef vec<unsigned> load_permutation_t;
+typedef auto_vec<unsigned, 16> auto_load_permutation_t;
 
 /* A computation tree of an SLP instance.  Each node corresponds to a group of
    stmts to be packed in a SIMD stmt.  */