Diffstat (limited to 'gcc')
24 files changed, 1031 insertions, 235 deletions
diff --git a/gcc/gimple.h b/gcc/gimple.h index 3c9b996..87c90be 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -6598,6 +6598,8 @@ gimple_expr_type (const gimple *stmt) } else if (code == GIMPLE_COND) return boolean_type_node; + else if (code == GIMPLE_PHI) + return TREE_TYPE (gimple_phi_result (stmt)); else return void_type_node; } diff --git a/gcc/testsuite/g++.dg/vect/simd-11.cc b/gcc/testsuite/g++.dg/vect/simd-11.cc new file mode 100644 index 0000000..912d184 --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/simd-11.cc @@ -0,0 +1,61 @@ +// { dg-do compile } +// { dg-require-effective-target c++11 } +// { dg-additional-options "-Ofast" } + +template <typename> class h { +public: + ~h(); +}; +template <typename> struct l; +template <typename c> struct l<h<c> > { + using d = c; + template <typename e> using f = h<e>; +}; +template <typename g> struct o { + typedef l<g> j; + typedef typename j::d k; + template <typename c> struct p { typedef typename j::f<c> m; }; +}; +template <typename c, typename g> struct F { + typedef typename o<g>::p<c>::m q; + struct : q { + } r; +}; +template <typename c, typename g = h<c> > class s : F<c, g> { +public: + s(long); + typename o<typename F<c, g>::q>::k operator[](long); +}; +template <int> class t { +public: + int dimension; + t(const t &); + void operator+=(t); + double values[]; +}; +template <int dim> t<dim>::t(const t &p1) { + for (int i = 0; i < dim; ++i) + values[i] = p1.values[i]; +} +template <int dim> class u : public t<dim> { +public: + double m_fn1(const u &) const; +}; +template <int dim> double u<dim>::m_fn1(const u &) const { + double a; + for (int i = 0; i < dim; ++i) { + double b = this->values[i]; + a += b; + } + return a; +} +int m_fn2(const u<2> &p1, const u<2> &w) { + int n; + s<u<2> > c(n); + s<double> d(n); + double e = p1.m_fn1(w); + for (;;) { + c[0] += p1; + d = e; + } +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-54.c b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c new file mode 100644 index 0000000..d05ce33 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double a[2], b[2], c[2]; + +void foo(int flag) +{ + double tem1, tem2; + if (flag) + { + tem1 = a[0]; + tem2 = a[1]; + } + else + { + tem1 = b[0]; + tem2 = b[1]; + } + c[0] = tem1; + c[1] = tem2; +} + +/* { dg-final { scan-tree-dump-times "transform load" 2 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-55.c b/gcc/testsuite/gcc.dg/vect/bb-slp-55.c new file mode 100644 index 0000000..57a042b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-55.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ + +typedef struct { + int a; + int b; + int c; + int d; +} e; +e *f; +int g; +void h() { + e *i; + if (g) { + i->c = f[g].b; + i->d = f[g].a; + } else + i->c = i->d = 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-56.c b/gcc/testsuite/gcc.dg/vect/bb-slp-56.c new file mode 100644 index 0000000..90d1751 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-56.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ + +typedef struct { + double a, b; +} c; +int d, e; +int i(void); +void h(c, c); +void f() { + c a, g; + do { + a.a = e ?: g.a; + a.b = g.b + d; + h(g, a); + g = a; + } while (i()); +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-57.c b/gcc/testsuite/gcc.dg/vect/bb-slp-57.c new file mode 100644 index 0000000..6f13507 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-57.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffast-math" } */ +/* { 
dg-require-effective-target vect_float } */ + +float *a; +typedef struct { + int c; + float bbmax[3]; +} d; +d e; +int f[3]; +int g, h, i, j; +float k, k; +void l() +{ + for (unsigned z = 0; z < 2048; ++z) { + { + j = e.bbmax[1] > k ? e.bbmax[1] : k; + } + e.bbmax[1] = j; + { i = e.bbmax[2] > k ? e.bbmax[2] : k; } + e.bbmax[2] = i; + f[2] = a[2]; + { + float b; + h = e.bbmax[1] > b ? e.bbmax[1] : b; + } + e.bbmax[1] = h; + { + float b; + g = e.bbmax[2] > b ? e.bbmax[2] : b; + } + e.bbmax[2] = g; + } +} + +/* { dg-final { scan-tree-dump-times "transform load" 1 "slp1" { target { { x86_64-*-* i?86-*-* } && lp64 } } } } */ +/* { dg-final { scan-tree-dump "optimized: basic block" "slp1" { target { { x86_64-*-* i?86-*-* } && lp64 } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-58.c b/gcc/testsuite/gcc.dg/vect/bb-slp-58.c new file mode 100644 index 0000000..11bf5c3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-58.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ + +double x[1024]; +void bar (void); + +void foo (void) +{ + double tem1 = x[0]; + double tem2 = x[1]; + for (int i = 0; i < 511; ++i) + { + x[2*i] = tem1; + x[2*i+1] = tem2; + bar (); + tem1 = x[2*(i+1)]; + tem2 = x[2*(i+1)+1]; + } +} + +/* We should be able to vectorize the cycle in one SLP attempt including + both load groups. */ +/* { dg-final { scan-tree-dump-times "transform load" 2 "slp1" } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-59.c b/gcc/testsuite/gcc.dg/vect/bb-slp-59.c new file mode 100644 index 0000000..2e35725 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-59.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-fdump-tree-loopdone" } */ + +double x[1024]; +void bar (void); + +void foo (void) +{ + double tem1 = x[0]; + double tem2 = x[1]; + for (int i = 0; i < 511; ++i) + { + x[2*i] = tem2; + x[2*i+1] = tem1; + bar (); + tem1 = x[2*(i+1)]; + tem2 = x[2*(i+1)+1]; + } +} + +/* We should be able to vectorize the cycle in one SLP attempt including + both load groups and do only one permutation. 
*/ +/* { dg-final { scan-tree-dump-times "transform load" 2 "slp1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "loopdone" } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-60.c b/gcc/testsuite/gcc.dg/vect/bb-slp-60.c new file mode 100644 index 0000000..52643bf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-60.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ + +enum { a = 1, b }; +float *c, *e; +float d, h; +int f, g; +void i() +{ + float j = h; + for (; g;) + for (; f; f++) + { + c[a] = j * d; + c[b] = h * d; + j = 0; + h = e[2]; + } +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-61.c b/gcc/testsuite/gcc.dg/vect/bb-slp-61.c new file mode 100644 index 0000000..3323a2b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-61.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +struct a { + enum { b, c } d; + unsigned e; + unsigned f; +}; +void j(struct a *a, int i, int h) +{ + unsigned f = a->f; + switch (a->d) + while (1) + { + if (i) + { + case b: + if (h) + goto k; + } + else + f = 0; + case c:; + } +k: + a->e = a->f = f; +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-62.c b/gcc/testsuite/gcc.dg/vect/bb-slp-62.c new file mode 100644 index 0000000..84ee04c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-62.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ + +typedef struct { + char a; + int b[]; +} c; +int d, f; +c e; +void g() { + int h, i, j; + for (; i;) + switch (i) + case 4: { + h = (__UINTPTR_TYPE__)g >= 3; + for (; h; h -= 1) + if (d) + j = f; + } + for (; i < 3; i++) + e.b[i] = j; +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-63.c b/gcc/testsuite/gcc.dg/vect/bb-slp-63.c new file mode 100644 index 0000000..6519c97 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-63.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ + +struct { + unsigned a; + unsigned c; +} d; +int e, g; +void h(unsigned b) { + unsigned a, c; + while (e) { + if (b) { + ++e; + continue; + } + c = g; + if (g) + a |= 10; + } + d.a = a; + d.c = c; +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-64.c b/gcc/testsuite/gcc.dg/vect/bb-slp-64.c new file mode 100644 index 0000000..dcb6a14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-64.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ + +enum { a, b }; +double *c, *e; +int d, f; +void g() { + for (;;) { + c[a] = c[b] = d * e[b]; + f = d -= f; + } +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-65.c b/gcc/testsuite/gcc.dg/vect/bb-slp-65.c new file mode 100644 index 0000000..ec1707b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-65.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3" } */ +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ + +int *a; +int b, c, d, e; +void f() { + int g; + for (;;) + for (; b;) + if (d) + for (; c;) + if (g) + e += a[1] = a[2] = e; +} diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-66.c b/gcc/testsuite/gcc.dg/vect/bb-slp-66.c new file mode 100644 index 0000000..b59a2cc --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-66.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3" } */ + +typedef struct { + double a, b; +} c; +typedef struct { + c d; + long coordinates; +} e; +int f; +c g; +e h; +void k(int); +int n(); +void j() { int i; k(i); } +void k(int l) { + double a; + int b; + c m[4]; + long i; + for (; l;) + do { + g.a = b ?: a; + m[3] = g; + if (f) + m[0] = m[1] = m[3]; + i = 0; + for (; i < 4; i++) + (&h + i)->d = m[i]; + } while (n()); +} diff --git 
a/gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c new file mode 100644 index 0000000..62b18bd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int a[1024]; + +void foo (void) +{ + for (int i = 0; i < 1020; i += 4) + { + int suma = a[i]; + int sumb = a[i+1]; + int sumc = a[i+2]; + int sumd = a[i+3]; + for (unsigned j = 0; j < 77; ++j) + { + suma = (suma ^ i) + 1; + sumb = (sumb ^ i) + 2; + sumc = (sumc ^ i) + 3; + sumd = (sumd ^ i) + 4; + } + a[i] = suma; + a[i+1] = sumb; + a[i+2] = sumc; + a[i+3] = sumd; + } +} + +/* We should vectorize this outer loop with SLP. */ +/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "vect" } } */ diff --git a/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-1.f b/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-1.f new file mode 100644 index 0000000..74b3b17 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-1.f @@ -0,0 +1,28 @@ +! { dg-do compile } + subroutine tranx3 (jbeg,jend,kbeg,kend,dlo,den,mflx,zro) + parameter(in = 128+5 + & , jn = 128+5 + & , kn = 128+5) + parameter(ijkn = 128+5) + real*8 zro, dqm, dqp, dx3bi (kn) + real*8 mflux (ijkn,4), dtwid (ijkn,4), dd (ijkn,4) + real*8 mflx (in,jn,kn) + real*8 dlo (in,jn,kn), den (in,jn,kn) + do 2100 j=jbeg-1,jend + dtwid (k,1) = ( 0.5 + q1 ) * ( dlo(i ,j,k-1) + 3 - ( dx3a(k ) + xi ) * dd (k ,1) ) + mflux (k,1) = dtwid (k,1) * ( v3(i ,j,k) - vg3(k) ) * dt + if (j.ge.jbeg) then + den(i ,j,k) = ( dlo(i ,j,k) * dvl3a(k) + 1 - etwid (k+1,1) + etwid (k,1) ) * dvl3a i(k) + if (kend .eq. ke) mflx(i ,j,ke+1) = mflux (ke+1,1) + endif + do 2030 k=max(kbeg-2,ks-1),kend+1 + dqm = (dlo(i ,j,k ) - dlo(i ,j,k-1)) * dx3bi(k ) + dqp = (dlo(i ,j,k+1) - dlo(i ,j,k )) * dx3bi(k+1) + dd(k,1) = max ( dqm * dqp, zro ) +2030 continue + dtwid (k,3) = ( 0.5 + q1 ) * ( dlo(i+2,j,k-1) + 3 - ( dx3a(k ) + xi ) * deod (k ,3) ) +2100 continue + end diff --git a/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-2.f b/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-2.f new file mode 100644 index 0000000..34c44de --- /dev/null +++ b/gcc/testsuite/gfortran.dg/vect/O3-bb-slp-2.f @@ -0,0 +1,40 @@ +! { dg-do compile } +! 
{ dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } + subroutine tranx3 (ibeg,jbeg,jend,kbeg,kend + & ,dlo,den + & ,edn) + parameter(in = 128+5 + & , jn = 128+5 + & , kn = 128+5) + parameter(ijkn = 128+5) + real*8 e (in,jn,kn), dqm, dvl3a (kn), dvl3ai (kn) + & , dtwid (ijkn,4), dd (ijkn,4) + & , etwid (ijkn,4), deod (ijkn,4) + real*8 dlo (in,jn,kn), den (in,jn,kn) + & , edn (in,jn,kn) + do 2100 j=jbeg-1,jend + i = ibeg - 1 + do 1080 k=kbeg,kend + den(i ,j,k) = ( dlo(i ,j,k) * dvl3a(k) + 1 - etwid (k+1,1) + etwid (k,1) ) * dvl3a i(k) +1080 continue + do 2030 k=max(kbeg-2,ks-1),kend+1 + dqm = (dlo(i+2,j,k ) - dlo(i+2,j,k-1)) * dx3bi(k ) + dd(k,4) = max ( dqm * dqp, zro ) +2030 continue + dtwid (k,3) = ( 0.5 + q1 ) * ( dlo(i+2,j,k-1) + 1 + ( dx3a(k-1) - xi ) * dd (k-1,3) ) + 2 + ( 0.5 - q1 ) * ( dlo(i+2,j,k ) + 3 - ( dx3a(k ) + xi ) * deod (k ,3) ) + do 2080 k=kbeg,kend + den(i ,j,k) = ( dlo(i ,j,k) * dvl3a(k) + 1 - dtwid (k+1,3) + dtwid (k,3) ) * dvl3a i(k) + e (i+2,j,k) = ( e (i+2,j,k) * dvl3a(k) + 1 - etwid (k+1,3) + etwid (k,3) ) * dvl3a i(k) + edn(i+2,j,k) = e(i+2,j,k) / den(i+2,j,k) + e (i+3,j,k) = ( e (i+3,j,k) * dvl3a(k) + 1 - etwid (k+1,4) + etwid (k,4) ) * dvl3a i(k) + edn(i+3,j,k) = e(i+3,j,k) / den(i+3,j,k) +2080 continue +2100 continue + end diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 5d00b6f..3617918 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -1084,6 +1084,33 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, exit = single_exit (loop); basic_block new_preheader = new_bbs[0]; + /* Before installing PHI arguments make sure that the edges + into them match that of the scalar loop we analyzed. This + makes sure the SLP tree matches up between the main vectorized + loop and the epilogue vectorized copies. */ + if (single_succ_edge (preheader)->dest_idx + != single_succ_edge (new_bbs[0])->dest_idx) + { + basic_block swap_bb = new_bbs[1]; + gcc_assert (EDGE_COUNT (swap_bb->preds) == 2); + std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1)); + EDGE_PRED (swap_bb, 0)->dest_idx = 0; + EDGE_PRED (swap_bb, 1)->dest_idx = 1; + } + if (duplicate_outer_loop) + { + class loop *new_inner_loop = get_loop_copy (scalar_loop->inner); + if (loop_preheader_edge (scalar_loop)->dest_idx + != loop_preheader_edge (new_inner_loop)->dest_idx) + { + basic_block swap_bb = new_inner_loop->header; + gcc_assert (EDGE_COUNT (swap_bb->preds) == 2); + std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1)); + EDGE_PRED (swap_bb, 0)->dest_idx = 0; + EDGE_PRED (swap_bb, 1)->dest_idx = 1; + } + } + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); /* Skip new preheader since it's deleted if copy loop is added at entry. 
*/ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index e42f327..75b7314 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -7377,15 +7377,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, if (slp_node) { vec_initial_defs.reserve (vec_num); - gcc_assert (slp_node == slp_node_instance->reduc_phis); - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); - tree neutral_op - = neutral_op_for_slp_reduction (slp_node, vectype_out, - STMT_VINFO_REDUC_CODE (reduc_info), - first != NULL); - get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - first != NULL, neutral_op); + if (nested_cycle) + { + unsigned phi_idx = loop_preheader_edge (loop)->dest_idx; + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx], + &vec_initial_defs); + } + else + { + gcc_assert (slp_node == slp_node_instance->reduc_phis); + stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); + tree neutral_op + = neutral_op_for_slp_reduction (slp_node, vectype_out, + STMT_VINFO_REDUC_CODE (reduc_info), + first != NULL); + get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, + &vec_initial_defs, vec_num, + first != NULL, neutral_op); + } } else { @@ -7520,6 +7529,79 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo, return true; } +/* Vectorizes PHIs. */ + +bool +vectorizable_phi (vec_info *, + stmt_vec_info stmt_info, gimple **vec_stmt, + slp_tree slp_node) +{ + if (!is_a <gphi *> (stmt_info->stmt) || !slp_node) + return false; + + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) + return false; + + tree vectype = SLP_TREE_VECTYPE (slp_node); + + if (!vec_stmt) /* transformation not required. */ + { + slp_tree child; + unsigned i; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child) + if (!child) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "PHI node with unvectorized backedge def\n"); + return false; + } + else if (!vect_maybe_update_slp_op_vectype (child, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } + STMT_VINFO_TYPE (stmt_info) = phi_info_type; + return true; + } + + tree scalar_dest = gimple_phi_result (stmt_info->stmt); + basic_block bb = gimple_bb (stmt_info->stmt); + tree vec_dest = vect_create_destination_var (scalar_dest, vectype); + auto_vec<tree> vec_oprnds; + auto_vec<gphi *> new_phis; + for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i) + { + slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; + + /* Skip not yet vectorized defs. */ + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def + && SLP_TREE_VEC_STMTS (child).is_empty ()) + continue; + + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds); + if (!new_phis.exists ()) + { + new_phis.create (vec_oprnds.length ()); + for (unsigned j = 0; j < vec_oprnds.length (); j++) + { + /* Create the vectorized LC PHI node. */ + new_phis.quick_push (create_phi_node (vec_dest, bb)); + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]); + } + } + edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i); + for (unsigned j = 0; j < vec_oprnds.length (); j++) + add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION); + } + /* We should have at least one already vectorized child. */ + gcc_assert (new_phis.exists ()); + + return true; +} + /* Function vect_min_worthwhile_factor. 
@@ -8376,8 +8458,16 @@ vectorizable_live_operation (vec_info *vinfo, gimple_seq stmts = NULL; new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts, true, NULL_TREE); - - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + if (is_a <gphi *> (vec_stmt)) + { + gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt)); + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + } + else + { + gimple_stmt_iterator si = gsi_for_stmt (vec_stmt); + gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT); + } /* Replace use of lhs with newly computed result. If the use stmt is a single arg PHI, just replace all uses of PHI result. It's necessary diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 85865da..f544b55 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -115,7 +115,8 @@ vect_free_slp_tree (slp_tree node) return; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_free_slp_tree (child); + if (child) + vect_free_slp_tree (child); delete node; } @@ -148,9 +149,9 @@ vect_free_slp_instance (slp_instance instance) /* Create an SLP node for SCALAR_STMTS. */ static slp_tree -vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) +vect_create_new_slp_node (slp_tree node, + vec<stmt_vec_info> scalar_stmts, unsigned nops) { - slp_tree node = new _slp_tree; SLP_TREE_SCALAR_STMTS (node) = scalar_stmts; SLP_TREE_CHILDREN (node).create (nops); SLP_TREE_DEF_TYPE (node) = vect_internal_def; @@ -159,18 +160,33 @@ vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) return node; } +/* Create an SLP node for SCALAR_STMTS. */ + +static slp_tree +vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) +{ + return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops); +} + /* Create an SLP node for OPS. */ static slp_tree -vect_create_new_slp_node (vec<tree> ops) +vect_create_new_slp_node (slp_tree node, vec<tree> ops) { - slp_tree node = new _slp_tree; SLP_TREE_SCALAR_OPS (node) = ops; SLP_TREE_DEF_TYPE (node) = vect_external_def; SLP_TREE_LANES (node) = ops.length (); return node; } +/* Create an SLP node for OPS. */ + +static slp_tree +vect_create_new_slp_node (vec<tree> ops) +{ + return vect_create_new_slp_node (new _slp_tree, ops); +} + /* This structure is used in creation of an SLP tree. Each instance corresponds to the same operand in a group of scalar stmts in an SLP @@ -443,10 +459,13 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap, else commutative_op = commutative_tree_code (code) ? 0U : -1U; } + else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt)) + number_of_oprnds = gimple_phi_num_args (stmt); else return -1; bool swapped = (swap != 0); + bool backedge = false; gcc_assert (!swapped || first_op_cond); enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds); for (i = 0; i < number_of_oprnds; i++) @@ -463,6 +482,13 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap, else oprnd = gimple_op (stmt_info->stmt, map[i]); } + else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt)) + { + oprnd = gimple_phi_arg_def (stmt, i); + backedge = dominated_by_p (CDI_DOMINATORS, + gimple_phi_arg_edge (stmt, i)->src, + gimple_bb (stmt_info->stmt)); + } else oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? 
!i : i)); if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR) @@ -487,6 +513,26 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap, oprnd_info->def_stmts.quick_push (def_stmt_info); oprnd_info->ops.quick_push (oprnd); + /* If there's an extern def on a backedge, make sure we can + code-generate at the region start. + ??? This is another case that could be fixed by adjusting + how we split the function but at the moment we'd have conflicting + goals there. */ + if (backedge + && dts[i] == vect_external_def + && is_a <bb_vec_info> (vinfo) + && !SSA_NAME_IS_DEFAULT_DEF (oprnd) + && !dominated_by_p (CDI_DOMINATORS, + as_a <bb_vec_info> (vinfo)->bbs[0], + gimple_bb (SSA_NAME_DEF_STMT (oprnd)))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: extern def %T only defined " + "on backedge\n", oprnd); + return -1; + } + if (first) { tree type = TREE_TYPE (oprnd); @@ -521,6 +567,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap, case vect_internal_def: case vect_reduction_def: case vect_induction_def: + case vect_nested_cycle: break; default: @@ -815,6 +862,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, machine_mode vec_mode; stmt_vec_info first_load = NULL, prev_first_load = NULL; bool first_stmt_load_p = false, load_p = false; + bool first_stmt_phi_p = false, phi_p = false; /* For every stmt in NODE find its def stmt/s. */ stmt_vec_info stmt_info; @@ -904,6 +952,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, return false; } } + else if (gimple_code (stmt) == GIMPLE_PHI) + { + rhs_code = ERROR_MARK; + phi_p = true; + } else { rhs_code = gimple_assign_rhs_code (stmt); @@ -916,6 +969,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, *node_vectype = vectype; first_stmt_code = rhs_code; first_stmt_load_p = load_p; + first_stmt_phi_p = phi_p; /* Shift arguments should be equal in all the packed stmts for a vector shift with scalar shift operand. */ @@ -1021,7 +1075,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, || first_stmt_code == INDIRECT_REF || first_stmt_code == COMPONENT_REF || first_stmt_code == MEM_REF))) - || first_stmt_load_p != load_p) + || first_stmt_load_p != load_p + || first_stmt_phi_p != phi_p) { if (dump_enabled_p ()) { @@ -1077,6 +1132,18 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } } + if (phi_p + && (gimple_bb (first_stmt_info->stmt) + != gimple_bb (stmt_info->stmt))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: different BB for PHI " + "in %G", stmt); + /* Mismatch. */ + continue; + } + if (!types_compatible_p (vectype, *node_vectype)) { if (dump_enabled_p ()) @@ -1138,7 +1205,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } /* Not memory operation.
*/ - if (TREE_CODE_CLASS (rhs_code) != tcc_binary + if (!phi_p + && TREE_CODE_CLASS (rhs_code) != tcc_binary && TREE_CODE_CLASS (rhs_code) != tcc_unary && TREE_CODE_CLASS (rhs_code) != tcc_expression && TREE_CODE_CLASS (rhs_code) != tcc_comparison @@ -1251,7 +1319,7 @@ typedef hash_map <vec <gimple *>, slp_tree, scalar_stmts_to_slp_tree_map_t; static slp_tree -vect_build_slp_tree_2 (vec_info *vinfo, +vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, vec<stmt_vec_info> stmts, unsigned int group_size, poly_uint64 *max_nunits, bool *matches, unsigned *npermutes, unsigned *tree_size, @@ -1276,19 +1344,37 @@ vect_build_slp_tree (vec_info *vinfo, } return *leader; } + + /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2 + so we can pick up backedge destinations during discovery. */ + slp_tree res = new _slp_tree; + SLP_TREE_DEF_TYPE (res) = vect_internal_def; + SLP_TREE_SCALAR_STMTS (res) = stmts; + bst_map->put (stmts.copy (), res); + poly_uint64 this_max_nunits = 1; - slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, + slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size, &this_max_nunits, matches, npermutes, tree_size, bst_map); - if (res) + if (!res_) + { + bool existed_p = bst_map->put (stmts, NULL); + gcc_assert (existed_p); + /* Mark the node invalid so we can detect those when still in use + as backedge destinations. */ + SLP_TREE_SCALAR_STMTS (res) = vNULL; + SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; + vect_free_slp_tree (res); + } + else { + gcc_assert (res_ == res); res->max_nunits = this_max_nunits; vect_update_max_nunits (max_nunits, this_max_nunits); /* Keep a reference for the bst_map use. */ SLP_TREE_REF_COUNT (res)++; } - bst_map->put (stmts.copy (), res); - return res; + return res_; } /* Recursively build an SLP tree starting from NODE. @@ -1299,7 +1385,7 @@ vect_build_slp_tree (vec_info *vinfo, was found. */ static slp_tree -vect_build_slp_tree_2 (vec_info *vinfo, +vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, vec<stmt_vec_info> stmts, unsigned int group_size, poly_uint64 *max_nunits, bool *matches, unsigned *npermutes, unsigned *tree_size, @@ -1307,7 +1393,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, { unsigned nops, i, this_tree_size = 0; poly_uint64 this_max_nunits = *max_nunits; - slp_tree node; matches[0] = false; @@ -1327,41 +1412,60 @@ vect_build_slp_tree_2 (vec_info *vinfo, /* If the SLP node is a PHI (induction or reduction), terminate the recursion. */ - if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt)) - { - tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); - tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, - group_size); - if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype, - max_nunits)) - return NULL; + bool skip_args[2] = { false, false }; + if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) + if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt)) + { + tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, + group_size); + if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype, + max_nunits)) + return NULL; - vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); - /* Induction from different IVs is not supported. 
*/ - if (def_type == vect_induction_def) - { - stmt_vec_info other_info; - FOR_EACH_VEC_ELT (stmts, i, other_info) - if (stmt_info != other_info) - return NULL; - } - else if (def_type == vect_reduction_def - || def_type == vect_double_reduction_def - || def_type == vect_nested_cycle) - { - /* Else def types have to match. */ - stmt_vec_info other_info; - FOR_EACH_VEC_ELT (stmts, i, other_info) - if (STMT_VINFO_DEF_TYPE (other_info) != def_type) - return NULL; - } - else - return NULL; - (*tree_size)++; - node = vect_create_new_slp_node (stmts, nops); - SLP_TREE_VECTYPE (node) = vectype; - return node; - } + vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); + /* Induction from different IVs is not supported. */ + if (def_type == vect_induction_def) + { + stmt_vec_info other_info; + FOR_EACH_VEC_ELT (stmts, i, other_info) + if (stmt_info != other_info) + return NULL; + + /* Induction PHIs are leaves. */ + (*tree_size)++; + node = vect_create_new_slp_node (node, stmts, nops); + SLP_TREE_VECTYPE (node) = vectype; + SLP_TREE_CHILDREN (node).quick_grow_cleared (nops); + return node; + } + else if (def_type == vect_reduction_def + || def_type == vect_double_reduction_def + || def_type == vect_nested_cycle) + { + /* Else def types have to match. */ + stmt_vec_info other_info; + bool all_same = true; + FOR_EACH_VEC_ELT (stmts, i, other_info) + { + if (STMT_VINFO_DEF_TYPE (other_info) != def_type) + return NULL; + if (other_info != stmt_info) + all_same = false; + } + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + /* Reduction initial values are not explicitly represented. */ + if (!nested_in_vect_loop_p (loop, stmt_info)) + skip_args[loop_preheader_edge (loop)->dest_idx] = true; + /* Reduction chain backedge defs are filled manually. + ??? Need a better way to identify a SLP reduction chain PHI. + Or a better overall way to SLP match those. */ + if (all_same && def_type == vect_reduction_def) + skip_args[loop_latch_edge (loop)->dest_idx] = true; + } + else if (def_type != vect_internal_def) + return NULL; + } bool two_operators = false; @@ -1386,7 +1490,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, { *max_nunits = this_max_nunits; (*tree_size)++; - node = vect_create_new_slp_node (stmts, 0); + node = vect_create_new_slp_node (node, stmts, 0); SLP_TREE_VECTYPE (node) = vectype; /* And compute the load permutation. Whether it is actually a permutation depends on the unrolling factor which is @@ -1440,7 +1544,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, representing an actual vector without any scalar ops. ??? We could hide it completely by making the permute node external? */ - node = vect_create_new_slp_node (stmts, 1); + node = vect_create_new_slp_node (node, stmts, 1); SLP_TREE_CODE (node) = VEC_PERM_EXPR; SLP_TREE_LANE_PERMUTATION (node) = lperm; SLP_TREE_VECTYPE (node) = vectype; @@ -1486,6 +1590,14 @@ vect_build_slp_tree_2 (vec_info *vinfo, continue; } + /* We're skipping certain operands from processing, for example + outer loop reduction initial defs. 
*/ + if (i <= 1 && skip_args[i]) + { + children.safe_push (NULL); + continue; + } + if (is_a <bb_vec_info> (vinfo) && oprnd_info->first_dt == vect_internal_def) { @@ -1508,9 +1620,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, } } - if (oprnd_info->first_dt != vect_internal_def - && oprnd_info->first_dt != vect_reduction_def - && oprnd_info->first_dt != vect_induction_def) + if (oprnd_info->first_dt == vect_external_def + || oprnd_info->first_dt == vect_constant_def) { slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; @@ -1639,14 +1750,14 @@ fail: child = vect_create_new_slp_node (oprnd_info->ops); children.safe_push (child); oprnd_info->ops = vNULL; - oprnd_info->def_stmts = vNULL; continue; } } gcc_assert (child == NULL); FOR_EACH_VEC_ELT (children, j, child) - vect_free_slp_tree (child); + if (child) + vect_free_slp_tree (child); vect_free_oprnd_info (oprnds_info); return NULL; } @@ -1670,7 +1781,9 @@ fail: unsigned n_vector_builds = 0; FOR_EACH_VEC_ELT (children, j, child) { - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + if (!child) + ; + else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) all_uniform_p = false; else if (!vect_slp_tree_uniform_p (child)) { @@ -1684,7 +1797,8 @@ fail: /* Roll back. */ matches[0] = false; FOR_EACH_VEC_ELT (children, j, child) - vect_free_slp_tree (child); + if (child) + vect_free_slp_tree (child); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -1718,7 +1832,7 @@ fail: /* Here we record the original defs since this node represents the final lane configuration. */ - node = vect_create_new_slp_node (stmts, 2); + node = vect_create_new_slp_node (node, stmts, 2); SLP_TREE_VECTYPE (node) = vectype; SLP_TREE_CODE (node) = VEC_PERM_EXPR; SLP_TREE_CHILDREN (node).quick_push (one); @@ -1749,7 +1863,7 @@ fail: return node; } - node = vect_create_new_slp_node (stmts, nops); + node = vect_create_new_slp_node (node, stmts, nops); SLP_TREE_VECTYPE (node) = vectype; SLP_TREE_CHILDREN (node).splice (children); return node; @@ -1835,7 +1949,8 @@ vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, vect_print_slp_tree (dump_kind, loc, node); FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_print_slp_graph (dump_kind, loc, child, visited); + if (child) + vect_print_slp_graph (dump_kind, loc, child, visited); } static void @@ -1865,7 +1980,8 @@ vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) STMT_SLP_TYPE (stmt_info) = pure_slp; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_mark_slp_stmts (child, visited); + if (child) + vect_mark_slp_stmts (child, visited); } static void @@ -1898,7 +2014,8 @@ vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) } FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_mark_slp_stmts_relevant (child, visited); + if (child) + vect_mark_slp_stmts_relevant (child, visited); } static void @@ -1915,7 +2032,7 @@ static void vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, hash_set<slp_tree> &visited) { - if (visited.add (node)) + if (!node || visited.add (node)) return; if (SLP_TREE_CHILDREN (node).length () == 0) @@ -2176,34 +2293,62 @@ vect_build_slp_instance (vec_info *vinfo, } } - /* If this is a reduction chain with a conversion in front - amend the SLP tree with a node for that. */ - if (kind == slp_inst_kind_reduc_chain - && STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) + /* Fixup SLP reduction chains. 
*/ + if (kind == slp_inst_kind_reduc_chain) { - /* Get at the conversion stmt - we know it's the single use - of the last stmt of the reduction chain. */ - gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt; + /* If this is a reduction chain with a conversion in front + amend the SLP tree with a node for that. */ + gimple *scalar_def + = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt; + if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) + { + /* Get at the conversion stmt - we know it's the single use + of the last stmt of the reduction chain. */ + use_operand_p use_p; + bool r = single_imm_use (gimple_assign_lhs (scalar_def), + &use_p, &scalar_def); + gcc_assert (r); + stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def); + next_info = vect_stmt_to_vectorize (next_info); + scalar_stmts = vNULL; + scalar_stmts.create (group_size); + for (unsigned i = 0; i < group_size; ++i) + scalar_stmts.quick_push (next_info); + slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1); + SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info); + SLP_TREE_CHILDREN (conv).quick_push (node); + SLP_INSTANCE_TREE (new_instance) = conv; + /* We also have to fake this conversion stmt as SLP reduction + group so we don't have to mess with too much code + elsewhere. */ + REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; + REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; + } + /* Fill the backedge child of the PHI SLP node. The + general matching code cannot find it because the + scalar code does not reflect how we vectorize the + reduction. */ use_operand_p use_p; - gimple *use_stmt; - bool r = single_imm_use (gimple_assign_lhs (tem), - &use_p, &use_stmt); - gcc_assert (r); - stmt_vec_info next_info = vinfo->lookup_stmt (use_stmt); - next_info = vect_stmt_to_vectorize (next_info); - scalar_stmts = vNULL; - scalar_stmts.create (group_size); - for (unsigned i = 0; i < group_size; ++i) - scalar_stmts.quick_push (next_info); - slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1); - SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info); - SLP_TREE_CHILDREN (conv).quick_push (node); - SLP_INSTANCE_TREE (new_instance) = conv; - /* We also have to fake this conversion stmt as SLP reduction - group so we don't have to mess with too much code - elsewhere. */ - REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; - REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; + imm_use_iterator imm_iter; + class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo)); + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, + gimple_get_lhs (scalar_def)) + /* There are exactly two non-debug uses, the reduction + PHI and the loop-closed PHI node. */ + if (!is_gimple_debug (USE_STMT (use_p)) + && gimple_bb (USE_STMT (use_p)) == loop->header) + { + auto_vec<stmt_vec_info, 64> phis (group_size); + stmt_vec_info phi_info + = vinfo->lookup_stmt (USE_STMT (use_p)); + for (unsigned i = 0; i < group_size; ++i) + phis.quick_push (phi_info); + slp_tree *phi_node = bst_map->get (phis); + unsigned dest_idx = loop_latch_edge (loop)->dest_idx; + SLP_TREE_CHILDREN (*phi_node)[dest_idx] + = SLP_INSTANCE_TREE (new_instance); + SLP_INSTANCE_TREE (new_instance)->refcnt++; + } } vinfo->slp_instances.safe_push (new_instance); @@ -2437,60 +2582,6 @@ vect_analyze_slp_instance (vec_info *vinfo, return res; } -/* Fill in backedge SLP children in the SLP graph. 
*/ - -static void -vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node, - scalar_stmts_to_slp_tree_map_t *bst_map, - hash_set<slp_tree> &visited) -{ - if (SLP_TREE_DEF_TYPE (node) != vect_internal_def - || visited.add (node)) - return; - - slp_tree child; - unsigned i; - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - if (child) - vect_analyze_slp_backedges (vinfo, child, bst_map, visited); - - /* Inductions are not vectorized by vectorizing their defining cycle - but by materializing the values from SCEV data. */ - if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (node)) - == vect_induction_def) - return; - - if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt)) - for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i) - { - auto_vec<stmt_vec_info, 64> stmts; - unsigned j; - stmt_vec_info phi_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info) - { - tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i); - stmt_vec_info def_info = vinfo->lookup_def (def); - if (!def_info) - break; - stmts.safe_push (vect_stmt_to_vectorize (def_info)); - } - if (j != SLP_TREE_LANES (node)) - continue; - slp_tree *edge_def = bst_map->get (stmts); - if (edge_def) - { - /* ??? We're currently not recording non-backedge children - of PHIs like external reduction initial values. Avoid - NULL entries in SLP_TREE_CHILDREN for those and thus - for now simply only record backedge defs at a - SLP_TREE_CHILDREN index possibly not matching that of - the corresponding PHI argument index. */ - SLP_TREE_CHILDREN (node).quick_push (*edge_def); - (*edge_def)->refcnt++; - } - } -} - /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP trees of packed scalar stmts if SLP is possible. */ @@ -2541,13 +2632,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) max_tree_size); } - /* Fill in backedges. */ - slp_instance instance; - hash_set<slp_tree> visited; - FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) - vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance), - bst_map, visited); - /* The map keeps a reference on SLP nodes built, release that. */ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); it != bst_map->end (); ++it) @@ -2572,11 +2656,16 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node, node->vertex = vertices.length (); vertices.safe_push (node); - if (SLP_TREE_CHILDREN (node).is_empty ()) + + bool leaf = true; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + if (child) + { + leaf = false; + vect_slp_build_vertices (visited, child, vertices, leafs); + } + if (leaf) leafs.safe_push (node->vertex); - else - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_slp_build_vertices (visited, child, vertices, leafs); } /* Fill the vertices and leafs vector with all nodes in the SLP graph. */ @@ -2654,7 +2743,8 @@ vect_optimize_slp (vec_info *vinfo) unsigned j; slp_tree child; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - add_edge (slpg, i, child->vertex); + if (child) + add_edge (slpg, i, child->vertex); } /* Compute (reverse) postorder on the inverted graph. */ @@ -2853,7 +2943,7 @@ vect_optimize_slp (vec_info *vinfo) slp_tree child; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) { - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def) continue; /* If the vector is uniform there's nothing to do. 
*/ @@ -3291,6 +3381,7 @@ vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, unsigned int group_size = SLP_TREE_LANES (node); SLP_TREE_DEF_TYPE (node) = vect_external_def; SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true); + SLP_TREE_LOAD_PERMUTATION (node).release (); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) { tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); @@ -3373,9 +3464,20 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, slp_tree child; /* Assume we can code-generate all invariants. */ - if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) + if (!node + || SLP_TREE_DEF_TYPE (node) == vect_constant_def + || SLP_TREE_DEF_TYPE (node) == vect_external_def) return true; + if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Failed cyclic SLP reference in %p", node); + return false; + } + gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def); + /* If we already analyzed the exact same set of scalar stmts we're done. We share the generated vector stmts for those. */ if (visited.contains (node) @@ -3404,8 +3506,9 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, other referrers. */ if (res) FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def - || SLP_TREE_DEF_TYPE (child) == vect_external_def) + if (child + && (SLP_TREE_DEF_TYPE (child) == vect_constant_def + || SLP_TREE_DEF_TYPE (child) == vect_external_def) /* Perform usual caching, note code-generation still code-gens these nodes multiple times but we expect to CSE them later. */ @@ -3459,8 +3562,12 @@ static void vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, slp_instance instance, stmt_vector_for_cost *cost_vec, - hash_set<stmt_vec_info> &svisited) + hash_set<stmt_vec_info> &svisited, + hash_set<slp_tree> &visited) { + if (visited.add (node)) + return; + unsigned i; stmt_vec_info stmt_info; stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); @@ -3479,7 +3586,7 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, gimple *orig_stmt = orig_stmt_info->stmt; ssa_op_iter op_iter; def_operand_p def_p; - FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF) + FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) { imm_use_iterator use_iter; gimple *use_stmt; @@ -3548,9 +3655,9 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, slp_tree child; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, - cost_vec, svisited); + cost_vec, svisited, visited); } /* Analyze statements in SLP instances of VINFO. 
Return true if the @@ -3615,11 +3722,13 @@ vect_slp_analyze_operations (vec_info *vinfo) if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo)) { hash_set<stmt_vec_info> svisited; + hash_set<slp_tree> visited; for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i) { vect_location = instance->location (); vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), - instance, &instance->cost_vec, svisited); + instance, &instance->cost_vec, svisited, + visited); } } @@ -3683,7 +3792,7 @@ vect_bb_partition_graph_r (bb_vec_info bb_vinfo, slp_tree child; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance, instance_leader, visited); } @@ -3759,7 +3868,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, the scalar cost. */ if (!STMT_VINFO_LIVE_P (stmt_info)) { - FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF) + FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) { imm_use_iterator use_iter; gimple *use_stmt; @@ -3803,7 +3912,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, auto_vec<bool, 20> subtree_life; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) { - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) { /* Do not directly pass LIFE to the recursive call, copy it to confine changes in the callee to the current child/subtree. */ @@ -4272,19 +4381,18 @@ vect_slp_function (function *fun) for (unsigned i = 0; i < n; i++) { basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); - - /* Split when a basic block has multiple predecessors or when the - edge into it exits a loop (because of implementation issues with - respect to placement of CTORs for externals). */ bool split = false; - edge e; - if (!single_pred_p (bb) - || ((e = single_pred_edge (bb)), - loop_exit_edge_p (e->src->loop_father, e))) - split = true; + /* Split when a BB is not dominated by the first block. */ + if (!bbs.is_empty () + && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0])) + split = true; + /* Split when the loop determined by the first block + is exited. This is because we eventually insert + invariants at region begin. */ else if (!bbs.is_empty () - && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0])) + && bbs[0]->loop_father != bb->loop_father + && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father)) split = true; if (split && !bbs.is_empty ()) @@ -5087,24 +5195,22 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, return true; } -/* Vectorize SLP instance tree in postorder. */ +/* Vectorize SLP NODE. */ static void -vect_schedule_slp_instance (vec_info *vinfo, - slp_tree node, slp_instance instance, - hash_set<slp_tree> &visited) +vect_schedule_slp_node (vec_info *vinfo, + slp_tree node, slp_instance instance) { gimple_stmt_iterator si; int i; slp_tree child; - /* See if we have already vectorized the node in the graph of the - SLP instance. */ - if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def - && SLP_TREE_VEC_STMTS (node).exists ()) - || SLP_TREE_VEC_DEFS (node).exists ()) + /* For existing vectors there's nothing to do. */ + if (SLP_TREE_VEC_DEFS (node).exists ()) return; + gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ()); + /* Vectorize externals and constants. 
*/ if (SLP_TREE_DEF_TYPE (node) == vect_constant_def || SLP_TREE_DEF_TYPE (node) == vect_external_def) @@ -5119,17 +5225,11 @@ vect_schedule_slp_instance (vec_info *vinfo, return; } - /* ??? If we'd have a way to mark backedges that would be cheaper. */ - if (visited.add (node)) - return; - - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_schedule_slp_instance (vinfo, child, instance, visited); + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); - stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "------>vectorizing SLP node starting from: %G", @@ -5148,13 +5248,12 @@ vect_schedule_slp_instance (vec_info *vinfo, last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); si = gsi_for_stmt (last_stmt_info->stmt); } - else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) - == cycle_phi_info_type) - || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) - == induc_vec_info_type)) + else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type + || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type + || STMT_VINFO_TYPE (stmt_info) == phi_info_type) + && SLP_TREE_CODE (node) != VEC_PERM_EXPR) { - /* For reduction and induction PHIs we do not use the - insertion iterator. */ + /* For PHI node vectorization we do not use the insertion iterator. */ si = gsi_none (); } else @@ -5277,7 +5376,7 @@ vect_remove_slp_scalar_calls (vec_info *vinfo, tree lhs; stmt_vec_info stmt_info; - if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) + if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def) return; if (visited.add (node)) @@ -5359,6 +5458,142 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) gsi_replace (&rgsi, rstmt, true); } +struct slp_scc_info +{ + bool on_stack; + int dfs; + int lowlink; +}; + +/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */ + +static void +vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance, + hash_map<slp_tree, slp_scc_info> &scc_info, + int &maxdfs, vec<slp_tree> &stack) +{ + bool existed_p; + slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p); + gcc_assert (!existed_p); + info->dfs = maxdfs; + info->lowlink = maxdfs; + info->on_stack = true; + maxdfs++; + stack.safe_push (node); + unsigned i; + slp_tree child; + + /* ??? We're keeping SLP_TREE_CHILDREN of externalized nodes. */ + if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + { + if (!child) + continue; + slp_scc_info *child_info = scc_info.get (child); + if (!child_info) + { + vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack); + /* Recursion might have re-allocated the node. */ + info = scc_info.get (node); + child_info = scc_info.get (child); + info->lowlink = MIN (info->lowlink, child_info->lowlink); + } + else if (child_info->on_stack) + info->lowlink = MIN (info->lowlink, child_info->dfs); + } + if (info->lowlink != info->dfs) + return; + + /* Singleton. */ + if (stack.last () == node) + { + stack.pop (); + info->on_stack = false; + vect_schedule_slp_node (vinfo, node, instance); + return; + } + /* SCC. */ + int last_idx = stack.length () - 1; + while (stack[last_idx] != node) + last_idx--; + /* We can break the cycle at PHIs who have at least one child + code generated. 
Then we could re-start the DFS walk until + all nodes in the SCC are covered (we might have new entries + for only back-reachable nodes). But it's simpler to just + iterate and schedule those that are ready. */ + auto_vec<slp_tree, 4> phis_to_fixup; + unsigned todo = stack.length () - last_idx; + do + { + for (int idx = stack.length () - 1; idx >= last_idx; --idx) + { + slp_tree entry = stack[idx]; + if (!entry) + continue; + bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR + && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt)); + bool ready = !phi; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child) + if (!child) + { + gcc_assert (phi); + ready = true; + break; + } + else if (scc_info.get (child)->on_stack) + { + if (!phi) + { + ready = false; + break; + } + } + else + { + if (phi) + { + ready = true; + break; + } + } + if (ready) + { + vect_schedule_slp_node (vinfo, entry, instance); + scc_info.get (entry)->on_stack = false; + stack[idx] = NULL; + todo--; + if (phi) + phis_to_fixup.safe_push (entry); + } + } + } + while (todo != 0); + + /* Now fixup the backedge def of the vectorized PHIs in this SCC. */ + slp_tree phi_node; + FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node) + { + gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt); + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds) + { + unsigned dest_idx = e->dest_idx; + child = SLP_TREE_CHILDREN (phi_node)[dest_idx]; + if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def) + continue; + /* Simply fill all args. */ + for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i) + add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]), + vect_get_slp_vect_def (child, i), + e, gimple_phi_arg_location (phi, dest_idx)); + } + } + + /* Pop the SCC. */ + stack.truncate (last_idx); +} + /* Generate vector code for SLP_INSTANCES in the loop/basic block. */ void @@ -5367,7 +5602,8 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances) slp_instance instance; unsigned int i; - hash_set<slp_tree> visited; + hash_map<slp_tree, slp_scc_info> scc_info; + int maxdfs = 0; FOR_EACH_VEC_ELT (slp_instances, i, instance) { slp_tree node = SLP_INSTANCE_TREE (instance); @@ -5381,8 +5617,11 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances) vect_print_slp_graph (MSG_NOTE, vect_location, SLP_INSTANCE_TREE (instance)); } - /* Schedule the tree of INSTANCE. */ - vect_schedule_slp_instance (vinfo, node, instance, visited); + /* Schedule the tree of INSTANCE, scheduling SCCs in a way to + have a PHI be the node breaking the cycle. */ + auto_vec<slp_tree> stack; + if (!scc_info.get (node)) + vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack); if (SLP_INSTANCE_ROOT_STMT (instance)) vectorize_slp_instance_root_stmt (node, instance); @@ -5398,25 +5637,6 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances) stmt_vec_info store_info; unsigned int j; - /* For reductions set the latch values of the vectorized PHIs. 
*/ - if (instance->reduc_phis - && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE - (instance->reduc_phis)) != FOLD_LEFT_REDUCTION - && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE - (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION) - { - slp_tree slp_node = root; - slp_tree phi_node = instance->reduc_phis; - gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); - edge e = loop_latch_edge (gimple_bb (phi)->loop_father); - gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () - == SLP_TREE_VEC_STMTS (slp_node).length ()); - for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j) - add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]), - vect_get_slp_vect_def (slp_node, j), - e, gimple_phi_arg_location (phi, e->dest_idx)); - } - /* Remove scalar call stmts. Do not do this for basic-block vectorization as not all uses may be vectorized. ??? Why should this be necessary? DCE should be able to diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 3575f25..7f0763f 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -10745,7 +10745,8 @@ vect_analyze_stmt (vec_info *vinfo, || vectorizable_condition (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node, - cost_vec)); + cost_vec) + || vectorizable_phi (vinfo, stmt_info, NULL, node)); } if (!ok) @@ -10885,6 +10886,11 @@ vect_transform_stmt (vec_info *vinfo, gcc_assert (done); break; + case phi_info_type: + done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node); + gcc_assert (done); + break; + default: if (!STMT_VINFO_LIVE_P (stmt_info)) { diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 0e08652..b63dda3 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -684,7 +684,8 @@ vec_info::new_stmt_vec_info (gimple *stmt) STMT_VINFO_SLP_VECT_ONLY (res) = false; STMT_VINFO_VEC_STMTS (res) = vNULL; - if (gimple_code (stmt) == GIMPLE_PHI + if (is_a <loop_vec_info> (this) + && gimple_code (stmt) == GIMPLE_PHI && is_loop_header_bb_p (gimple_bb (stmt))) STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type; else diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 9c55383..13a02cd 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -881,6 +881,7 @@ enum stmt_vec_info_type { type_conversion_vec_info_type, cycle_phi_info_type, lc_phi_info_type, + phi_info_type, loop_exit_ctrl_vec_info_type }; @@ -1939,6 +1940,7 @@ extern bool vect_transform_cycle_phi (loop_vec_info, stmt_vec_info, slp_tree, slp_instance); extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info, gimple **, slp_tree); +extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree); extern bool vect_worthwhile_without_simd_p (vec_info *, tree_code); extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, stmt_vector_for_cost *, |
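The cycle-aware scheduling the patch introduces in vect_schedule_scc is Tarjan's strongly-connected-components algorithm: every SLP node gets a DFS index and a lowlink, a node whose lowlink equals its own DFS index roots an SCC, and that SCC is then code-generated as a unit, PHIs first, with their backedge arguments filled in afterwards. A minimal standalone sketch of the dfs/lowlink/on_stack bookkeeping follows; the Node type, scc_walk and the toy graph are illustrative stand-ins, not GCC internals.

// Toy Tarjan SCC walk mirroring the dfs/lowlink/on_stack fields of
// slp_scc_info; Node and scc_walk are hypothetical, not GCC API.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Node
{
  std::vector<int> succs;	// outgoing edges, i.e. SLP children
  int dfs = -1, lowlink = -1;
  bool on_stack = false;
};

static void
scc_walk (std::vector<Node> &g, int u, int &maxdfs, std::vector<int> &stack)
{
  g[u].dfs = g[u].lowlink = maxdfs++;
  g[u].on_stack = true;
  stack.push_back (u);

  for (int v : g[u].succs)
    if (g[v].dfs == -1)
      {
	/* Unvisited child: recurse and inherit its lowlink.  */
	scc_walk (g, v, maxdfs, stack);
	g[u].lowlink = std::min (g[u].lowlink, g[v].lowlink);
      }
    else if (g[v].on_stack)
      /* Backedge into the SCC currently being discovered.  */
      g[u].lowlink = std::min (g[u].lowlink, g[v].dfs);

  /* u roots an SCC: pop it and everything above it as one unit;
     this is the point where vect_schedule_scc code-generates.  */
  if (g[u].lowlink == g[u].dfs)
    {
      printf ("SCC:");
      int v;
      do
	{
	  v = stack.back ();
	  stack.pop_back ();
	  g[v].on_stack = false;
	  printf (" %d", v);
	}
      while (v != u);
      printf ("\n");
    }
}

int
main ()
{
  /* 0 -> 1 -> 2 -> 1 models a store (0) fed by a reduction PHI (1)
     whose backedge definition (2) closes the cycle.  */
  std::vector<Node> g (3);
  g[0].succs = {1};
  g[1].succs = {2};
  g[2].succs = {1};
  int maxdfs = 0;
  std::vector<int> stack;
  scc_walk (g, 0, maxdfs, stack);	// prints "SCC: 2 1" then "SCC: 0"
  return 0;
}

Note the output order: the {1,2} cycle is popped before the singleton 0, matching how the patch schedules a PHI cycle before the nodes consuming it. Within a non-singleton SCC stack order alone is not enough, which is why vect_schedule_scc drains the SCC with an iterate-until-ready loop that starts at PHIs, whose backedge operand may legitimately be emitted later.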