7 files changed, 196 insertions, 16 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 07e091b..3b15e2e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2015-07-02  Alan Lawrence  <alan.lawrence@arm.com>
+
+	* tree-pass.h (make_pass_ch_vect): New.
+	* passes.def: Add pass_ch_vect just before pass_if_conversion.
+
+	* tree-ssa-loop-ch.c (ch_base, pass_ch_vect, pass_data_ch_vect,
+	pass_ch::process_loop_p, pass_ch_vect::process_loop_p,
+	make_pass_ch_vect): New.
+	(pass_ch): Extend ch_base.
+
+	(pass_ch::execute): Move all but loop_optimizer_init/finalize to...
+	(ch_base::copy_headers): ...here.
+
 2015-07-02  Richard Biener  <rguenther@suse.de>
 
 	* builtins.c (get_pointer_alignment_1): Handle POINTER_PLUS_EXPR.
diff --git a/gcc/passes.def b/gcc/passes.def
index 9ced655..0d8356b 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -247,6 +247,7 @@ along with GCC; see the file COPYING3.  If not see
 	  PUSH_INSERT_PASSES_WITHIN (pass_parallelize_loops)
 	      NEXT_PASS (pass_expand_omp_ssa);
 	  POP_INSERT_PASSES ()
+	  NEXT_PASS (pass_ch_vect);
 	  NEXT_PASS (pass_if_conversion);
 	  /* pass_vectorize must immediately follow pass_if_conversion.
 	     Please do not add any other passes in between.  */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index de7c440..e7b749b 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2015-07-02  Alan Lawrence  <alan.lawrence@arm.com>
+
+	* gcc.dg/vect/vect-strided-a-u16-i4.c (main1): Narrow scope of x,y,z,w.
+	* gcc.dg/vect/vect-ifcvt-11.c: New testcase.
+
 2015-07-02  Richard Biener  <rguenther@suse.de>
 
 	PR testsuite/66719
diff --git a/gcc/testsuite/gcc.dg/vect/vect-ifcvt-11.c b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-11.c
new file mode 100644
index 0000000..7e32369
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-11.c
@@ -0,0 +1,36 @@
+/* { dg-require-effective-target vect_condition } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 16
+
+extern void abort (void);
+
+int A[N] = {36, 39, 42, 45, 43, 32, 21, 12, 23, 34, 45, 56, 67, 78, 81, 11};
+int B[N] = {144,195,210,225,172,128,105,60, 92, 136,225,280,268,390,324,55};
+
+__attribute__((noinline))
+void foo ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      int m = (A[i] & i) ? 5 : 4;
+      A[i] = A[i] * m;
+    }
+}
+
+int main ()
+{
+
+  check_vect ();
+  foo ();
+  /* check results:  */
+  for (int i = 0; i < N; i++)
+    if (A[i] != B[i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-strided-a-u16-i4.c b/gcc/testsuite/gcc.dg/vect/vect-strided-a-u16-i4.c
index af33ed4..0be68b3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-strided-a-u16-i4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-strided-a-u16-i4.c
@@ -21,7 +21,6 @@ main1 ()
   s *ptr = arr;
   s res[N];
   int i;
-  unsigned short x, y, z, w;
 
   for (i = 0; i < N; i++)
     {
@@ -35,6 +34,7 @@ main1 ()
 
   for (i = 0; i < N; i++)
     {
+      unsigned short x, y, z, w;
       x = ptr->b - ptr->a;
       y = ptr->d - ptr->c;
       res[i].c = x + y;
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 4545556..2808dad 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -381,6 +381,7 @@ extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_ccp (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_phi_only_cprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_build_ssa (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-loop-ch.c b/gcc/tree-ssa-loop-ch.c
index 98534ba..121e3d8 100644
--- a/gcc/tree-ssa-loop-ch.c
+++ b/gcc/tree-ssa-loop-ch.c
@@ -135,12 +135,23 @@ do_while_loop_p (struct loop *loop)
   return true;
 }
 
-/* For all loops, copy the condition at the end of the loop body in front
-   of the loop.  This is beneficial since it increases efficiency of
-   code motion optimizations.  It also saves one jump on entry to the loop.  */
-
 namespace {
 
+/* Common superclass for both header-copying phases.  */
+class ch_base : public gimple_opt_pass
+{
+  protected:
+    ch_base (pass_data data, gcc::context *ctxt)
+      : gimple_opt_pass (data, ctxt)
+    {}
+
+  /* Copies headers of all loops in FUN for which process_loop_p is true.  */
+  unsigned int copy_headers (function *fun);
+
+  /* Return true to copy headers of LOOP or false to skip.  */
+  virtual bool process_loop_p (struct loop *loop) = 0;
+};
+
 const pass_data pass_data_ch =
 {
   GIMPLE_PASS, /* type */
@@ -154,21 +165,68 @@ const pass_data pass_data_ch =
   0, /* todo_flags_finish */
 };
 
-class pass_ch : public gimple_opt_pass
+class pass_ch : public ch_base
 {
 public:
   pass_ch (gcc::context *ctxt)
-    : gimple_opt_pass (pass_data_ch, ctxt)
+    : ch_base (pass_data_ch, ctxt)
   {}
 
   /* opt_pass methods: */
   virtual bool gate (function *) { return flag_tree_ch != 0; }
+  
+  /* Initialize and finalize loop structures, copying headers in between.  */
   virtual unsigned int execute (function *);
 
+protected:
+  /* ch_base method: */
+  virtual bool process_loop_p (struct loop *loop);
 }; // class pass_ch
 
+const pass_data pass_data_ch_vect =
+{
+  GIMPLE_PASS, /* type */
+  "ch_vect", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_TREE_CH, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+/* This is a more aggressive version of the same pass, designed to run just
+   before if-conversion and vectorization, to put more loops into the form
+   required for those phases.  */
+class pass_ch_vect : public ch_base
+{
+public:
+  pass_ch_vect (gcc::context *ctxt)
+    : ch_base (pass_data_ch_vect, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *fun)
+  {
+    return flag_tree_ch != 0
+	   && (flag_tree_loop_vectorize != 0 || fun->has_force_vectorize_loops);
+  }
+  
+  /* Just copy headers, no initialization/finalization of loop structures.  */
+  virtual unsigned int execute (function *);
+
+protected:
+  /* ch_base method: */
+  virtual bool process_loop_p (struct loop *loop);
+}; // class pass_ch_vect
+
+/* For all loops, copy the condition at the end of the loop body in front
+   of the loop.  This is beneficial since it increases efficiency of
+   code motion optimizations.  It also saves one jump on entry to the loop.  */
+
 unsigned int
-pass_ch::execute (function *fun)
+ch_base::copy_headers (function *fun)
 {
   struct loop *loop;
   basic_block header;
@@ -178,13 +236,8 @@ pass_ch::execute (function *fun)
   unsigned bbs_size;
   bool changed = false;
 
-  loop_optimizer_init (LOOPS_HAVE_PREHEADERS
-		       | LOOPS_HAVE_SIMPLE_LATCHES);
   if (number_of_loops (fun) <= 1)
-    {
-      loop_optimizer_finalize ();
       return 0;
-    }
 
   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
   copied_bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (fun));
@@ -201,7 +254,7 @@ pass_ch::execute (function *fun)
 	 written as such, or because jump threading transformed it into one),
 	 we might be in fact peeling the first iteration of the loop.  This
 	 in general is not a good idea.  */
-      if (do_while_loop_p (loop))
+      if (!process_loop_p (loop))
 	continue;
 
       /* Iterate the header copying up to limit; this takes care of the cases
@@ -288,17 +341,88 @@ pass_ch::execute (function *fun)
       changed = true;
     }
 
-  update_ssa (TODO_update_ssa);
+  if (changed)
+    update_ssa (TODO_update_ssa);
   free (bbs);
   free (copied_bbs);
 
-  loop_optimizer_finalize ();
   return changed ? TODO_cleanup_cfg : 0;
 }
 
+/* Initialize the loop structures we need, and finalize after.  */
+
+unsigned int
+pass_ch::execute (function *fun)
+{
+  loop_optimizer_init (LOOPS_HAVE_PREHEADERS
+		       | LOOPS_HAVE_SIMPLE_LATCHES);
+
+  unsigned int res = copy_headers (fun);
+
+  loop_optimizer_finalize ();
+  return res;
+}
+
+/* Assume an earlier phase has already initialized all the loop structures that
+   we need here (and perhaps others too), and that these will be finalized by
+   a later phase.  */
+   
+unsigned int
+pass_ch_vect::execute (function *fun)
+{
+  return copy_headers (fun);
+}
+
+/* Apply header copying according to a very simple test of do-while shape.  */
+
+bool
+pass_ch::process_loop_p (struct loop *loop)
+{
+  return !do_while_loop_p (loop);
+}
+
+/* Apply header-copying to loops where we might enable vectorization.  */
+
+bool
+pass_ch_vect::process_loop_p (struct loop *loop)
+{
+  /* Test the same vectorization flag as the gate does, so both agree.  */
+  if (!flag_tree_loop_vectorize && !loop->force_vectorize)
+    return false;
+  if (loop->dont_vectorize)
+    return false;
+
+  if (!do_while_loop_p (loop))
+    return true;
+
+  /* The vectorizer won't handle anything with multiple exits, so skip.  */
+  edge exit = single_exit (loop);
+  if (!exit)
+    return false;
+
+  /* Copy headers iff there looks to be code in the loop after the exit block,
+     i.e. the exit block has an edge to another block (besides the latch,
+     which should be empty).  */
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, exit->src->succs)
+    if (!loop_exit_edge_p (loop, e)
+	&& e->dest != loop->header
+	&& e->dest != loop->latch)
+      return true;
+
+  return false;
+}
+
 } // anon namespace
 
 gimple_opt_pass *
+make_pass_ch_vect (gcc::context *ctxt)
+{
+  return new pass_ch_vect (ctxt);
+}
+
+gimple_opt_pass *
 make_pass_ch (gcc::context *ctxt)
 {
   return new pass_ch (ctxt);