-rw-r--r--  gcc/ChangeLog                                  |  71
-rw-r--r--  gcc/config/i386/i386.c                         |   5
-rw-r--r--  gcc/config/rs6000/rs6000.c                     | 116
-rw-r--r--  gcc/config/spu/spu.c                           |   6
-rw-r--r--  gcc/doc/tm.texi                                |   4
-rw-r--r--  gcc/doc/tm.texi.in                             |   2
-rw-r--r--  gcc/target.def                                 |   6
-rw-r--r--  gcc/target.h                                   |   1
-rw-r--r--  gcc/targhooks.c                                |   5
-rw-r--r--  gcc/targhooks.h                                |   2
-rw-r--r--  gcc/testsuite/ChangeLog                        |  17
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-109.c           |   4
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-42.c            |   5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-56.c            |   6
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-60.c            |   6
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-93.c            |   2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-96.c            |   2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c  |  21
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c  |  21
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-peel-1.c        |  51
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-peel-2.c        |  52
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-peel-3.c        |  55
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-peel-4.c        |  47
-rw-r--r--  gcc/tree-vect-data-refs.c                      | 462
-rw-r--r--  gcc/tree-vect-loop-manip.c                     |  19
-rw-r--r--  gcc/tree-vect-loop.c                           | 171
-rw-r--r--  gcc/tree-vect-slp.c                            |   4
-rw-r--r--  gcc/tree-vect-stmts.c                          | 179
-rw-r--r--  gcc/tree-vectorizer.h                          |  30
29 files changed, 1176 insertions(+), 196 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a5e5c16..c69bc6c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,74 @@
+2010-07-04 Ira Rosen <irar@il.ibm.com>
+ Revital Eres <eres@il.ibm.com>
+
+ * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST):
+ Document new arguments.
+ * doc/tm.texi: Regenerate.
+ * targhooks.c (default_builtin_vectorization_cost): Add new arguments.
+ Handle unaligned store.
+ * targhooks.h (default_builtin_vectorization_cost): Add new arguments.
+ * target.def (builtin_vectorization_cost): Add new arguments.
+ * target.h (enum vect_cost_for_stmt): Add unaligned_store.
+ * tree-vect-loop-manip.c (vect_gen_niters_for_prolog_loop): Take number
+ of iterations of prolog loop directly from LOOP_PEELING_FOR_ALIGNMENT.
+ (vect_vfa_segment_size): Fix indentation.
+ * tree-vectorizer.h (struct _vect_peel_info): New.
+ (struct _vect_peel_extended_info): New.
+ (struct _loop_vec_info): Add new field for peeling hash table and a
+ macro for its access.
+ (VECT_MAX_COST): Define.
+ (vect_get_load_cost): Declare.
+ (vect_get_store_cost, vect_get_known_peeling_cost,
+ vect_get_single_scalar_iteration_cost): Likewise.
+ (vect_supportable_dr_alignment): Add new argument.
+ * tree-vect-loop.c (new_loop_vec_info): Initialize peeling hash table
+ field.
+ (destroy_loop_vec_info): Free peeling hash table.
+ (vect_analyze_loop_form): Update call to builtin_vectorization_cost.
+ (vect_analyze_loop): Move vect_enhance_data_refs_alignment before
+ vect_analyze_slp. Fix indentation.
+ (vect_get_single_scalar_iteration_cost): New function.
+ (vect_get_known_peeling_cost): Likewise.
+ (vect_estimate_min_profitable_iters): Rename byte_misalign to npeel.
+ Call vect_get_single_scalar_iteration_cost instead of cost_for_stmt per
+ statement. Move the outside cost calculation into the unknown peeling
+ case. Call vect_get_known_peeling_cost for a known amount of peeling.
+ * tree-vect-data-refs.c (vect_compute_data_ref_alignment): Add data
+ reference to the print message of forced alignment.
+ (vect_verify_datarefs_alignment): Update call to
+ vect_supportable_dr_alignment.
+ (vect_get_data_access_cost): New function.
+ (vect_peeling_hash, vect_peeling_hash_eq, vect_peeling_hash_insert,
+ vect_peeling_hash_get_most_frequent, vect_peeling_hash_get_lowest_cost,
+ vect_peeling_hash_choose_best_peeling): Likewise.
+ (vect_enhance_data_refs_alignment): Fix documentation. Use a hash table
+ to store all the accesses in the loop and find the best possible access
+ to align using peeling in the known alignment case. For unknown
+ alignment, check if stores are preferred or if peeling is worthwhile.
+ (vect_find_same_alignment_drs): Analyze pairs of loads too.
+ (vect_supportable_dr_alignment): Add new argument and check aligned
+ accesses according to it.
+ * tree-vect-stmts.c (vect_get_stmt_cost): New function.
+ (cost_for_stmt): Call vect_get_stmt_cost.
+ (vect_model_simple_cost): Likewise.
+ (vect_model_store_cost): Call vect_get_stmt_cost. Call
+ vect_get_store_cost to calculate the cost of the statement.
+ (vect_get_store_cost): New function.
+ (vect_model_load_cost): Call vect_get_stmt_cost. Call
+ vect_get_load_cost to calculate the cost of the statement.
+ (vect_get_load_cost): New function.
+ (vectorizable_store): Update call to vect_supportable_dr_alignment.
+ (vectorizable_load): Likewise.
+ * config/spu/spu.c (spu_builtin_vectorization_cost): Add new
+ arguments.
+ * config/i386/i386.c (ix86_builtin_vectorization_cost): Add new
+ arguments. Handle unaligned store.
+ * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): New.
+ (rs6000_builtin_support_vector_misalignment): Return true for word and
+ double word alignments for VSX.
+ * tree-vect-slp.c (vect_build_slp_tree): Update calls to
+ vect_supportable_dr_alignment and builtin_vectorization_cost.
+
2010-07-03 John David Anglin <dave.anglin@nrc-cnrc.gc.ca>
PR target/44597
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ec2cdd3..711fc16 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29376,7 +29376,9 @@ static const struct attribute_spec ix86_attribute_table[] =
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost)
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype ATTRIBUTE_UNUSED,
+ int misalign ATTRIBUTE_UNUSED)
{
switch (type_of_cost)
{
@@ -29405,6 +29407,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost)
return ix86_cost->scalar_to_vec_cost;
case unaligned_load:
+ case unaligned_store:
return ix86_cost->vec_unalign_load_cost;
case cond_branch_taken:
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a98b4dd..45bc230 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1075,6 +1075,8 @@ static bool rs6000_builtin_support_vector_misalignment (enum
machine_mode,
const_tree,
int, bool);
+static int rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt,
+ tree, int);
static void def_builtin (int, const char *, tree, int);
static bool rs6000_vector_alignment_reachable (const_tree, bool);
@@ -1467,6 +1469,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
rs6000_builtin_support_vector_misalignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+ rs6000_builtin_vectorization_cost
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS rs6000_init_builtins
@@ -3333,12 +3338,19 @@ rs6000_builtin_support_vector_misalignment (enum machine_mode mode,
if (misalignment == -1)
{
- /* misalignment factor is unknown at compile time but we know
+ /* Misalignment factor is unknown at compile time but we know
it's word aligned. */
if (rs6000_vector_alignment_reachable (type, is_packed))
- return true;
+ {
+ int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
+
+ if (element_size == 64 || element_size == 32)
+ return true;
+ }
+
return false;
}
+
/* VSX supports word-aligned vector. */
if (misalignment % 4 == 0)
return true;
@@ -3404,6 +3416,106 @@ rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
return d;
}
+
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int misalign)
+{
+ unsigned elements;
+
+ switch (type_of_cost)
+ {
+ case scalar_stmt:
+ case scalar_load:
+ case scalar_store:
+ case vector_stmt:
+ case vector_load:
+ case vector_store:
+ case vec_to_scalar:
+ case scalar_to_vec:
+ case cond_branch_not_taken:
+ case vec_perm:
+ return 1;
+
+ case cond_branch_taken:
+ return 3;
+
+ case unaligned_load:
+ if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN)
+ {
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ if (elements == 2)
+ /* Double word aligned. */
+ return 2;
+
+ if (elements == 4)
+ {
+ switch (misalign)
+ {
+ case 8:
+ /* Double word aligned. */
+ return 2;
+
+ case -1:
+ /* Unknown misalignment. */
+ case 4:
+ case 12:
+ /* Word aligned. */
+ return 22;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+ }
+
+ if (TARGET_ALTIVEC)
+ /* Misaligned loads are not supported. */
+ gcc_unreachable ();
+
+ return 2;
+
+ case unaligned_store:
+ if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN)
+ {
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ if (elements == 2)
+ /* Double word aligned. */
+ return 2;
+
+ if (elements == 4)
+ {
+ switch (misalign)
+ {
+ case 8:
+ /* Double word aligned. */
+ return 2;
+
+ case -1:
+ /* Unknown misalignment. */
+ case 4:
+ case 12:
+ /* Word aligned. */
+ return 23;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+ }
+
+ if (TARGET_ALTIVEC)
+ /* Misaligned stores are not supported. */
+ gcc_unreachable ();
+
+ return 2;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
/* Handle generic options of the form -mfoo=yes/no.
NAME is the option name.
VALUE is the option value.
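
The VSX numbers above encode a simple rule: two-element (double-word) vectors
and double-word-aligned four-element accesses stay cheap, while word-aligned or
unknown-misalignment four-element accesses pay a large penalty (22 for loads,
23 for stores). A standalone model of that switch may help to check the
numbers; this is an illustrative sketch, not GCC code, and the harness below is
an assumption:

    #include <stdio.h>

    /* Model of the rs6000 unaligned-access costs on the VSX path above:
       2-element (double-word) vectors are always cheap, and 4-element
       vectors are cheap only when double-word aligned (misalign == 8).  */
    static int
    model_unaligned_cost (int elements, int misalign, int is_store)
    {
      if (elements == 2 || misalign == 8)
        return 2;                       /* Double word aligned.  */
      /* Word aligned (4, 12) or unknown misalignment (-1).  */
      return is_store ? 23 : 22;
    }

    int
    main (void)
    {
      printf ("V4SI load,  misalign 8: %d\n", model_unaligned_cost (4, 8, 0));
      printf ("V4SI load,  misalign 4: %d\n", model_unaligned_cost (4, 4, 0));
      printf ("V4SI store, unknown:    %d\n", model_unaligned_cost (4, -1, 1));
      return 0;
    }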
diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c
index 4b7f916..3d4f587 100644
--- a/gcc/config/spu/spu.c
+++ b/gcc/config/spu/spu.c
@@ -209,7 +209,7 @@ static rtx spu_addr_space_legitimize_address (rtx, rtx, enum machine_mode,
static tree spu_builtin_mul_widen_even (tree);
static tree spu_builtin_mul_widen_odd (tree);
static tree spu_builtin_mask_for_load (void);
-static int spu_builtin_vectorization_cost (enum vect_cost_for_stmt);
+static int spu_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int);
static bool spu_vector_alignment_reachable (const_tree, bool);
static tree spu_builtin_vec_perm (tree, tree *);
static enum machine_mode spu_addr_space_pointer_mode (addr_space_t);
@@ -6694,7 +6694,9 @@ spu_builtin_mask_for_load (void)
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
-spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost)
+spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype ATTRIBUTE_UNUSED,
+ int misalign ATTRIBUTE_UNUSED)
{
switch (type_of_cost)
{
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index d6bc604..17b582f 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5706,8 +5706,10 @@ preserved (e.g.@: used only by a reduction computation). Otherwise, the
@code{widen_mult_hi/lo} idioms will be used.
@end deftypefn
-@deftypefn {Target Hook} int TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST (enum vect_cost_for_stmt @var{type_of_cost})
+@deftypefn {Target Hook} int TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST (enum vect_cost_for_stmt @var{type_of_cost}, tree @var{vectype}, int @var{misalign})
Returns cost of different scalar or vector statements for vectorization cost model.
+For vector memory operations, the cost may depend on the type (@var{vectype})
+and the misalignment value (@var{misalign}).
@end deftypefn
@deftypefn {Target Hook} bool TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE (const_tree @var{type}, bool @var{is_packed})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 083d56f..e79341d 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -5708,6 +5708,8 @@ preserved (e.g.@: used only by a reduction computation). Otherwise, the
@hook TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
Returns cost of different scalar or vector statements for vectorization cost model.
+For vector memory operations, the cost may depend on the type (@var{vectype})
+and the misalignment value (@var{misalign}).
@end deftypefn
@hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
diff --git a/gcc/target.def b/gcc/target.def
index 8bcf877..6270925 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -789,11 +789,13 @@ DEFHOOK
"",
tree, (tree x), NULL)
-/* Cost of different vector/scalar statements in vectorization cost model. */
+/* Cost of different vector/scalar statements in vectorization cost
+ model. In the case of misaligned vector loads and stores, the cost
+ depends on the data type and the misalignment value. */
DEFHOOK
(builtin_vectorization_cost,
"",
- int, (enum vect_cost_for_stmt type_of_cost),
+ int, (enum vect_cost_for_stmt type_of_cost, tree vectype, int misalign),
default_builtin_vectorization_cost)
/* Return true if vector alignment is reachable (by peeling N
diff --git a/gcc/target.h b/gcc/target.h
index 18d160d..99dd1ee 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -119,6 +119,7 @@ enum vect_cost_for_stmt
vector_stmt,
vector_load,
unaligned_load,
+ unaligned_store,
vector_store,
vec_to_scalar,
scalar_to_vec,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 9271db8..1a49f0c 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -479,7 +479,9 @@ default_builtin_vectorized_conversion (unsigned int code ATTRIBUTE_UNUSED,
/* Default vectorizer cost model values. */
int
-default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost)
+default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype ATTRIBUTE_UNUSED,
+ int misalign ATTRIBUTE_UNUSED)
{
switch (type_of_cost)
{
@@ -496,6 +498,7 @@ default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost)
return 1;
case unaligned_load:
+ case unaligned_store:
return 2;
case cond_branch_taken:
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f491dbd..eb4b547 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -77,7 +77,7 @@ extern tree default_builtin_vectorized_function (tree, tree, tree);
extern tree default_builtin_vectorized_conversion (unsigned int, tree, tree);
-extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt);
+extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int);
extern tree default_builtin_reciprocal (unsigned int, bool, bool);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index a815dd1..5ae6c6e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,20 @@
+2010-07-04 Ira Rosen <irar@il.ibm.com>
+ Revital Eres <eres@il.ibm.com>
+
+ * gcc.dg/vect/vect-42.c: Don't expect peeling on targets that support
+ misaligned stores.
+ * gcc.dg/vect/vect-60.c, gcc.dg/vect/vect-56.c, gcc.dg/vect/vect-93.c,
+ gcc.dg/vect/vect-96.c: Likewise.
+ * gcc.dg/vect/vect-109.c: Expect vectorization only on targets that
+ support misaligned stores. Change the number of expected misaligned
+ accesses.
+ * gcc.dg/vect/vect-peel-1.c: New test.
+ * gcc.dg/vect/vect-peel-2.c, gcc.dg/vect/vect-peel-3.c,
+ gcc.dg/vect/vect-peel-4.c: Likewise.
+ * gcc.dg/vect/vect-multitypes-1.c: Change the test to make it
+ vectorizable on all targets that support realignment.
+ * gcc.dg/vect/vect-multitypes-4.c: Likewise.
+
2010-07-03 H.J. Lu <hongjiu.lu@intel.com>
PR c/44806
diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c
index 3939093..ddba263 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-109.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-109.c
@@ -72,8 +72,8 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_hw_misalign } } } */
/* { dg-final { scan-tree-dump-times "not vectorized: unsupported unaligned store" 2 "vect" { xfail vect_hw_misalign } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 10 "vect" { target vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 3 "vect" { target vect_hw_misalign } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-42.c b/gcc/testsuite/gcc.dg/vect/vect-42.c
index 3ba1c6f..fa83200 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-42.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-42.c
@@ -65,6 +65,7 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 3 "vect" { target vect_no_align } } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { { ! vector_alignment_reachable } && { ! vect_hw_misalign } } } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || { ! vector_alignment_reachable } } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || { ! vector_alignment_reachable } } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || { { ! vector_alignment_reachable } || vect_hw_misalign } } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 3 "vect" { target vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || { { ! vector_alignment_reachable } || vect_hw_misalign } } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-56.c b/gcc/testsuite/gcc.dg/vect/vect-56.c
index 7b7da12..1555d41 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-56.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-56.c
@@ -68,6 +68,8 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail { vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target { vect_hw_misalign } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-60.c b/gcc/testsuite/gcc.dg/vect/vect-60.c
index cbdf63d..ba8ffe6 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-60.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-60.c
@@ -69,6 +69,8 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail { vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target { vect_hw_misalign } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-93.c b/gcc/testsuite/gcc.dg/vect/vect-93.c
index 85666d9..dfb98cf 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-93.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-93.c
@@ -72,7 +72,7 @@ int main (void)
/* main && main1 together: */
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 2 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { target { vect_no_align && {! vector_alignment_reachable} } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 3 "vect" { xfail { { vect_no_align } || {! vector_alignment_reachable} } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 3 "vect" { xfail { { vect_no_align } || { { ! vector_alignment_reachable} || vect_hw_misalign } } } } } */
/* in main1: */
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target !powerpc*-*-* !i?86-*-* !x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-96.c b/gcc/testsuite/gcc.dg/vect/vect-96.c
index f392169..c7dea61 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-96.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-96.c
@@ -44,6 +44,6 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { {! vect_no_align} && vector_alignment_reachable } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { { vect_no_align } || {! vector_alignment_reachable} } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { { vect_no_align } || { { ! vector_alignment_reachable} || vect_hw_misalign } } } } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning." 1 "vect" { target { vect_no_align || { {! vector_alignment_reachable} && {! vect_hw_misalign} } } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
index e8fe027..7981c4a 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
@@ -27,13 +27,13 @@ __attribute__ ((noinline)) int main1 (int n)
for (i = 0; i < n; i++)
{
sa[i+7] = sb[i];
- ia[i+3] = ib[i];
+ ia[i+3] = ib[i+1];
}
/* check results: */
for (i = 0; i < n; i++)
{
- if (sa[i+7] != sb[i] || ia[i+3] != ib[i])
+ if (sa[i+7] != sb[i] || ia[i+3] != ib[i+1])
abort ();
}
@@ -44,7 +44,9 @@ __attribute__ ((noinline)) int main1 (int n)
access for peeling, and therefore will examine the option of
using a peeling factor = (V-3)%V = 1 for V=2,4.
This will not align the access 'sa[i+3]' (for which we need to
- peel 5 iterations), so the loop can not be vectorized. */
+ peel 5 iterations). However, 'ia[i+3]' also gets aligned if we peel 5
+ iterations, so the loop is vectorizable on all targets that support
+ unaligned loads. */
__attribute__ ((noinline)) int main2 (int n)
{
@@ -55,13 +57,13 @@ __attribute__ ((noinline)) int main2 (int n)
for (i = 0; i < n; i++)
{
ia[i+3] = ib[i];
- sa[i+3] = sb[i];
+ sa[i+3] = sb[i+1];
}
/* check results: */
for (i = 0; i < n; i++)
{
- if (sa[i+3] != sb[i] || ia[i+3] != ib[i])
+ if (sa[i+3] != sb[i+1] || ia[i+3] != ib[i])
abort ();
}
@@ -78,11 +80,8 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail {! vect_hw_misalign} } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail {! vect_hw_misalign} } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_align } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail { vect_no_align } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
index 274fb02..3a83491 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
@@ -20,7 +20,9 @@ unsigned int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,
access for peeling, and therefore will examine the option of
using a peeling factor = VF-7%VF. This will result in a peeling factor 1,
which will also align the access to 'ia[i+3]', and the loop could be
- vectorized on all targets that support unaligned loads. */
+ vectorized on all targets that support unaligned loads.
+ With the cost model disabled, no peeling is applied on targets that
+ support misaligned stores, since we want to keep the four loads aligned. */
__attribute__ ((noinline))
int main1 (int n)
@@ -50,7 +52,11 @@ int main1 (int n)
using a peeling factor = VF-3%VF. This will result in a peeling factor
1 if VF=4,2. This will not align the access to 'sa[i+3]', for which we
need to peel 5,1 iterations for VF=4,2 respectively, so the loop can not
- be vectorized. */
+ be vectorized. However, 'ia[i+3]' also gets aligned if we peel 5
+ iterations, so the loop is vectorizable on all targets that support
+ unaligned loads.
+ With the cost model disabled, no peeling is applied on targets that
+ support misaligned stores, since we want to keep the four loads aligned. */
__attribute__ ((noinline))
int main2 (int n)
@@ -85,11 +91,10 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail {! vect_hw_misalign} } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail {! vect_hw_misalign} } } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_align } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { target { vect_hw_misalign} } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { target { vect_hw_misalign } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c
new file mode 100644
index 0000000..ae77463
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c
@@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int ib[N+5];
+
+__attribute__ ((noinline))
+int main1 ()
+{
+ int i;
+ int ia[N+1];
+
+ /* All the accesses are misaligned. With the cost model disabled, we count
+ the number of aligned accesses for each peeling option, and in this case
+ we align the two loads if possible (i.e., if misaligned stores are
+ supported). */
+ for (i = 1; i <= N; i++)
+ {
+ ia[i] = ib[i+2] + ib[i+6];
+ }
+
+ /* check results: */
+ for (i = 1; i <= N; i++)
+ {
+ if (ia[i] != ib[i+2] + ib[i+6])
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N+5; i++)
+ ib[i] = i;
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
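
The expected counts in vect-peel-1.c follow from the byte offsets of the three
accesses; a quick standalone check, assuming 4-byte ints, 16-byte vectors, and
16-byte-aligned array bases (the common configuration, though not guaranteed on
every target):

    #include <stdio.h>

    int
    main (void)
    {
      /* First elements touched in the loop (i starts at 1):
         ia[1] (store), ib[3] and ib[7] (loads).  */
      int elt = 4, vecbytes = 16;

      printf ("store misalign: %d bytes\n", 1 * elt % vecbytes);  /* 4  */
      printf ("load1 misalign: %d bytes\n", 3 * elt % vecbytes);  /* 12 */
      printf ("load2 misalign: %d bytes\n", 7 * elt % vecbytes);  /* 12 */
      /* The two loads share a misalignment, so a single peeling amount
         aligns both and leaves only the store unaligned -- hence the test
         expects that option exactly when misaligned stores are supported.  */
      return 0;
    }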
diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-2.c b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c
new file mode 100644
index 0000000..ee7b8db
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c
@@ -0,0 +1,52 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+/* unaligned store. */
+
+int ib[N+5];
+
+__attribute__ ((noinline))
+int main1 ()
+{
+ int i;
+ int ia[N+1];
+
+ /* The store is aligned and the loads are misaligned with the same
+ misalignment. Cost model is disabled. If misaligned stores are supported,
+ we peel according to the loads to align them. */
+ for (i = 0; i <= N; i++)
+ {
+ ia[i] = ib[i+2] + ib[i+6];
+ }
+
+ /* check results: */
+ for (i = 1; i <= N; i++)
+ {
+ if (ia[i] != ib[i+2] + ib[i+6])
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N+5; i++)
+ ib[i] = i;
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target vect_hw_misalign } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c
new file mode 100644
index 0000000..80f03c8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c
@@ -0,0 +1,55 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+#define RES 21888
+
+/* unaligned store. */
+
+int ib[N+10];
+int ia[N+10];
+int ic[N+10];
+
+__attribute__ ((noinline))
+int main1 ()
+{
+ int i, suma = 0, sumb = 0, sumc = 0;
+
+ /* ib and ic have the same misalignment, so we peel to align them. */
+ for (i = 1; i <= N; i++)
+ {
+ suma += ia[i];
+ sumb += ib[i+6];
+ sumc += ic[i+2];
+ }
+
+ /* check results: */
+ if (suma + sumb + sumc != RES)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N+10; i++)
+ {
+ ib[i] = i;
+ ic[i] = i+2;
+ ia[i] = i/2;
+ }
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c
new file mode 100644
index 0000000..971d023
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c
@@ -0,0 +1,47 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int ib[N+5];
+
+__attribute__ ((noinline))
+int main1 ()
+{
+ int i;
+ int ia[N+1];
+
+ /* Don't peel; that keeps one load and the store aligned. */
+ for (i = 0; i <= N; i++)
+ {
+ ia[i] = ib[i] + ib[i+6];
+ }
+
+ /* check results: */
+ for (i = 1; i <= N; i++)
+ {
+ if (ia[i] != ib[i] + ib[i+6])
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N+5; i++)
+ ib[i] = i;
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index cbefc1f..cf9fab2 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -810,7 +810,11 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
NOTE: This is the only change to the code we make during
the analysis phase, before deciding to vectorize the loop. */
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "force alignment");
+ {
+ fprintf (vect_dump, "force alignment of ");
+ print_generic_expr (vect_dump, ref, TDF_SLIM);
+ }
+
DECL_ALIGN (base) = TYPE_ALIGN (vectype);
DECL_USER_ALIGN (base) = 1;
}
@@ -967,7 +971,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
|| !STMT_VINFO_VECTORIZABLE (stmt_info))
continue;
- supportable_dr_alignment = vect_supportable_dr_alignment (dr);
+ supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
if (!supportable_dr_alignment)
{
if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
@@ -1061,6 +1065,189 @@ vector_alignment_reachable_p (struct data_reference *dr)
return true;
}
+
+/* Calculate the cost of the memory access represented by DR. */
+
+static void
+vect_get_data_access_cost (struct data_reference *dr,
+ unsigned int *inside_cost,
+ unsigned int *outside_cost)
+{
+ gimple stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ int ncopies = vf / nunits;
+ bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
+
+ if (!supportable_dr_alignment)
+ *inside_cost = VECT_MAX_COST;
+ else
+ {
+ if (DR_IS_READ (dr))
+ vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost);
+ else
+ vect_get_store_cost (dr, ncopies, inside_cost);
+ }
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "vect_get_data_access_cost: inside_cost = %d, "
+ "outside_cost = %d.", *inside_cost, *outside_cost);
+}
+
+
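+/* Hash an element of the peeling hash table; the peeling amount is the key. */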
+static hashval_t
+vect_peeling_hash (const void *elem)
+{
+ const struct _vect_peel_info *peel_info;
+
+ peel_info = (const struct _vect_peel_info *) elem;
+ return (hashval_t) peel_info->npeel;
+}
+
+
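+/* Equality callback for the peeling hash table: two entries are equal if
+ they peel the same number of iterations. */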
+static int
+vect_peeling_hash_eq (const void *elem1, const void *elem2)
+{
+ const struct _vect_peel_info *a, *b;
+
+ a = (const struct _vect_peel_info *) elem1;
+ b = (const struct _vect_peel_info *) elem2;
+ return (a->npeel == b->npeel);
+}
+
+
+/* Insert DR into peeling hash table with NPEEL as key. */
+
+static void
+vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
+ int npeel)
+{
+ struct _vect_peel_info elem, *slot;
+ void **new_slot;
+ bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
+
+ elem.npeel = npeel;
+ slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
+ &elem);
+ if (slot)
+ slot->count++;
+ else
+ {
+ slot = XNEW (struct _vect_peel_info);
+ slot->npeel = npeel;
+ slot->dr = dr;
+ slot->count = 1;
+ new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
+ INSERT);
+ *new_slot = slot;
+ }
+
+ if (!supportable_dr_alignment && !flag_vect_cost_model)
+ slot->count += VECT_MAX_COST;
+}
+
+
+/* Traverse the peeling hash table to find the peeling option that aligns the
+ maximum number of data accesses. */
+
+static int
+vect_peeling_hash_get_most_frequent (void **slot, void *data)
+{
+ vect_peel_info elem = (vect_peel_info) *slot;
+ vect_peel_extended_info max = (vect_peel_extended_info) data;
+
+ if (elem->count > max->peel_info.count)
+ {
+ max->peel_info.npeel = elem->npeel;
+ max->peel_info.count = elem->count;
+ max->peel_info.dr = elem->dr;
+ }
+
+ return 1;
+}
+
+
+/* Traverse the peeling hash table and calculate the cost for each peeling
+ option. Find the one with the lowest cost. */
+
+static int
+vect_peeling_hash_get_lowest_cost (void **slot, void *data)
+{
+ vect_peel_info elem = (vect_peel_info) *slot;
+ vect_peel_extended_info min = (vect_peel_extended_info) data;
+ int save_misalignment, dummy;
+ unsigned int inside_cost = 0, outside_cost = 0, i;
+ gimple stmt = DR_STMT (elem->dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+ struct data_reference *dr;
+
+ for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
+ {
+ stmt = DR_STMT (dr);
+ stmt_info = vinfo_for_stmt (stmt);
+ /* For interleaving, only the alignment of the first access
+ matters. */
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
+ && DR_GROUP_FIRST_DR (stmt_info) != stmt)
+ continue;
+
+ save_misalignment = DR_MISALIGNMENT (dr);
+ vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
+ vect_get_data_access_cost (dr, &inside_cost, &outside_cost);
+ SET_DR_MISALIGNMENT (dr, save_misalignment);
+ }
+
+ outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel, &dummy,
+ vect_get_single_scalar_iteration_cost (loop_vinfo));
+
+ if (inside_cost < min->inside_cost
+ || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
+ {
+ min->inside_cost = inside_cost;
+ min->outside_cost = outside_cost;
+ min->peel_info.dr = elem->dr;
+ min->peel_info.npeel = elem->npeel;
+ }
+
+ return 1;
+}
+
+
+/* Choose the best peeling option by traversing the peeling hash table and
+ either choosing the option with the lowest cost (if the cost model is
+ enabled) or the option that aligns as many accesses as possible. */
+
+static struct data_reference *
+vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
+ unsigned int *npeel)
+{
+ struct _vect_peel_extended_info res;
+
+ res.peel_info.dr = NULL;
+
+ if (flag_vect_cost_model)
+ {
+ res.inside_cost = INT_MAX;
+ res.outside_cost = INT_MAX;
+ htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
+ vect_peeling_hash_get_lowest_cost, &res);
+ }
+ else
+ {
+ res.peel_info.count = 0;
+ htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
+ vect_peeling_hash_get_most_frequent, &res);
+ }
+
+ *npeel = res.peel_info.npeel;
+ return res.peel_info.dr;
+}
+
+
/* Function vect_enhance_data_refs_alignment
This pass will use loop versioning and loop peeling in order to enhance
@@ -1158,15 +1345,21 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
enum dr_alignment_support supportable_dr_alignment;
- struct data_reference *dr0 = NULL;
+ struct data_reference *dr0 = NULL, *first_store = NULL;
struct data_reference *dr;
- unsigned int i;
+ unsigned int i, j;
bool do_peeling = false;
bool do_versioning = false;
bool stat;
gimple stmt;
stmt_vec_info stmt_info;
int vect_versioning_for_alias_required;
+ unsigned int npeel = 0;
+ bool all_misalignments_unknown = true;
+ unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned possible_npeel_number = 1;
+ tree vectype;
+ unsigned int nelements, mis, same_align_drs_max = 0;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_enhance_data_refs_alignment ===");
@@ -1201,12 +1394,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
- How many accesses will become unaligned due to the peeling,
and the cost of misaligned accesses.
- The cost of peeling (the extra runtime checks, the increase
- in code size).
-
- The scheme we use FORNOW: peel to force the alignment of the first
- unsupported misaligned access in the loop.
-
- TODO: Use a cost model. */
+ in code size). */
for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
{
@@ -1219,15 +1407,108 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt)
continue;
- if (!DR_IS_READ (dr) && !aligned_access_p (dr))
+ supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
+ do_peeling = vector_alignment_reachable_p (dr);
+ if (do_peeling)
{
- do_peeling = vector_alignment_reachable_p (dr);
- if (do_peeling)
- dr0 = dr;
- if (!do_peeling && vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "vector alignment may not be reachable");
- break;
- }
+ if (known_alignment_for_access_p (dr))
+ {
+ unsigned int npeel_tmp;
+
+ /* Save info about DR in the hash table. */
+ if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
+ LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
+ htab_create (1, vect_peeling_hash,
+ vect_peeling_hash_eq, free);
+
+ vectype = STMT_VINFO_VECTYPE (stmt_info);
+ nelements = TYPE_VECTOR_SUBPARTS (vectype);
+ mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
+ TREE_TYPE (DR_REF (dr))));
+ npeel_tmp = (nelements - mis) % vf;
+
+ /* For multiple types, it is possible that the bigger type access
+ will have more than one peeling option. E.g., a loop with two
+ types: one of size (vector size / 4), and the other one of
+ size (vector size / 8). The vectorization factor will be 8. If both
+ accesses are misaligned by 3, the first one needs one scalar
+ iteration to be aligned, and the second one needs 5. But the
+ first one will also be aligned by peeling 5 scalar
+ iterations, and in that case both accesses will be aligned.
+ Hence, except for the immediate peeling amount, we also want
+ to try to add a full vector size, while we don't exceed the
+ vectorization factor.
+ We do this automatically for the cost model, since we calculate
+ the cost for every peeling option. */
+ if (!flag_vect_cost_model)
+ possible_npeel_number = vf / nelements;
+
+ /* Handle the aligned case. We may decide to align some other
+ access, making DR unaligned. */
+ if (DR_MISALIGNMENT (dr) == 0)
+ {
+ npeel_tmp = 0;
+ if (!flag_vect_cost_model)
+ possible_npeel_number++;
+ }
+
+ for (j = 0; j < possible_npeel_number; j++)
+ {
+ gcc_assert (npeel_tmp <= vf);
+ vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
+ npeel_tmp += nelements;
+ }
+
+ all_misalignments_unknown = false;
+ /* The data-ref that was chosen for the case in which all the
+ misalignments are unknown is not relevant anymore, since we
+ have a data-ref with known alignment. */
+ dr0 = NULL;
+ }
+ else
+ {
+ /* If we don't know all the misalignment values, we prefer
+ peeling for the data-ref that has the maximum number of data-refs
+ with the same alignment, unless the target prefers to align
+ stores over loads. */
+ if (all_misalignments_unknown)
+ {
+ if (same_align_drs_max < VEC_length (dr_p,
+ STMT_VINFO_SAME_ALIGN_REFS (stmt_info))
+ || !dr0)
+ {
+ same_align_drs_max = VEC_length (dr_p,
+ STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
+ dr0 = dr;
+ }
+
+ if (!first_store && !DR_IS_READ (dr))
+ first_store = dr;
+ }
+
+ /* If there are both known and unknown misaligned accesses in the
+ loop, we choose the peeling amount according to the known
+ accesses. */
+
+ if (!supportable_dr_alignment)
+ {
+ dr0 = dr;
+ if (!first_store && !DR_IS_READ (dr))
+ first_store = dr;
+ }
+ }
+ }
+ else
+ {
+ if (!aligned_access_p (dr))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vector alignment may not be reachable");
+
+ break;
+ }
+ }
}
vect_versioning_for_alias_required
@@ -1242,24 +1523,112 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
|| !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
do_peeling = false;
+ if (do_peeling && all_misalignments_unknown
+ && vect_supportable_dr_alignment (dr0, false))
+ {
+
+ /* Check if the target prefers to align stores over loads, i.e., if
+ misaligned stores are more expensive than misaligned loads (taking
+ drs with the same alignment into account). */
+ if (first_store && DR_IS_READ (dr0))
+ {
+ unsigned int load_inside_cost = 0, load_outside_cost = 0;
+ unsigned int store_inside_cost = 0, store_outside_cost = 0;
+ unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
+ unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
+
+ vect_get_data_access_cost (dr0, &load_inside_cost,
+ &load_outside_cost);
+ vect_get_data_access_cost (first_store, &store_inside_cost,
+ &store_outside_cost);
+
+ /* Calculate the penalty for leaving FIRST_STORE unaligned (by
+ aligning the load DR0). */
+ load_inside_penalty = store_inside_cost;
+ load_outside_penalty = store_outside_cost;
+ for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
+ (vinfo_for_stmt (DR_STMT (first_store))),
+ i, dr);
+ i++)
+ if (DR_IS_READ (dr))
+ {
+ load_inside_penalty += load_inside_cost;
+ load_outside_penalty += load_outside_cost;
+ }
+ else
+ {
+ load_inside_penalty += store_inside_cost;
+ load_outside_penalty += store_outside_cost;
+ }
+
+ /* Calculate the penalty for leaving DR0 unaligned (by
+ aligning the FIRST_STORE). */
+ store_inside_penalty = load_inside_cost;
+ store_outside_penalty = load_outside_cost;
+ for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
+ (vinfo_for_stmt (DR_STMT (dr0))),
+ i, dr);
+ i++)
+ if (DR_IS_READ (dr))
+ {
+ store_inside_penalty += load_inside_cost;
+ store_outside_penalty += load_outside_cost;
+ }
+ else
+ {
+ store_inside_penalty += store_inside_cost;
+ store_outside_penalty += store_outside_cost;
+ }
+
+ if (load_inside_penalty > store_inside_penalty
+ || (load_inside_penalty == store_inside_penalty
+ && load_outside_penalty > store_outside_penalty))
+ dr0 = first_store;
+ }
+
+ /* In case there are only loads with different unknown misalignments, use
+ peeling only if it may help to align other accesses in the loop. */
+ if (!first_store && !VEC_length (dr_p, STMT_VINFO_SAME_ALIGN_REFS
+ (vinfo_for_stmt (DR_STMT (dr0))))
+ && vect_supportable_dr_alignment (dr0, false)
+ != dr_unaligned_supported)
+ do_peeling = false;
+ }
+
+ if (do_peeling && !dr0)
+ {
+ /* Peeling is possible, but there is no data access that requires
+ alignment in order to be supported. So we try to choose the best
+ possible peeling. */
+
+ /* We should get here only if there are drs with known misalignment. */
+ gcc_assert (!all_misalignments_unknown);
+
+ /* Choose the best peeling from the hash table. */
+ dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel);
+ if (!dr0 || !npeel)
+ do_peeling = false;
+ }
+
if (do_peeling)
{
- int mis;
- int npeel = 0;
- gimple stmt = DR_STMT (dr0);
- stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- int nelements = TYPE_VECTOR_SUBPARTS (vectype);
+ stmt = DR_STMT (dr0);
+ stmt_info = vinfo_for_stmt (stmt);
+ vectype = STMT_VINFO_VECTYPE (stmt_info);
+ nelements = TYPE_VECTOR_SUBPARTS (vectype);
if (known_alignment_for_access_p (dr0))
{
- /* Since it's known at compile time, compute the number of iterations
- in the peeled loop (the peeling factor) for use in updating
- DR_MISALIGNMENT values. The peeling factor is the vectorization
- factor minus the misalignment as an element count. */
- mis = DR_MISALIGNMENT (dr0);
- mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
- npeel = nelements - mis;
+ if (!npeel)
+ {
+ /* Since it's known at compile time, compute the number of
+ iterations in the peeled loop (the peeling factor) for use in
+ updating DR_MISALIGNMENT values. The peeling factor is the
+ vectorization factor minus the misalignment as an element
+ count. */
+ mis = DR_MISALIGNMENT (dr0);
+ mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
+ npeel = nelements - mis;
+ }
/* For interleaved data access every iteration accesses all the
members of the group, therefore we divide the number of iterations
@@ -1290,7 +1659,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
save_misalignment = DR_MISALIGNMENT (dr);
vect_update_misalignment_for_peel (dr, dr0, npeel);
- supportable_dr_alignment = vect_supportable_dr_alignment (dr);
+ supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
SET_DR_MISALIGNMENT (dr, save_misalignment);
if (!supportable_dr_alignment)
@@ -1300,6 +1669,15 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
}
}
+ if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
+ {
+ stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
+ if (!stat)
+ do_peeling = false;
+ else
+ return stat;
+ }
+
if (do_peeling)
{
/* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
@@ -1314,7 +1692,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
vect_update_misalignment_for_peel (dr, dr0, npeel);
LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
- LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
+ if (npeel)
+ LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
+ else
+ LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
SET_DR_MISALIGNMENT (dr0, 0);
if (vect_print_dump_info (REPORT_ALIGNMENT))
fprintf (vect_dump, "Alignment of access forced using peeling.");
@@ -1358,7 +1739,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt))
continue;
- supportable_dr_alignment = vect_supportable_dr_alignment (dr);
+ supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
if (!supportable_dr_alignment)
{
@@ -1467,7 +1848,7 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
return;
- if ((DR_IS_READ (dra) && DR_IS_READ (drb)) || dra == drb)
+ if (dra == drb)
return;
if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
@@ -3558,13 +3939,16 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
return (alignment <= MAX_STACK_ALIGNMENT);
}
-/* Function vect_supportable_dr_alignment
- Return whether the data reference DR is supported with respect to its
+/* Return whether the data reference DR is supported with respect to its
+ alignment.
+ If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
+ if it is aligned, i.e., check if it is possible to vectorize it with different
alignment. */
enum dr_alignment_support
-vect_supportable_dr_alignment (struct data_reference *dr)
+vect_supportable_dr_alignment (struct data_reference *dr,
+ bool check_aligned_accesses)
{
gimple stmt = DR_STMT (dr);
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -3574,7 +3958,7 @@ vect_supportable_dr_alignment (struct data_reference *dr)
struct loop *vect_loop = NULL;
bool nested_in_vect_loop = false;
- if (aligned_access_p (dr))
+ if (aligned_access_p (dr) && !check_aligned_accesses)
return dr_aligned;
if (!loop_vinfo)
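
The hash-table bookkeeping added above reduces to modular arithmetic over
peeling amounts. The following standalone sketch mirrors the logic of
vect_peeling_hash_insert and vect_peeling_hash_get_most_frequent for the
no-cost-model path; the fixed-size array standing in for the hash table and all
the names below are illustrative assumptions, not GCC code:

    #include <stdio.h>

    #define MAX_PEEL 64

    /* count[npeel] = number of accesses that become aligned when the
       prologue peels npeel scalar iterations.  */
    static unsigned count[MAX_PEEL];

    /* Record every peeling amount that aligns an access with misalignment
       MIS (in elements), stepping by NELEMENTS up to VF, as described in
       the comment about multiple vector types in the patch.  */
    static void
    record_candidates (unsigned mis, unsigned nelements, unsigned vf)
    {
      unsigned npeel = (nelements - mis) % vf;
      for (; npeel <= vf; npeel += nelements)
        count[npeel]++;
    }

    int
    main (void)
    {
      unsigned i, best = 0;

      /* Two types in one loop, VF = 8: a 4-element access misaligned by 3
         is aligned by peeling 1 or 5; an 8-element access misaligned by 3
         only by peeling 5.  Peeling 5 aligns both accesses.  */
      record_candidates (3, 4, 8);
      record_candidates (3, 8, 8);

      for (i = 0; i < MAX_PEEL; i++)
        if (count[i] > count[best])
          best = i;
      printf ("best npeel = %u (aligns %u accesses)\n", best, count[best]);
      return 0;
    }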
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index f8922a2..38546cf 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1976,25 +1976,18 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters,
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
tree niters_type = TREE_TYPE (loop_niters);
- int step = 1;
- int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
int nelements = TYPE_VECTOR_SUBPARTS (vectype);
- if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
- step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
-
pe = loop_preheader_edge (loop);
if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
{
- int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
- int elem_misalign = byte_misalign / element_size;
+ int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "known alignment = %d.", byte_misalign);
+ fprintf (vect_dump, "known peeling = %d.", npeel);
- iters = build_int_cst (niters_type,
- (((nelements - elem_misalign) & (nelements - 1)) / step));
+ iters = build_int_cst (niters_type, npeel);
}
else
{
@@ -2017,7 +2010,8 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters,
/* Create: byte_misalign = addr & (vectype_size - 1) */
byte_misalign =
- fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
+ fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
+ vectype_size_minus_1);
/* Create: elem_misalign = byte_misalign / element_size */
elem_misalign =
@@ -2323,7 +2317,8 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
DR_STEP (dr), vect_factor);
- if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
+ if (vect_supportable_dr_alignment (dr, false)
+ == dr_explicit_realign_optimized)
{
tree vector_size = TYPE_SIZE_UNIT
(STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
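
The byte-misalignment arithmetic deleted from vect_gen_niters_for_prolog_loop
and the npeel value now stored directly in LOOP_PEELING_FOR_ALIGNMENT agree for
the simple contiguous case; a quick check of the old formula with assumed
values (4-byte elements, 4-element vectors, unit step):

    #include <stdio.h>

    int
    main (void)
    {
      int byte_misalign = 12, element_size = 4, nelements = 4, step = 1;
      int elem_misalign = byte_misalign / element_size;

      /* Old scheme: derive the prologue iteration count from the byte
         misalignment: (4 - 3) & 3 = 1.  */
      int iters = ((nelements - elem_misalign) & (nelements - 1)) / step;
      printf ("prologue iters = %d\n", iters);   /* 1 */

      /* New scheme: the analysis phase already computed npeel = 1 and
         stored it, so the prologue code just reads it back.  */
      return 0;
    }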
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 3b38716..ef48173 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -755,6 +755,7 @@ new_loop_vec_info (struct loop *loop)
LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
+ LOOP_VINFO_PEELING_HTAB (res) = NULL;
return res;
}
@@ -845,6 +846,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
+ if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
+ htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
+
free (loop_vinfo);
loop->aux = NULL;
}
@@ -1122,7 +1126,11 @@ vect_analyze_loop_form (struct loop *loop)
static inline
int vect_get_cost (enum vect_cost_for_stmt type_of_cost)
{
- return targetm.vectorize.builtin_vectorization_cost (type_of_cost);
+ tree dummy_type = NULL;
+ int dummy = 0;
+
+ return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
+ dummy_type, dummy);
}
@@ -1498,17 +1506,6 @@ vect_analyze_loop (struct loop *loop)
return NULL;
}
- /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
- ok = vect_analyze_slp (loop_vinfo, NULL);
- if (ok)
- {
- /* Decide which possible SLP instances to SLP. */
- vect_make_slp_decision (loop_vinfo);
-
- /* Find stmts that need to be both vectorized and SLPed. */
- vect_detect_hybrid_slp (loop_vinfo);
- }
-
/* This pass will decide on using loop versioning and/or loop peeling in
order to enhance the alignment of data references in the loop. */
@@ -1516,11 +1513,22 @@ vect_analyze_loop (struct loop *loop)
if (!ok)
{
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "bad data alignment.");
+ fprintf (vect_dump, "bad data alignment.");
destroy_loop_vec_info (loop_vinfo, true);
return NULL;
}
+ /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
+ ok = vect_analyze_slp (loop_vinfo, NULL);
+ if (ok)
+ {
+ /* Decide which possible SLP instances to SLP. */
+ vect_make_slp_decision (loop_vinfo);
+
+ /* Find stmts that need to be both vectorized and SLPed. */
+ vect_detect_hybrid_slp (loop_vinfo);
+ }
+
/* Scan all the operations in the loop and make sure they are
vectorizable. */
@@ -2004,6 +2012,94 @@ vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
double_reduc, true);
}
+/* Calculate the cost of one scalar iteration of the loop. */
+int
+vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+ int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
+ int innerloop_iters, i, stmt_cost;
+
+ /* Count the statements in the scalar loop, and use this as the scalar
+ cost of a single iteration for now.
+
+ TODO: Add outer loop support.
+
+ TODO: Consider assigning different costs to different scalar
+ statements. */
+
+ /* FORNOW. */
+ if (loop->inner)
+ innerloop_iters = 50; /* FIXME */
+
+ for (i = 0; i < nbbs; i++)
+ {
+ gimple_stmt_iterator si;
+ basic_block bb = bbs[i];
+
+ if (bb->loop_father == loop->inner)
+ factor = innerloop_iters;
+ else
+ factor = 1;
+
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple stmt = gsi_stmt (si);
+
+ if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
+ continue;
+
+ if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+ {
+ if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
+ stmt_cost = vect_get_cost (scalar_load);
+ else
+ stmt_cost = vect_get_cost (scalar_store);
+ }
+ else
+ stmt_cost = vect_get_cost (scalar_stmt);
+
+ scalar_single_iter_cost += stmt_cost * factor;
+ }
+ }
+ return scalar_single_iter_cost;
+}
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+ int *peel_iters_epilogue,
+ int scalar_single_iter_cost)
+{
+ int peel_guard_costs = 0;
+ int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ *peel_iters_epilogue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "cost model: "
+ "epilogue peel iters set to vf/2 because "
+ "loop iterations are unknown .");
+
+ /* If the number of peeled iterations is known but the number of scalar
+ loop iterations is unknown, count a taken branch per peeled loop. */
+ peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
+ }
+ else
+ {
+ int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+ peel_iters_prologue = niters < peel_iters_prologue ?
+ niters : peel_iters_prologue;
+ *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
+ }
+
+ return (peel_iters_prologue * scalar_single_iter_cost)
+ + (*peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
+}
+
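A worked example with made-up numbers (not from the patch) shows the known-iteration-count branch in action: with vf = 4, niters = 23, peel_iters_prologue = 2 and scalar_single_iter_cost = 5,

    peel_iters_prologue  = MIN (23, 2)   = 2
    *peel_iters_epilogue = (23 - 2) % 4  = 1
    return value         = 2*5 + 1*5 + 0 = 15

and peel_guard_costs stays 0, because no guard branch is needed once both counts are known at compile time.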
/* Function vect_estimate_min_profitable_iters
Return the number of iterations required for the vector version of the
@@ -2028,7 +2124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
- int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
@@ -2099,7 +2195,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
&& (!STMT_VINFO_LIVE_P (stmt_info)
|| STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
continue;
- scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
/* FIXME: for stmts in the inner-loop in outer-loop vectorization,
some of the "outside" costs are generated inside the outer-loop. */
@@ -2107,6 +2202,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
}
}
+ scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
+
/* Add additional cost for the peeled instructions in prologue and epilogue
loop.
@@ -2116,7 +2213,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- if (byte_misalign < 0)
+ if (npeel < 0)
{
peel_iters_prologue = vf/2;
if (vect_print_dump_info (REPORT_COST))
@@ -2137,46 +2234,18 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
not known. Hence guards remain the same. */
peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
+ vect_get_cost (cond_branch_not_taken));
+ vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
+ + (peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
}
else
{
- if (byte_misalign)
- {
- struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
- int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
- tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
- int nelements = TYPE_VECTOR_SUBPARTS (vectype);
-
- peel_iters_prologue = nelements - (byte_misalign / element_size);
- }
- else
- peel_iters_prologue = 0;
-
- if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
- {
- peel_iters_epilogue = vf/2;
- if (vect_print_dump_info (REPORT_COST))
- fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to vf/2 because "
- "loop iterations are unknown .");
-
- /* If peeled iterations are known but number of scalar loop
- iterations are unknown, count a taken branch per peeled loop. */
- peel_guard_costs += 2 * vect_get_cost (cond_branch_taken);
- }
- else
- {
- int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
- peel_iters_prologue = niters < peel_iters_prologue ?
- niters : peel_iters_prologue;
- peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
- }
+ peel_iters_prologue = npeel;
+ vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
+ peel_iters_prologue, &peel_iters_epilogue,
+ scalar_single_iter_cost);
}
- vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
- + (peel_iters_epilogue * scalar_single_iter_cost)
- + peel_guard_costs;
-
/* FORNOW: The scalar outside cost is incremented in one of the
following ways:
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 5f753a2..1ae3a65 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -560,7 +560,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
if (first_load == stmt)
{
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
- if (vect_supportable_dr_alignment (first_dr)
+ if (vect_supportable_dr_alignment (first_dr, false)
== dr_unaligned_unsupported)
{
if (vect_print_dump_info (REPORT_SLP))
@@ -646,7 +646,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
{
VEC_safe_push (slp_tree, heap, *loads, *node);
*inside_cost
- += targetm.vectorize.builtin_vectorization_cost (vec_perm)
+ += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0)
* group_size;
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c95fe7d..89e7c4b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -545,6 +545,18 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
}
+/* Get the statement cost by calling the target hook
+ builtin_vectorization_cost. */
+
+static inline
+int vect_get_stmt_cost (enum vect_cost_for_stmt type_of_cost)
+{
+ tree dummy_type = NULL;
+ int dummy = 0;
+
+ return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
+ dummy_type, dummy);
+}
+
int
cost_for_stmt (gimple stmt)
{
@@ -553,9 +565,9 @@ cost_for_stmt (gimple stmt)
switch (STMT_VINFO_TYPE (stmt_info))
{
case load_vec_info_type:
- return targetm.vectorize.builtin_vectorization_cost (scalar_load);
+ return vect_get_stmt_cost (scalar_load);
case store_vec_info_type:
- return targetm.vectorize.builtin_vectorization_cost (scalar_store);
+ return vect_get_stmt_cost (scalar_store);
case op_vec_info_type:
case condition_vec_info_type:
case assignment_vec_info_type:
@@ -565,7 +577,7 @@ cost_for_stmt (gimple stmt)
case type_demotion_vec_info_type:
case type_conversion_vec_info_type:
case call_vec_info_type:
- return targetm.vectorize.builtin_vectorization_cost (scalar_stmt);
+ return vect_get_stmt_cost (scalar_stmt);
case undef_vec_info_type:
default:
gcc_unreachable ();
@@ -589,15 +601,13 @@ vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
if (PURE_SLP_STMT (stmt_info))
return;
- inside_cost = ncopies
- * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ inside_cost = ncopies * vect_get_stmt_cost (vector_stmt);
/* FORNOW: Assuming maximum 2 args per stmts. */
for (i = 0; i < 2; i++)
{
if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
- outside_cost
- += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ outside_cost += vect_get_stmt_cost (vector_stmt);
}
if (vect_print_dump_info (REPORT_COST))
@@ -638,22 +648,39 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
enum vect_def_type dt, slp_tree slp_node)
{
int group_size;
- int inside_cost = 0, outside_cost = 0;
+ unsigned int inside_cost = 0, outside_cost = 0;
+ struct data_reference *first_dr;
+ gimple first_stmt;
/* The SLP costs were already calculated during SLP tree build. */
if (PURE_SLP_STMT (stmt_info))
return;
if (dt == vect_constant_def || dt == vect_external_def)
- outside_cost
- = targetm.vectorize.builtin_vectorization_cost (scalar_to_vec);
+ outside_cost = vect_get_stmt_cost (scalar_to_vec);
/* Strided access? */
- if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
- group_size = vect_cost_strided_group_size (stmt_info);
+ if (DR_GROUP_FIRST_DR (stmt_info))
+ {
+ if (slp_node)
+ {
+ first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+ group_size = 1;
+ }
+ else
+ {
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
+ group_size = vect_cost_strided_group_size (stmt_info);
+ }
+
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+ }
/* Not a strided access. */
else
- group_size = 1;
+ {
+ group_size = 1;
+ first_dr = STMT_VINFO_DATA_REF (stmt_info);
+ }
/* Is this an access in a group of stores, which provide strided access?
If so, add in the cost of the permutes. */
@@ -661,7 +688,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
{
/* Uses a high and low interleave operation for each needed permute. */
inside_cost = ncopies * exact_log2(group_size) * group_size
- * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ * vect_get_stmt_cost (vector_stmt);
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
@@ -670,8 +697,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
}
/* Costs of the stores. */
- inside_cost += ncopies
- * targetm.vectorize.builtin_vectorization_cost (vector_store);
+ vect_get_store_cost (first_dr, ncopies, &inside_cost);
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
@@ -683,6 +709,49 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
}
+/* Calculate the cost of the memory access of a vector store of DR. */
+void
+vect_get_store_cost (struct data_reference *dr, int ncopies,
+ unsigned int *inside_cost)
+{
+ int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+
+ switch (alignment_support_scheme)
+ {
+ case dr_aligned:
+ {
+ *inside_cost += ncopies * vect_get_stmt_cost (vector_store);
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "vect_model_store_cost: aligned.");
+
+ break;
+ }
+
+ case dr_unaligned_supported:
+ {
+ gimple stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+ /* Here, we assign an additional cost for the unaligned store. */
+ *inside_cost += ncopies
+ * targetm.vectorize.builtin_vectorization_cost (unaligned_store,
+ vectype, DR_MISALIGNMENT (dr));
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "vect_model_store_cost: unaligned supported by "
+ "hardware.");
+
+ break;
+ }
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+
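The unaligned_store case above is where the hook's two new arguments pay off: a target can now price an unaligned access according to the vector type and to whether the misalignment is known at compile time. The sketch below is a hypothetical hook, not the i386/rs6000 implementation from this patch, and its cost constants are illustrative only:

    /* Hypothetical target cost hook; values are illustrative only.  */
    static int
    example_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                        tree vectype ATTRIBUTE_UNUSED,
                                        int misalign)
    {
      switch (type_of_cost)
        {
        case unaligned_load:
        case unaligned_store:
          /* DR_MISALIGNMENT passes -1 when the misalignment is unknown
             at compile time; charge more in that case.  */
          return misalign < 0 ? 4 : 2;

        default:
          /* Keep the default unit cost for everything else.  */
          return 1;
        }
    }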
/* Function vect_model_load_cost
Models cost for loads. In the case of strided accesses, the last access
@@ -695,10 +764,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
int group_size;
- int alignment_support_cheme;
gimple first_stmt;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
- int inside_cost = 0, outside_cost = 0;
+ unsigned int inside_cost = 0, outside_cost = 0;
/* The SLP costs were already calculated during SLP tree build. */
if (PURE_SLP_STMT (stmt_info))
@@ -718,29 +786,47 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
first_dr = dr;
}
- alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
-
/* Is this an access in a group of loads providing strided access?
If so, add in the cost of the permutes. */
if (group_size > 1)
{
/* Uses an even and odd extract operations for each needed permute. */
inside_cost = ncopies * exact_log2(group_size) * group_size
- * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ * vect_get_stmt_cost (vector_stmt);
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
group_size);
-
}
/* The loads themselves. */
- switch (alignment_support_cheme)
+ vect_get_load_cost (first_dr, ncopies,
+ ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node),
+ &inside_cost, &outside_cost);
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
+ "outside_cost = %d .", inside_cost, outside_cost);
+
+ /* Set the costs either in STMT_INFO or in SLP_NODE (if it exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
+}
+
+
+/* Calculate the cost of the memory access of a vector load of DR. */
+void
+vect_get_load_cost (struct data_reference *dr, int ncopies,
+ bool add_realign_cost, unsigned int *inside_cost,
+ unsigned int *outside_cost)
+{
+ int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+
+ switch (alignment_support_scheme)
{
case dr_aligned:
{
- inside_cost += ncopies
- * targetm.vectorize.builtin_vectorization_cost (vector_load);
+ inside_cost += ncopies * vect_get_stmt_cost (vector_load);
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: aligned.");
@@ -749,10 +835,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
}
case dr_unaligned_supported:
{
- /* Here, we assign an additional cost for the unaligned load. */
- inside_cost += ncopies
- * targetm.vectorize.builtin_vectorization_cost (unaligned_load);
+ gimple stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ /* Here, we assign an additional cost for the unaligned load. */
+ *inside_cost += ncopies
+ * targetm.vectorize.builtin_vectorization_cost (unaligned_load,
+ vectype, DR_MISALIGNMENT (dr));
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
"hardware.");
@@ -761,16 +851,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
}
case dr_explicit_realign:
{
- inside_cost += ncopies * (2
- * targetm.vectorize.builtin_vectorization_cost (vector_load)
- + targetm.vectorize.builtin_vectorization_cost (vector_stmt));
+ *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
+ + vect_get_stmt_cost (vector_stmt));
/* FIXME: If the misalignment remains fixed across the iterations of
the containing loop, the following cost should be added to the
outside costs. */
if (targetm.vectorize.builtin_mask_for_load)
- inside_cost
- += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ *inside_cost += vect_get_stmt_cost (vector_stmt);
break;
}
@@ -787,32 +875,21 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
access in the group. Inside the loop, there is a load op
and a realignment op. */
- if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
+ if (add_realign_cost)
{
- outside_cost = 2
- * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ *outside_cost = 2 * vect_get_stmt_cost (vector_stmt);
if (targetm.vectorize.builtin_mask_for_load)
- outside_cost
- += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+ *outside_cost += vect_get_stmt_cost (vector_stmt);
}
- inside_cost += ncopies
- * (targetm.vectorize.builtin_vectorization_cost (vector_load)
- + targetm.vectorize.builtin_vectorization_cost (vector_stmt));
+ *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
+ + vect_get_stmt_cost (vector_stmt));
break;
}
default:
gcc_unreachable ();
}
-
- if (vect_print_dump_info (REPORT_COST))
- fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
- "outside_cost = %d .", inside_cost, outside_cost);
-
- /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
- stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
- stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
@@ -3142,7 +3219,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
dr_chain = VEC_alloc (tree, heap, group_size);
oprnds = VEC_alloc (tree, heap, group_size);
- alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+ alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false);
gcc_assert (alignment_support_scheme);
/* In case the vectorization factor (VF) is bigger than the number
@@ -3507,7 +3584,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
group_size = vec_num = 1;
}
- alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+ alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false);
gcc_assert (alignment_support_scheme);
/* In case the vectorization factor (VF) is bigger than the number
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bf6769c..ed8ff58 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -170,6 +170,21 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
#define SLP_TREE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define SLP_TREE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+
+typedef struct _vect_peel_info
+{
+ int npeel;
+ struct data_reference *dr;
+ unsigned int count;
+} *vect_peel_info;
+
+typedef struct _vect_peel_extended_info
+{
+ struct _vect_peel_info peel_info;
+ unsigned int inside_cost;
+ unsigned int outside_cost;
+} *vect_peel_extended_info;
+
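These records back the new peeling hash table: one slot per candidate peeling amount NPEEL, with COUNT recording how many data references become aligned when NPEEL iterations are peeled, and the extended variant carrying the inside/outside costs of that choice. A condensed, hypothetical sketch of the lookup-and-count pattern follows (the variables npeel, dr and loop_vinfo are assumed to be in scope; the committed logic lives in tree-vect-data-refs.c):

    struct _vect_peel_info elem, *slot;
    void **loc;

    elem.npeel = npeel;
    loc = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), &elem, INSERT);
    slot = (vect_peel_info) *loc;
    if (slot)
      slot->count++;                /* Another DR aligned by this NPEEL.  */
    else
      {
        /* First DR for this peeling amount: record it.  */
        slot = XNEW (struct _vect_peel_info);
        slot->npeel = npeel;
        slot->dr = dr;
        slot->count = 1;
        *loc = slot;
      }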
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
@@ -245,6 +260,10 @@ typedef struct _loop_vec_info {
/* Reduction cycles detected in the loop. Used in loop-aware SLP. */
VEC (gimple, heap) *reductions;
+
+ /* Hash table used to choose the best peeling option. */
+ htab_t peeling_htab;
+
} *loop_vec_info;
/* Access Functions. */
@@ -270,6 +289,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
+#define LOOP_VINFO_PEELING_HTAB(L) (L)->peeling_htab
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
@@ -543,6 +563,8 @@ typedef struct _stmt_vec_info {
#define PURE_SLP_STMT(S) ((S)->slp_type == pure_slp)
#define STMT_SLP_TYPE(S) (S)->slp_type
+#define VECT_MAX_COST 1000
+
/* The maximum number of intermediate steps required in multi-step type
conversion. */
#define MAX_INTERM_CVT_STEPS 3
@@ -743,11 +765,14 @@ extern void vect_remove_stores (gimple);
extern bool vect_analyze_stmt (gimple, bool *, slp_tree);
extern bool vectorizable_condition (gimple, gimple_stmt_iterator *, gimple *,
tree, int);
+extern void vect_get_load_cost (struct data_reference *, int, bool,
+ unsigned int *, unsigned int *);
+extern void vect_get_store_cost (struct data_reference *, int, unsigned int *);
/* In tree-vect-data-refs.c. */
extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
extern enum dr_alignment_support vect_supportable_dr_alignment
- (struct data_reference *);
+ (struct data_reference *, bool);
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
HOST_WIDE_INT *);
extern bool vect_analyze_data_ref_dependences (loop_vec_info, bb_vec_info,
@@ -795,7 +820,8 @@ extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
extern int vect_min_worthwhile_factor (enum tree_code);
-
+extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, int);
+extern int vect_get_single_scalar_iteraion_cost (loop_vec_info);
/* In tree-vect-slp.c. */
extern void vect_free_slp_instance (slp_instance);