diff options
author | Richard Biener <rguenther@suse.de> | 2018-02-16 13:47:25 +0000 |
---|---|---|
committer | Richard Biener <rguenth@gcc.gnu.org> | 2018-02-16 13:47:25 +0000 |
commit | be77ba2a461eefdf4a2676b19025f36ec092c598 (patch) | |
tree | 7a02106de91bc43d8e50c5adb8b842c3ba9b3fc1 | |
parent | fe74f9b4ed572a3bb2f8a110ea178578df8e0563 (diff) | |
download | gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.zip gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.tar.gz gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.tar.bz2 |
re PR tree-optimization/84037 (Speed regression of polyhedron benchmark since r256644)
2018-02-16 Richard Biener <rguenther@suse.de>
PR tree-optimization/84037
PR tree-optimization/84016
PR target/82862
* config/i386/i386.c (ix86_builtin_vectorization_cost):
Adjust vec_construct for the fact we need additional higher latency
128bit inserts for AVX256 and AVX512 vector builds.
(ix86_add_stmt_cost): Scale vector construction cost for
elementwise loads.
From-SVN: r257734
-rw-r--r-- | gcc/ChangeLog | 11 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 25 |
2 files changed, 35 insertions, 1 deletion
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c43637a..01ca398 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,16 @@ 2018-02-16 Richard Biener <rguenther@suse.de> + PR tree-optimization/84037 + PR tree-optimization/84016 + PR target/82862 + * config/i386/i386.c (ix86_builtin_vectorization_cost): + Adjust vec_construct for the fact we need additional higher latency + 128bit inserts for AVX256 and AVX512 vector builds. + (ix86_add_stmt_cost): Scale vector construction cost for + elementwise loads. + +2018-02-16 Richard Biener <rguenther@suse.de> + PR tree-optimization/84417 * tree-ssa.c (non_rewritable_mem_ref_base): Properly constrain the MEM_REF offset when conversion to BIT_FIELD_REF is desired. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2e82842..4a968a7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -45906,7 +45906,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, ix86_cost->sse_op, true); case vec_construct: - return ix86_vec_cost (mode, ix86_cost->sse_op, false); + { + /* N element inserts. */ + int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false); + /* One vinserti128 for combining two SSE vectors for AVX256. */ + if (GET_MODE_BITSIZE (mode) == 256) + cost += ix86_vec_cost (mode, ix86_cost->addss, true); + /* One vinserti64x4 and two vinserti128 for combining SSE + and AVX256 vectors to AVX512. */ + else if (GET_MODE_BITSIZE (mode) == 512) + cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true); + return cost; + } default: gcc_unreachable (); @@ -50245,6 +50256,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, break; } } + /* If we do elementwise loads into a vector then we are bound by + latency and execution resources for the many scalar loads + (AGU and load ports). Try to account for this by scaling the + construction cost by the number of elements involved. */ + if (kind == vec_construct + && stmt_info + && stmt_info->type == load_vec_info_type + && stmt_info->memory_access_type == VMAT_ELEMENTWISE) + { + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype); + } if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); |