author     Richard Biener <rguenther@suse.de>    2018-02-16 13:47:25 +0000
committer  Richard Biener <rguenth@gcc.gnu.org>  2018-02-16 13:47:25 +0000
commit     be77ba2a461eefdf4a2676b19025f36ec092c598
tree       7a02106de91bc43d8e50c5adb8b842c3ba9b3fc1
parent     fe74f9b4ed572a3bb2f8a110ea178578df8e0563
re PR tree-optimization/84037 (Speed regression of polyhedron benchmark since r256644)
2018-02-16  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/84037
        PR tree-optimization/84016
        PR target/82862
        * config/i386/i386.c (ix86_builtin_vectorization_cost):
        Adjust vec_construct for the fact we need additional higher latency
        128bit inserts for AVX256 and AVX512 vector builds.
        (ix86_add_stmt_cost): Scale vector construction cost for
        elementwise loads.

From-SVN: r257734
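The shape of the new vec_construct cost can be sketched outside of GCC. The
following is an illustrative reimplementation, not the patched code itself:
nelems, vector_bits, sse_op_cost and addss_cost are hypothetical stand-ins
for what ix86_vec_cost derives from the machine mode and the active
processor cost tables.

/* Sketch of the adjusted vec_construct cost under the assumptions named
   above; GCC itself computes these values via ix86_vec_cost and the
   ix86_cost tuning tables.  */
static int
vec_construct_cost_sketch (int nelems, int vector_bits,
                           int sse_op_cost, int addss_cost)
{
  /* N element inserts build the 128-bit pieces.  */
  int cost = nelems * sse_op_cost;
  /* One higher-latency vinserti128 combines two SSE vectors into an
     AVX256 vector.  */
  if (vector_bits == 256)
    cost += addss_cost;
  /* One vinserti64x4 plus two vinserti128 combine four SSE vectors
     into an AVX512 vector.  */
  else if (vector_bits == 512)
    cost += 3 * addss_cost;
  return cost;
}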
 gcc/ChangeLog          | 11 +++++++++++
 gcc/config/i386/i386.c | 25 ++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c43637a..01ca398 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,16 @@
 2018-02-16  Richard Biener  <rguenther@suse.de>
 
+        PR tree-optimization/84037
+        PR tree-optimization/84016
+        PR target/82862
+        * config/i386/i386.c (ix86_builtin_vectorization_cost):
+        Adjust vec_construct for the fact we need additional higher latency
+        128bit inserts for AVX256 and AVX512 vector builds.
+        (ix86_add_stmt_cost): Scale vector construction cost for
+        elementwise loads.
+
+2018-02-16  Richard Biener  <rguenther@suse.de>
+
         PR tree-optimization/84417
         * tree-ssa.c (non_rewritable_mem_ref_base): Properly constrain
         the MEM_REF offset when conversion to BIT_FIELD_REF is desired.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2e82842..4a968a7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -45906,7 +45906,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                               ix86_cost->sse_op, true);
 
       case vec_construct:
-        return ix86_vec_cost (mode, ix86_cost->sse_op, false);
+        {
+          /* N element inserts.  */
+          int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
+          /* One vinserti128 for combining two SSE vectors for AVX256.  */
+          if (GET_MODE_BITSIZE (mode) == 256)
+            cost += ix86_vec_cost (mode, ix86_cost->addss, true);
+          /* One vinserti64x4 and two vinserti128 for combining SSE
+             and AVX256 vectors to AVX512.  */
+          else if (GET_MODE_BITSIZE (mode) == 512)
+            cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
+          return cost;
+        }
 
       default:
         gcc_unreachable ();
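To make the first hunk concrete, here is the sketch from above evaluated
with made-up unit costs (sse_op = 1, addss = 4; the real numbers depend on
the selected tuning), placed inside any function:

  /* Illustrative unit costs only; not taken from any cost table.  */
  int v4sf_cost  = vec_construct_cost_sketch (4, 128, 1, 4);   /* 4 inserts -> 4  */
  int v8sf_cost  = vec_construct_cost_sketch (8, 256, 1, 4);   /* 8 + 4     -> 12 */
  int v16sf_cost = vec_construct_cost_sketch (16, 512, 1, 4);  /* 16 + 12   -> 28 */

Before this change all three builds were charged only the per-element
inserts (4, 8 and 16 here), so wide vector construction looked cheaper
than it really is.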
@@ -50245,6 +50256,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
             break;
           }
       }
+  /* If we do elementwise loads into a vector then we are bound by
+     latency and execution resources for the many scalar loads
+     (AGU and load ports).  Try to account for this by scaling the
+     construction cost by the number of elements involved.  */
+  if (kind == vec_construct
+      && stmt_info
+      && stmt_info->type == load_vec_info_type
+      && stmt_info->memory_access_type == VMAT_ELEMENTWISE)
+    {
+      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+      stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
+    }
 
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
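The second hunk additionally scales the construction cost by the element
count when the vector is assembled from per-element scalar loads
(VMAT_ELEMENTWISE). A hedged sketch of that scaling, reusing the
hypothetical helper above:

/* Sketch of the elementwise-load scaling: the many scalar loads are
   bound by AGU and load-port throughput, which the cost model
   approximates by multiplying the construct cost by the element count
   (mirroring stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype)).  */
static int
elementwise_construct_cost_sketch (int nelems, int vector_bits,
                                   int sse_op_cost, int addss_cost)
{
  int cost = vec_construct_cost_sketch (nelems, vector_bits,
                                        sse_op_cost, addss_cost);
  return cost * nelems;
}

With the same made-up units, a V8SF built from eight scalar loads is now
charged 12 * 8 = 96 instead of 12, which steers the vectorizer away from
the elementwise loads behind the polyhedron regression in PR 84037.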