diff options
author | Richard Biener <rguenther@suse.de> | 2018-02-16 13:47:25 +0000 |
---|---|---|
committer | Richard Biener <rguenth@gcc.gnu.org> | 2018-02-16 13:47:25 +0000 |
commit | be77ba2a461eefdf4a2676b19025f36ec092c598 (patch) | |
tree | 7a02106de91bc43d8e50c5adb8b842c3ba9b3fc1 | |
parent | fe74f9b4ed572a3bb2f8a110ea178578df8e0563 (diff) | |
download | gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.zip gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.tar.gz gcc-be77ba2a461eefdf4a2676b19025f36ec092c598.tar.bz2 |
re PR tree-optimization/84037 (Speed regression of polyhedron benchmark since r256644)
2018-02-16 Richard Biener <rguenther@suse.de>
PR tree-optimization/84037
PR tree-optimization/84016
PR target/82862
* config/i386/i386.c (ix86_builtin_vectorization_cost):
Adjust vec_construct for the fact we need additional higher latency
128bit inserts for AVX256 and AVX512 vector builds.
(ix86_add_stmt_cost): Scale vector construction cost for
elementwise loads.
From-SVN: r257734
-rw-r--r-- | gcc/ChangeLog | 11 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 25 |
2 files changed, 35 insertions, 1 deletion
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c43637a..01ca398 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,16 @@ 2018-02-16 Richard Biener <rguenther@suse.de> + PR tree-optimization/84037 + PR tree-optimization/84016 + PR target/82862 + * config/i386/i386.c (ix86_builtin_vectorization_cost): + Adjust vec_construct for the fact we need additional higher latency + 128bit inserts for AVX256 and AVX512 vector builds. + (ix86_add_stmt_cost): Scale vector construction cost for + elementwise loads. + +2018-02-16 Richard Biener <rguenther@suse.de> + PR tree-optimization/84417 * tree-ssa.c (non_rewritable_mem_ref_base): Properly constrain the MEM_REF offset when conversion to BIT_FIELD_REF is desired. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2e82842..4a968a7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -45906,7 +45906,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, ix86_cost->sse_op, true); case vec_construct: - return ix86_vec_cost (mode, ix86_cost->sse_op, false); + { + /* N element inserts. */ + int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false); + /* One vinserti128 for combining two SSE vectors for AVX256. */ + if (GET_MODE_BITSIZE (mode) == 256) + cost += ix86_vec_cost (mode, ix86_cost->addss, true); + /* One vinserti64x4 and two vinserti128 for combining SSE + and AVX256 vectors to AVX512. */ + else if (GET_MODE_BITSIZE (mode) == 512) + cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true); + return cost; + } default: gcc_unreachable (); @@ -50245,6 +50256,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, break; } } + /* If we do elementwise loads into a vector then we are bound by + latency and execution resources for the many scalar loads + (AGU and load ports). Try to account for this by scaling the + construction cost by the number of elements involved. */ + if (kind == vec_construct + && stmt_info + && stmt_info->type == load_vec_info_type + && stmt_info->memory_access_type == VMAT_ELEMENTWISE) + { + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype); + } if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); |