aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorJan Hubicka <hubicka@ucw.cz>2017-10-25 21:11:41 +0200
committerJan Hubicka <hubicka@gcc.gnu.org>2017-10-25 19:11:41 +0000
commita4fe6139ab2e39d8b264befaf38f748e5c88d76a (patch)
tree1fdc34bd4b0df06c9b3eb2b74007a1d33a27f5b2 /gcc
parentd185db140ad68f80d134f2f7f254e4ace7f89f24 (diff)
downloadgcc-a4fe6139ab2e39d8b264befaf38f748e5c88d76a.zip
gcc-a4fe6139ab2e39d8b264befaf38f748e5c88d76a.tar.gz
gcc-a4fe6139ab2e39d8b264befaf38f748e5c88d76a.tar.bz2
i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather cost correctly.
* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather cost correctly. * i386.h (processor_costs): Add gather_static, gather_per_elt, scatter_static, scatter_per_elt. * x86-tune-costs.h: Add new cost entries. From-SVN: r254083
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog8
-rw-r--r--gcc/config/i386/i386.c18
-rw-r--r--gcc/config/i386/i386.h4
-rw-r--r--gcc/config/i386/x86-tune-costs.h56
4 files changed, 84 insertions, 2 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c5dfcb7..5985d9e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-10-23 Jan Hubicka <hubicka@ucw.cz>
+
+ * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
+ cost correctly.
+ * i386.h (processor_costs): Add gather_static, gather_per_elt,
+ scatter_static, scatter_per_elt.
+ * x86-tune-costs.h: Add new cost entries.
+
2017-10-25 Richard Biener <rguenther@suse.de>
* tree-ssa-sccvn.h (vn_eliminate): Declare.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 367cade..56486e0 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
/* We should have separate costs for unaligned loads and gather/scatter.
Do that incrementally. */
case unaligned_load:
- case vector_gather_load:
index = sse_store_index (mode);
return ix86_vec_cost (mode,
COSTS_N_INSNS
@@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
true);
case unaligned_store:
- case vector_scatter_store:
index = sse_store_index (mode);
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->sse_unaligned_store[index]) / 2,
true);
+ case vector_gather_load:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->gather_static
+ + ix86_cost->gather_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
+
+ case vector_scatter_store:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->scatter_static
+ + ix86_cost->scatter_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
+
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 27fc9f0..837906b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -253,6 +253,10 @@ struct processor_costs {
const int mmxsse_to_integer; /* cost of moving mmxsse register to
integer. */
const int ssemmx_to_integer; /* cost of moving integer to mmxsse register. */
+ const int gather_static, gather_per_elt; /* Cost of gather load is computed
+ as static + per_item * nelts. */
+ const int scatter_static, scatter_per_elt; /* Cost of scatter store is
+ computed as static + per_item * nelts. */
const int l1_cache_size; /* size of l1 cache, in kilobytes. */
const int l2_cache_size; /* size of l2 cache, in kilobytes. */
const int prefetch_block; /* bytes moved to cache for prefetch. */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e31d7ce..c7ac70e 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
{3, 3, 3, 3, 3}, /* cost of unaligned SSE store
in 128bit, 256bit and 512bit */
3, 3, /* SSE->integer and integer->SSE moves */
+ 5, 0, /* Gather load static, per_elt. */
+ 5, 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
@@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
@@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
shared for code and data, so 4kB is
not really precise. */
@@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
@@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
@@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost = {
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
3, 3, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
@@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
@@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves */
+ 2, 2, /* Gather load static, per_elt. */
+ 2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
32, /* size of l2 cache. Some models
have integrated l2 cache, but
@@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
5, 5, /* SSE->integer and integer->SSE moves */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
+ 4, 4, /* Gather load static, per_elt. */
+ 4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 20, 30}, /* cost of unaligned stores. */
16, 20, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE moves. */
+ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
+ throughput 12. Approx 9 uops do not depend on vector size and every load
+ is 7 uops. */
+ 18, 8, /* Gather load static, per_elt. */
+ 18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
@@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 12, 24, 48}, /* cost of unaligned stores. */
14, 14, /* SSE->integer and integer->SSE moves */
+ 10, 10, /* Gather load static, per_elt. */
+ 10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
in 32,64,128,256 and 512-bit */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 16, 16, /* Gather load static, per_elt. */
+ 16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
in 32,64,128,256 and 512-bit */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
20, 12, /* SSE->integer and integer->SSE moves */
+ 12, 12, /* Gather load static, per_elt. */
+ 12, 12, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
1024, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
8, 6, /* SSE->integer and integer->SSE moves */
+ 8, 8, /* Gather load static, per_elt. */
+ 8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
4, 4, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
in 32,64,128,256 and 512-bit */
{10, 10, 10, 15, 20}, /* cost of unaligned stores. */
20, 20, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* Gather load static, per_elt. */
+ 6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
@@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
in 32,64,128,256 and 512-bit */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2, 2, /* SSE->integer and integer->SSE moves */
+ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPS is 9 uops,
+ rec. throughput 6.
+ So 5 uops statically and one uop per load. */
+ 10, 6, /* Gather load static, per_elt. */
+ 10, 6, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */