diff options
author | liuhongt <hongtao.liu@intel.com> | 2021-03-26 10:56:47 +0800 |
---|---|---|
committer | liuhongt <hongtao.liu@intel.com> | 2021-07-28 10:48:39 +0800 |
commit | 872da9a6f664a06d73c987aa0cb2e5b830158a10 (patch) | |
tree | 05c56597a356b312b6f5e5396d96916db40ba97a /gcc | |
parent | af3f12e6e869adcbb1cec09cedba627d4bbf69a4 (diff) | |
download | gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.zip gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.gz gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.bz2 |
Add the member integer_to_sse to processor_costs to model the cost of moving an integer register into an SSE register (movd/pinsrd). It will be used to calculate the cost of vec_construct.
gcc/ChangeLog:
PR target/99881
* config/i386/i386.h (processor_costs): Add new member
integer_to_sse.
* config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
bdver_cost, znver1_cost, znver2_cost, znver3_cost,
btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
nocona_cost, atom_cost, slm_cost, intel_cost,
generic_cost, core_cost): Initialize integer_to_sse to the same value
as sse_op.
(skylake_cost): Initialize integer_to_sse to roughly twice the value of sse_op (COSTS_N_INSNS (2)+1).
* config/i386/i386.c (ix86_builtin_vectorization_cost):
Use integer_to_sse instead of sse_op to calculate the cost of
vec_construct when fp is false; the fp case still uses sse_op.
gcc/testsuite/ChangeLog:
PR target/99881
* gcc.target/i386/pr99881.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386.c | 6 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 1 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune-costs.h | 26 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr99881.c | 49 |
4 files changed, 81 insertions, 1 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 876a19f..ac59ebf 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { /* N element inserts into SSE vectors. */ - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; + int cost + = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? + ix86_cost->sse_op + : ix86_cost->integer_to_sse); + /* One vinserti128 for combining two SSE vectors for AVX256. */ if (GET_MODE_BITSIZE (mode) == 256) cost += ix86_vec_cost (mode, ix86_cost->addss); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 0c2c93d..d1e1c22 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -165,6 +165,7 @@ struct processor_costs { const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ zmm_move; const int sse_to_integer; /* cost of moving SSE register to integer. */ + const int integer_to_sse; /* cost of moving integer to SSE register. */ const int gather_static, gather_per_elt; /* Cost of gather load is computed as static + per_item * nelts. */ const int scatter_static, scatter_per_elt; /* Cost of gather store is diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index ffe810f..67cfa00 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in 128bit, 256bit and 512bit */ 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_BYTES (2), /* cost of moving integer to sse register. */ 5, 0, /* Gather load static, per_elt. */ 5, 0, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -211,6 +212,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -319,6 +321,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 4, /* size of l1 cache. 486 has 8kB cache @@ -429,6 +432,7 @@ struct processor_costs pentium_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -753,6 +759,7 @@ struct processor_costs geode_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -860,6 +867,7 @@ struct processor_costs k6_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -973,6 +981,7 @@ struct processor_costs athlon_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = { {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = { {10, 10, 10, 40, 60}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 16, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 16, /* size of l1 cache. */ @@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = { {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, throughput 9. Approx 7 uops do not depend on vector size and every load is 4 uops. */ @@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2)+1, /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. 
*/ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = { {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 16, 16, /* Gather load static, per_elt. */ 16, 16, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = { {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (2), /* cost of moving integer to sse register. */ 12, 12, /* Gather load static, per_elt. */ 12, 12, /* Gather store static, per_elt. */ 8, /* size of l1 cache. 
*/ @@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = { {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 4, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 6, 6, /* Gather load static, per_elt. */ 6, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = { {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3049,6 +3074,7 @@ struct processor_costs core_cost = { {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2, /* cost of moving SSE register to integer. */ + COSTS_N_INSNS (1), /* cost of moving integer to sse register. */ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, rec. throughput 6. So 5 uops statically and one uops per load. 
*/ diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c new file mode 100644 index 0000000..7ae51c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99881.c @@ -0,0 +1,49 @@ +/* PR target/99881. */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=skylake" } */ +/* { dg-final { scan-assembler-not "xmm[0-9]" } } */ + +void +foo (int* __restrict a, int n, int c) +{ + a[0] = n; + a[1] = c; +} + +void +foo1 (int* __restrict a, int n, int b, int c, int d) +{ + a[0] = n; + a[1] = b; + a[2] = c; + a[3] = d; +} + +void +foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h) +{ + a[0] = n; + a[1] = b; + a[2] = c; + a[3] = d; + a[4] = e; + a[5] = f; + a[6] = g; + a[7] = h; +} + +void +foo3 (long long* __restrict a, long long n, long long c) +{ + a[0] = n; + a[1] = c; +} + +void +foo4 (long long* __restrict a, long long n, long long b, long long c, long long d) +{ + a[0] = n; + a[1] = b; + a[2] = c; + a[3] = d; +} |