Add a new member, integer_to_sse, to struct processor_costs to model the cost of moving an integer register into an SSE register (movd/pinsrd). It is used to calculate the cost of vec_construct.

gcc/ChangeLog: PR target/99881 * config/i386/i386.h (processor_costs): Add new member integer_to_sse. * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost, i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, bdver_cost, znver1_cost, znver2_cost, znver3_cost, btver1_cost, btver2_cost, btver3_cost, pentium4_cost, nocona_cost, atom_cost, slm_cost, intel_cost, generic_cost, core_cost): Initialize integer_to_sse to the same value as sse_op. (skylake_cost): Initialize integer_to_sse to twice as much as sse_op. * config/i386/i386.c (ix86_builtin_vectorization_cost): Use integer_to_sse instead of sse_op to calculate the cost of vec_construct. gcc/testsuite/ChangeLog: PR target/99881 * gcc.target/i386/pr99881.c: New test.
author: liuhongt <hongtao.liu@intel.com> 2021-03-26 10:56:47 +0800
committer: liuhongt <hongtao.liu@intel.com> 2021-07-28 10:48:39 +0800
commit: 872da9a6f664a06d73c987aa0cb2e5b830158a10 (patch)
tree: 05c56597a356b312b6f5e5396d96916db40ba97a /gcc
parent: af3f12e6e869adcbb1cec09cedba627d4bbf69a4 (diff)
download: gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.zip
gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.gz
gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.bz2
4 files changed, 81 insertions, 1 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 876a19f..ac59ebf 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
       case vec_construct:
 	{
 	  /* N element inserts into SSE vectors.  */
-	  int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
+	  int cost
+	    = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
+						ix86_cost->sse_op
+						: ix86_cost->integer_to_sse);
+
 	  /* One vinserti128 for combining two SSE vectors for AVX256.  */
 	  if (GET_MODE_BITSIZE (mode) == 256)
 	    cost += ix86_vec_cost (mode, ix86_cost->addss);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0c2c93d..d1e1c22 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -165,6 +165,7 @@ struct processor_costs {
   const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
 	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
+  const int integer_to_sse;	/* cost of moving integer to SSE register.  */
   const int gather_static, gather_per_elt; /* Cost of gather load is computed
 				   as static + per_item * nelts. */
   const int scatter_static, scatter_per_elt; /* Cost of gather store is
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f..67cfa00 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
 					   in 128bit, 256bit and 512bit */
   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_BYTES (2),			/* cost of moving integer to sse register.  */
   5, 0,					/* Gather load static, per_elt.  */
   5, 0,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -211,6 +212,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -319,6 +321,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   4,					/* size of l1 cache.  486 has 8kB cache
@@ -429,6 +432,7 @@ struct processor_costs pentium_cost = {
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -530,6 +534,7 @@ struct processor_costs lakemont_cost = {
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -646,6 +651,7 @@ struct processor_costs pentiumpro_cost = {
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -753,6 +759,7 @@ struct processor_costs geode_cost = {
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -860,6 +867,7 @@ struct processor_costs k6_cost = {
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -973,6 +981,7 @@ struct processor_costs athlon_cost = {
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1088,6 +1097,7 @@ struct processor_costs k8_cost = {
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   5,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1216,6 +1226,7 @@ struct processor_costs amdfam10_cost = {
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   3,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1336,6 +1347,7 @@ const struct processor_costs bdver_cost = {
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   16,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
@@ -1477,6 +1489,7 @@ struct processor_costs znver1_cost = {
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1633,6 +1646,7 @@ struct processor_costs znver2_cost = {
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1765,6 +1779,7 @@ struct processor_costs znver3_cost = {
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
      throughput 9.  Approx 7 uops do not depend on vector size and every load
      is 4 uops.  */
@@ -1909,6 +1924,7 @@ struct processor_costs skylake_cost = {
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2)+1,			/* cost of moving integer to sse register.  */
   20, 8,				/* Gather load static, per_elt.  */
   22, 10,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -2035,6 +2051,7 @@ struct processor_costs icelake_cost = {
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   20, 8,				/* Gather load static, per_elt.  */
   22, 10,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -2148,6 +2165,7 @@ const struct processor_costs btver1_cost = {
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2258,6 +2276,7 @@ const struct processor_costs btver2_cost = {
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   14,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2367,6 +2386,7 @@ struct processor_costs pentium4_cost = {
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   16, 16,				/* Gather load static, per_elt.  */
   16, 16,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -2479,6 +2499,7 @@ struct processor_costs nocona_cost = {
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
   20,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (2),			/* cost of moving integer to sse register.  */
   12, 12,				/* Gather load static, per_elt.  */
   12, 12,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -2589,6 +2610,7 @@ struct processor_costs atom_cost = {
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2699,6 +2721,7 @@ struct processor_costs slm_cost = {
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   8,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2809,6 +2832,7 @@ struct processor_costs intel_cost = {
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
   4,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   6, 6,					/* Gather load static, per_elt.  */
   6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2926,6 +2950,7 @@ struct processor_costs generic_cost = {
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
   6,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   18, 6,				/* Gather load static, per_elt.  */
   18, 6,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -3049,6 +3074,7 @@ struct processor_costs core_cost = {
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
   2,					/* cost of moving SSE register to integer.  */
+  COSTS_N_INSNS (1),			/* cost of moving integer to sse register.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
      rec. throughput 6.
      So 5 uops statically and one uops per load.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c
new file mode 100644
index 0000000..7ae51c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr99881.c
@@ -0,0 +1,49 @@
+/* PR target/99881.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=skylake" } */
+/* { dg-final { scan-assembler-not "xmm[0-9]" } } */
+
+void
+foo (int* __restrict a, int n, int c)
+{
+    a[0] = n;
+    a[1] = c;
+}
+
+void
+foo1 (int* __restrict a, int n, int b, int c, int d)
+{
+    a[0] = n;
+    a[1] = b;
+    a[2] = c;
+    a[3] = d;
+}
+
+void
+foo2 (int* __restrict a, int n, int b, int c, int d, int e, int f, int g, int h)
+{
+    a[0] = n;
+    a[1] = b;
+    a[2] = c;
+    a[3] = d;
+    a[4] = e;
+    a[5] = f;
+    a[6] = g;
+    a[7] = h;
+}
+
+void
+foo3 (long long* __restrict a, long long n, long long c)
+{
+    a[0] = n;
+    a[1] = c;
+}
+
+void
+foo4 (long long* __restrict a, long long n, long long b, long long c, long long d)
+{
+    a[0] = n;
+    a[1] = b;
+    a[2] = c;
+    a[3] = d;
+}
author	liuhongt <hongtao.liu@intel.com>	2021-03-26 10:56:47 +0800
committer	liuhongt <hongtao.liu@intel.com>	2021-07-28 10:48:39 +0800
commit	872da9a6f664a06d73c987aa0cb2e5b830158a10 (patch)
tree	05c56597a356b312b6f5e5396d96916db40ba97a /gcc
parent	af3f12e6e869adcbb1cec09cedba627d4bbf69a4 (diff)
download	gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.zip gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.gz gcc-872da9a6f664a06d73c987aa0cb2e5b830158a10.tar.bz2