1 files changed, 31 insertions, 30 deletions
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index f01b8ee..18ad3cc 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1867,9 +1867,9 @@ struct processor_costs znver4_cost = {
   {8, 8, 8},				/* cost of storing integer
 					   registers.  */
   2,					/* cost of reg,reg fld/fst.  */
-  {6, 6, 16},				/* cost of loading fp registers
+  {14, 14, 17},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode.  */
-  {8, 8, 16},				/* cost of storing fp registers
+  {12, 12, 16},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode.  */
   2,					/* cost of moving MMX register.  */
   {6, 6},				/* cost of loading MMX registers
@@ -1878,13 +1878,13 @@ struct processor_costs znver4_cost = {
 					   in SImode and DImode.  */
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
-  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+  {6, 6, 10, 10, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
+  {8, 8, 8, 12, 12},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  6, 6,					/* SSE->integer and integer->SSE
+  6, 8,					/* SSE->integer and integer->SSE
 					   moves.  */
-  8, 8,				/* mask->integer and integer->mask moves */
+  8, 8,					/* mask->integer and integer->mask moves */
   {6, 6, 6},				/* cost of loading mask register
 					   in QImode, HImode, SImode.  */
   {8, 8, 8},				/* cost if storing mask register
@@ -1894,6 +1894,7 @@ struct processor_costs znver4_cost = {
   },
 
   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
+  /* TODO: Lea with 3 components has cost 2.  */
   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
   COSTS_N_INSNS (1),			/* variable shift costs.  */
   COSTS_N_INSNS (1),			/* constant shift costs.  */
@@ -1904,11 +1905,11 @@ struct processor_costs znver4_cost = {
    COSTS_N_INSNS (3)},			/*			other.  */
   0,					/* cost of multiply per each bit
 					   set.  */
-  {COSTS_N_INSNS (9),			/* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (10),			/* 			    HI.  */
-   COSTS_N_INSNS (12),			/*			    SI.  */
-   COSTS_N_INSNS (17),			/*			    DI.  */
-   COSTS_N_INSNS (17)},			/*			    other.  */
+  {COSTS_N_INSNS (12),			/* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (13),			/* 			    HI.  */
+   COSTS_N_INSNS (13),			/*			    SI.  */
+   COSTS_N_INSNS (18),			/*			    DI.  */
+   COSTS_N_INSNS (18)},			/*			    other.  */
   COSTS_N_INSNS (1),			/* cost of movsx.  */
   COSTS_N_INSNS (1),			/* cost of movzx.  */
   8,					/* "large" insn.  */
@@ -1919,22 +1920,22 @@ struct processor_costs znver4_cost = {
 					   Relative to reg-reg move (2).  */
   {8, 8, 8},				/* cost of storing integer
 					   registers.  */
-  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+  {6, 6, 10, 10, 12},			/* cost of loading SSE registers
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {8, 8, 8, 8, 16},			/* cost of storing SSE register
+  {8, 8, 8, 12, 12},			/* cost of storing SSE register
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
-  2, 2, 3,				/* cost of moving XMM,YMM,ZMM
+  {6, 6, 6, 6, 6},			/* cost of unaligned loads.  */
+  {8, 8, 8, 8, 8},			/* cost of unaligned stores.  */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
-     throughput 9.  Approx 7 uops do not depend on vector size and every load
-     is 4 uops.  */
-  14, 8,				/* Gather load static, per_elt.  */
-  14, 10,				/* Gather store static, per_elt.  */
+  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
+     throughput 5.  Approx 7 uops do not depend on vector size and every load
+     is 5 uops.  */
+  14, 10,				/* Gather load static, per_elt.  */
+  14, 20,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
-  512,					/* size of l2 cache.  */
+  1024,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
      immediately, they are queued.  We set number of simultaneous prefetches
@@ -1943,26 +1944,26 @@ struct processor_costs znver4_cost = {
      time).  */
   100,					/* number of parallel prefetches.  */
   3,					/* Branch cost.  */
-  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (7),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   /* Latency of fsqrt is 4-10.  */
-  COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
+  COSTS_N_INSNS (25),			/* cost of FSQRT instruction.  */
 
   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
-  COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
   /* 9-13.  */
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.