1 files changed, 29 insertions, 133 deletions
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e509129..c8603b9 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2252,7 +2252,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
   /* ADDSS has throughput 2 and latency 2
      (in some cases when source is another addition).  */
-  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
   /* MULSS has throughput 2 and latency 3.  */
   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
@@ -3568,127 +3568,6 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (2),			/* Branch mispredict scale.  */
 };
 
-static stringop_algs intel_memcpy[2] = {
-  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
-             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs intel_memset[2] = {
-  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
-             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs intel_cost = {
-  {
-  /* Start of register allocator costs.  integer->integer move cost is 2. */
-  6,				     /* cost for loading QImode using movzbl */
-  {4, 4, 4},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  4, 4,				/* SSE->integer and integer->SSE moves */
-  4, 4,				/* mask->integer and integer->mask moves */
-  {4, 4, 4},				/* cost of loading mask register
-					   in QImode, HImode, SImode.  */
-  {6, 6, 6},				/* cost if storing mask register
-					   in QImode, HImode, SImode.  */
-  2,					/* cost of moving mask register.  */
-  /* End of register allocator costs.  */
-  },
-
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (3),			/*				 HI */
-   COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (2)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (26),			/*			    HI */
-   COSTS_N_INSNS (42),			/*			    SI */
-   COSTS_N_INSNS (74),			/*			    DI */
-   COSTS_N_INSNS (74)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
-  {4, 4, 4},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {6, 6, 6},				/* cost of storing integer registers */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE register
-					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE register
-					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  4,					/* cost of moving SSE register to integer.  */
-  4,					/* cost of moving integer register to SSE.  */
-  6, 6,					/* Gather load static, per_elt.  */
-  6, 6,					/* Gather store static, per_elt.  */
-  32,					/* size of l1 cache.  */
-  256,					/* size of l2 cache.  */
-  64,					/* size of prefetch block */
-  6,					/* number of parallel prefetches */
-  3,					/* Branch cost */
-  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
-  COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
-  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
-  COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (8),			/* cost of CVTSI2SS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVT(T)SS2SI instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVTPI2PS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVT(T)PS2PI instruction.  */
-  1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  intel_memcpy,
-  intel_memset,
-  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
-  "16",					/* Loop alignment.  */
-  "16:8:8",				/* Jump alignment.  */
-  "0:0:8",				/* Label alignment.  */
-  "16",					/* Func alignment.  */
-  4,					/* Small unroll limit.  */
-  2,					/* Small unroll factor.  */
-  COSTS_N_INSNS (2),			/* Branch mispredict scale.  */
-};
-
 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
 static stringop_algs lujiazui_memcpy[2] = {
   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
@@ -4065,19 +3944,36 @@ struct processor_costs shijidadao_cost = {
 
 
 
-/* Generic should produce code tuned for Core-i7 (and newer chips)
-   and btver1 (and newer chips).  */
+/* Generic should produce code tuned for Haswell (and newer chips)
+   and znver1 (and newer chips):
+   1. Don't align memory.
+   2. For known sizes, prefer vector loop, unroll loop with 4 moves or
+      stores per iteration without aligning the loop, up to 256 bytes.
+   3. For unknown sizes, use memcpy/memset.
+   4. Since each loop iteration has 4 stores and 8 stores for zeroing
+      with unroll loop may be needed, change CLEAR_RATIO to 10 so that
+      zeroing up to 72 bytes are fully unrolled with 9 stores without
+      SSE.
+ */
 
 static stringop_algs generic_memcpy[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, vector_loop, true},
+    {256, unrolled_loop, true},
+    {-1, libcall, true}}},
+  {libcall,
+   {{256, vector_loop, true},
+    {256, unrolled_loop, true},
+    {-1, libcall, true}}}};
 static stringop_algs generic_memset[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, vector_loop, true},
+    {256, unrolled_loop, true},
+    {-1, libcall, true}}},
+  {libcall,
+   {{256, vector_loop, true},
+    {256, unrolled_loop, true},
+    {-1, libcall, true}}}};
 static const
 struct processor_costs generic_cost = {
   {
@@ -4134,7 +4030,7 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
+  10,					/* CLEAR_RATIO */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */