/* Definitions of x86 tunable features.
   Copyright (C) 2013 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
   negatively, so enabling it for Generic64 seems like a good code-size
   tradeoff.  We can't enable it for 32-bit generic because it does not
   work well with PPro-based chips.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
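/* Illustrative sketch (not verbatim GCC output): with use_leave set,
   an epilogue such as
        movl    %ebp, %esp
        popl    %ebp
        ret
   can be emitted as the shorter
        leave
        ret
   since LEAVE restores %esp from %ebp and pops %ebp in one
   instruction.  */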
/* X86_TUNE_PUSH_MEMORY: Enable if pushing a memory operand directly
   ("push mem") is preferred over loading it into a register first.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use an AND instruction instead of
   MOVZX to zero extend.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
/* X86_TUNE_UNROLL_STRLEN: Unroll the strlen expansion when possible.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen",
          m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6
          | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to the P4
   based on simulation results, but after the P4 shipped no performance
   benefit from them was observed, and they increase code size.  As a
   result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
/* X86_TUNE_DOUBLE_WITH_ADD: Use "add reg, reg" instead of "sal $1, reg"
   to double a value in a register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
/* X86_TUNE_USE_SAHF: Controls use of the SAHF instruction.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
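/* Illustrative sketch: with movx set, a QImode load is emitted as
        movzbl  (%eax), %edx
   rather than a byte move into %dl, which would leave the upper bits
   of %edx live and create a false dependency on its old value.  */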
/* X86_TUNE_PARTIAL_REG_STALL: We ought to watch for partial register
   stalls on the Generic32 compilation setting as well.  However, in
   the current implementation partial register stalls are not
   eliminated very well: they can be introduced via subregs synthesized
   by combine, and they can happen in caller/callee saving
   sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
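/* Illustrative sketch of the stall being avoided: on PPro, a write to
   a partial register followed by a read of the full register, as in
        movb    $5, %al
        movl    %eax, %ecx
   stalls while %al is merged into %eax.  */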
/* X86_TUNE_PARTIAL_FLAG_REG_STALL: Set on CPUs that stall when the
   flags register is read after an instruction that updates only part
   of it (such as inc/dec, which leave CF unchanged).  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE_ALL | m_GENERIC)
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
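/* Illustrative sketch: the classic LCP case is a 16-bit immediate
   store such as
        movw    $1234, (%eax)
   where the 0x66 operand-size prefix changes the immediate length seen
   by the pre-decoder; loading the constant into a register first and
   storing the register avoids the stall.  */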
/* X86_TUNE_USE_HIMODE_FIOP: Enable x87 floating point operations with
   a 16-bit integer memory operand (fiadd and friends).  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
/* X86_TUNE_USE_SIMODE_FIOP: Enable x87 floating point operations with
   a 32-bit integer memory operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM
            | m_SLM | m_AMD_MULTIPLE | m_GENERIC))
/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to
   clear an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
/* X86_TUNE_USE_CLTD: Controls use of the CLTD (CDQ) instruction to
   sign extend %eax into %edx:%eax.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6))
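/* Illustrative sketch: with use_cltd set, %eax is sign extended into
   %edx before a division with
        cltd
        idivl   %ecx
   instead of the longer sequence
        movl    %eax, %edx
        sarl    $31, %edx
        idivl   %ecx  */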
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
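/* For instance, swapping the two bytes of %ax becomes
        xchgb   %ah, %al
   rather than
        rolw    $8, %ax  */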
/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions that move a large
   immediate directly to memory; move it through a register instead.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read-modify-write
   instructions such as "add %reg, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, %reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8-bit arithmetic
   into the corresponding 32-bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM
          | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_FAST_PREFIX: Set if prefixed instructions decode cheaply,
   so demoting 32-bit arithmetic to a prefixed 16-bit form is
   acceptable.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations,
   such as MOVS and STOS (without a REP prefix), for small fixed-size
   copies and clears.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
/* X86_TUNE_QIMODE_MATH: Enable use of 8-bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
/* X86_TUNE_HIMODE_MATH: Enable use of 16-bit arithmetic; disabled on
   PPro to avoid partial register stalls.  Just like
   X86_TUNE_PARTIAL_REG_STALL, this option might be considered for
   Generic32 if our scheme for avoiding partial stalls were more
   effective.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
/* X86_TUNE_PROMOTE_QI_REGS: Promote all 8-bit register operations to
   full register width.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
/* X86_TUNE_PROMOTE_HI_REGS: Promote all 16-bit register operations to
   full register width.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
/* X86_TUNE_SINGLE_POP: Enable if a single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
/* X86_TUNE_DOUBLE_POP: Enable if a double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
/* X86_TUNE_SINGLE_PUSH: Enable if a single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_K6_GEODE)
/* X86_TUNE_DOUBLE_PUSH: Enable if a double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
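/* Illustrative sketch for the four knobs above: with single_pop set,
   deallocating four bytes of stack into a dead scratch register,
        popl    %ecx
   is preferred over
        addl    $4, %esp
   and double_pop/double_push extend the same idea to eight bytes.  */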
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM
            | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Set on CPUs where a write to a
   partial register carries a dependency on the whole register, so
   full-width operations should be preferred.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
   conflict here between PPro/Pentium4-based chips that treat 128-bit
   SSE registers as single units and K8-based chips that divide SSE
   registers into two 64-bit halves.  This knob promotes all store
   destinations to be 128-bit so as to allow register renaming on
   128-bit SSE units, but it usually results in one extra microop on
   64-bit SSE units.  Experimental results show that disabling this
   option on P4 brings over 20% SPECfp regression, while enabling it on
   K8 brings roughly 2.4% regression that can be partly masked by
   careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
          | m_BDVER | m_GENERIC)
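/* Illustrative sketch: a scalar load such as
        movss   (%eax), %xmm0
   writes only the low 32 bits of %xmm0 and so depends on the old
   contents of the register; clearing it first,
        xorps   %xmm0, %xmm0
        movss   (%eax), %xmm0
   breaks the dependency at the cost of one extra instruction.  */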
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads
   instead of loading the register in parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned
   stores instead of storing the register in parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_COREI7 | m_BDVER | m_SLM)
/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single-precision
   instructions where possible (e.g. movaps instead of movapd).  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and
   dependencies are resolved on SSE register parts instead of whole
   registers, so we may maintain just the lower part of scalar values
   in the proper format, leaving the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128-bit
   stores, regardless of operand type.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", m_AMD_MULTIPLE)
/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to clear an SSE register
   rather than xorps/xorpd.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA)
/* X86_TUNE_MEMORY_MISMATCH_STALL: Set if reading memory with a size
   different from that of a preceding store to the same address causes
   a store-forwarding stall.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_PROLOGUE_USING_MOVE: Save registers in the prologue with
   moves instead of pushes where that is faster.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)
/* X86_TUNE_EPILOGUE_USING_MOVE: Restore registers in the epilogue with
   moves instead of pops where that is faster.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)
/* X86_TUNE_SHIFT1: Enable the short "shift by 1" encoding that needs
   no immediate operand byte.  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp to
   pop the x87 register stack.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable direct moves from integer
   registers to SSE registers.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_AMD_MULTIPLE | m_GENERIC))
/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable direct moves from SSE
   registers to integer registers.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)
/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   that move data directly between the SSE and integer units.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_SCHEDULE: Enable instruction scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_USE_BT: Enable use of the BT (bit test) instruction.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instead of "add $1" and
   "sub $1"; disabled on chips where the partial flags update stalls.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
/* X86_TUNE_PAD_RETURNS: Place a NOP before a RET that is the target of
   a branch or directly follows another jump; this helps branch
   prediction on AMD chips.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_PAD_SHORT_FUNCTION: Pad very short functions with NOPs;
   important for the Atom front end.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI,
   loaded with fldpi and friends.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_ATHLON_K8 | m_GENERIC)
/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid
   vector-decoded forms of instructions.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_CORE_ALL | m_K8 | m_GENERIC)
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
   HImode and SImode multiply, but the 386 and 486 do HImode multiply
   faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and a memory
   operand takes the vector path on AMD machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
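/* Illustrative sketch: instead of the slow combined form
        imull   $1000, (%eax), %edx
   the memory operand is loaded separately:
        movl    (%eax), %edx
        imull   $1000, %edx, %edx  */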
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant takes the vector
   path on AMD machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than via a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
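/* I.e. prefer the 3-byte
        orl     $-1, %eax
   over the 5-byte
        movl    $-1, %eax
   at the cost of an input dependency on the old value of %eax.  */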
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR
   is, but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
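/* I.e. replace
        notl    %eax
   with the equivalent but pairable
        xorl    $-1, %eax  */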
/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a
   memory operand that cannot be represented using a modRM byte.  The
   XOR replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer packed SSE conversions over
   scalar ones for FP-to-FP conversions.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
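/* Illustrative sketch: a single float-to-double conversion is emitted
   as the packed
        cvtps2pd        %xmm0, %xmm0
   instead of the scalar
        cvtss2sd        %xmm0, %xmm0
   which writes only the low half of the destination and so carries a
   dependency on its old value.  */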
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer packed SSE conversions over
   scalar ones for integer-to-FP conversions.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
   with a subsequent conditional jump instruction into a single
   compare-and-branch uop.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
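/* Illustrative sketch: with this flag set, the pair
        cmpl    %edx, %eax
        jne     .L1
   is kept adjacent so the hardware can fuse it into a single
   compare-and-branch macro-op.  */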
/* X86_TUNE_OPT_AGU: Optimize for the Address Generation Unit.  This
   flag will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
/* X86_TUNE_VECTORIZE_DOUBLE: Enable double-precision vector
   instructions.  */
DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM)
/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_AMD_MULTIPLE)
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation
   for the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
   during reassociation of integer computation.  */
DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
          m_ATOM)
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
   during reassociation of fp computation.  */
DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
          m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2)
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general registers to
   SSE registers instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_ATOM | m_SLM)
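/* Illustrative sketch: instead of
        cmovge  (%eax), %edx
   emit the load separately,
        movl    (%eax), %ecx
        cmovge  %ecx, %edx
   (a cmov reads its memory operand unconditionally, so splitting out
   the load is safe).  */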
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory
   operand of an FP conversion into a separate load of the destination
   register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SLM)