author     Richard Henderson <rth@redhat.com>    2007-03-06 07:59:38 -0800
committer  Richard Henderson <rth@gcc.gnu.org>   2007-03-06 07:59:38 -0800
commit     80fd744fdae18292b5cb67cebceea8b750656c40 (patch)
tree       8dcd9af23e8d7fadc4bbdb37a6caff7a249aa7b0 /gcc
parent     14da607343d7637050c36ee3d338156dcc431354 (diff)
download   gcc-80fd744fdae18292b5cb67cebceea8b750656c40.zip
           gcc-80fd744fdae18292b5cb67cebceea8b750656c40.tar.gz
           gcc-80fd744fdae18292b5cb67cebceea8b750656c40.tar.bz2
i386.c (x86_use_leave, [...]): Merge into ...
* config/i386/i386.c (x86_use_leave, x86_push_memory,
x86_zero_extend_with_and, x86_movx, x86_double_with_add,
x86_use_bit_test, x86_unroll_strlen, x86_deep_branch,
x86_branch_hints, x86_use_sahf, x86_partial_reg_stall,
x86_partial_flag_reg_stall, x86_use_himode_fiop, x86_use_simode_fiop,
x86_use_mov0, x86_use_cltd, x86_read_modify_write, x86_read_modify,
x86_split_long_moves, x86_promote_QImode, x86_fast_prefix,
x86_single_stringop, x86_qimode_math, x86_promote_qi_regs,
x86_himode_math, x86_promote_hi_regs, x86_sub_esp_4, x86_sub_esp_8,
x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves,
x86_partial_reg_dependency, x86_memory_mismatch_stall,
x86_prologue_using_move, x86_epilogue_using_move, x86_shift1,
x86_sse_partial_reg_dependency, x86_sse_split_regs,
x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
x86_sse_load0_by_pxor, x86_use_ffreep, x86_use_incdec,
x86_inter_unit_moves, x86_ext_80387_constants, x86_four_jump_limit,
x86_schedule, x86_use_bt, x86_pad_returns): Merge into ...
(ix86_tune_features): ... here. New array.
(x86_cmove, x86_use_xchgb, x86_cmpxchg, x86_cmpxchg8b,
x86_xadd, x86_bswap): Merge into ...
(ix86_arch_features): ... here. New array.
(x86_3dnow_a): Remove.
(x86_accumulate_outgoing_args): Make static.
(x86_arch_always_fancy_math_387): Make static.
(ix86_tune_mask, ix86_arch_mask): Move ...
(override_options): ... to local variables here. Apply the
appropriate mask to each element of ix86_arch_features and
ix86_tune_features. Adjust TARGET_CMOVE and TARGET_USE_SAHF
as were done in the old macros.
(standard_80387_constant_p): Use TARGET_EXT_80387_CONSTANTS.
* config/i386/i386.h (x86_use_leave, x86_push_memory,
x86_zero_extend_with_and, x86_use_bit_test, x86_cmove, x86_deep_branch,
x86_branch_hints, x86_unroll_strlen, x86_double_with_add,
x86_partial_reg_stall, x86_movx, x86_use_himode_fiop,
x86_use_simode_fiop, x86_use_mov0, x86_use_cltd, x86_use_xchgb,
x86_read_modify_write, x86_read_modify, x86_split_long_moves,
x86_promote_QImode, x86_single_stringop, x86_fast_prefix,
x86_himode_math, x86_qimode_math, x86_promote_qi_regs,
x86_promote_hi_regs, x86_integer_DFmode_moves, x86_add_esp_4,
x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8,
x86_partial_reg_dependency, x86_memory_mismatch_stall,
x86_accumulate_outgoing_args, x86_prologue_using_move,
x86_epilogue_using_move, x86_decompose_lea,
x86_arch_always_fancy_math_387, x86_shift1,
x86_sse_partial_reg_dependency, x86_sse_split_regs,
x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
x86_sse_load0_by_pxor, x86_use_ffreep, x86_inter_unit_moves,
x86_schedule, x86_use_bt, x86_cmpxchg, x86_cmpxchg8b, x86_xadd,
x86_use_incdec, x86_pad_returns, x86_bswap,
x86_partial_flag_reg_stall): Remove.
(enum ix86_tune_indices): New.
(ix86_tune_features): New.
(TARGET_USE_LEAVE, TARGET_PUSH_MEMORY, TARGET_ZERO_EXTEND_WITH_AND,
TARGET_USE_BIT_TEST, TARGET_UNROLL_STRLEN,
TARGET_DEEP_BRANCH_PREDICTION, TARGET_BRANCH_PREDICTION_HINTS,
TARGET_DOUBLE_WITH_ADD, TARGET_USE_SAHF, TARGET_MOVX,
TARGET_PARTIAL_REG_STALL, TARGET_PARTIAL_FLAG_REG_STALL,
TARGET_USE_HIMODE_FIOP, TARGET_USE_SIMODE_FIOP, TARGET_USE_MOV0,
TARGET_USE_CLTD, TARGET_USE_XCHGB, TARGET_SPLIT_LONG_MOVES,
TARGET_READ_MODIFY_WRITE, TARGET_READ_MODIFY, TARGET_PROMOTE_QImode,
TARGET_FAST_PREFIX, TARGET_SINGLE_STRINGOP, TARGET_QIMODE_MATH,
TARGET_HIMODE_MATH, TARGET_PROMOTE_QI_REGS, TARGET_PROMOTE_HI_REGS,
TARGET_ADD_ESP_4, TARGET_ADD_ESP_8, TARGET_SUB_ESP_4,
TARGET_SUB_ESP_8, TARGET_INTEGER_DFMODE_MOVES,
TARGET_PARTIAL_REG_DEPENDENCY, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
TARGET_SSE_UNALIGNED_MOVE_OPTIMAL, TARGET_SSE_SPLIT_REGS,
TARGET_SSE_TYPELESS_STORES, TARGET_SSE_LOAD0_BY_PXOR,
TARGET_MEMORY_MISMATCH_STALL, TARGET_PROLOGUE_USING_MOVE,
TARGET_EPILOGUE_USING_MOVE, TARGET_SHIFT1, TARGET_USE_FFREEP,
TARGET_INTER_UNIT_MOVES, TARGET_FOUR_JUMP_LIMIT, TARGET_SCHEDULE,
TARGET_USE_BT, TARGET_USE_INCDEC, TARGET_PAD_RETURNS,
TARGET_EXT_80387_CONSTANTS): Use it.
(enum ix86_arch_indices): New.
(ix86_arch_features): New.
(TARGET_CMOVE, TARGET_CMPXCHG, TARGET_CMPXCHG8B, TARGET_XADD,
TARGET_BSWAP): Use it.
(ix86_tune_mask, ix86_arch_mask): Remove.

From-SVN: r122621
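For readers skimming the patch: the change collapses the dozens of per-feature
`const int x86_*' bitmask globals into one enum-indexed array per category, so
each TARGET_* macro becomes a plain array lookup instead of an AND against
ix86_tune_mask. A minimal sketch of that shape follows (two tuning features and
three CPU bits only; the real tables in i386.c/i386.h cover roughly fifty
features and every processor mask, so treat the values here as truncated
placeholders):

/* Sketch only -- not the full GCC tables.  Before this patch, every
   feature was its own global bitmask, tested in its TARGET_* macro:

     const int x86_use_leave = m_386 | m_K6_GEODE | ...;
     #define TARGET_USE_LEAVE (x86_use_leave & ix86_tune_mask)

   After the patch, the masks live in one enum-indexed table and the
   macros index it directly; the masking against the selected CPU
   happens once, in override_options.  */

#define m_386  (1u << 0)
#define m_486  (1u << 1)
#define m_PENT (1u << 2)

enum ix86_tune_indices {
  X86_TUNE_USE_LEAVE,
  X86_TUNE_ZERO_EXTEND_WITH_AND,
  X86_TUNE_LAST
};

unsigned int ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_USE_LEAVE (value truncated here) */  m_386,
  /* X86_TUNE_ZERO_EXTEND_WITH_AND */              m_486 | m_PENT,
};

#define TARGET_USE_LEAVE ix86_tune_features[X86_TUNE_USE_LEAVE]
#define TARGET_ZERO_EXTEND_WITH_AND \
  ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]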
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/ChangeLog            80
-rw-r--r--  gcc/config/i386/i386.c  461
-rw-r--r--  gcc/config/i386/i386.h  266
3 files changed, 517 insertions, 290 deletions
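The other half of the scheme sits in override_options, shown in the i386.c hunk
below: the -march bit masks ix86_arch_features (ISA tests such as cmpxchg,
bswap, cmov) and the -mtune bit masks ix86_tune_features, after which the two
conditions the old TARGET_CMOVE and TARGET_USE_SAHF macros carried inline are
applied once. A condensed, standalone sketch of that logic; the tiny enums and
the target_sse/target_64bit parameters are placeholders for the real indices
and target flags:

/* Standalone sketch of the override_options fix-ups (placeholder enums
   and initial masks, not the real tables).  */
enum { X86_ARCH_CMOVE, X86_ARCH_BSWAP, X86_ARCH_LAST };
enum { X86_TUNE_USE_SAHF, X86_TUNE_SCHEDULE, X86_TUNE_LAST };

unsigned int ix86_arch_features[X86_ARCH_LAST] = { ~0u, ~0u };
unsigned int ix86_tune_features[X86_TUNE_LAST] = { ~0u, ~0u };

void
apply_cpu_masks (unsigned int ix86_arch, unsigned int ix86_tune,
                 int target_sse, int target_64bit)
{
  unsigned int ix86_arch_mask = 1u << ix86_arch;
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  int i;

  /* Reduce every per-feature CPU mask to "on or off for this CPU".  */
  for (i = 0; i < X86_ARCH_LAST; ++i)
    ix86_arch_features[i] &= ix86_arch_mask;
  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] &= ix86_tune_mask;

  /* SSE needs fcomi, so it is safe to enable all cmov instructions;
     previously this was folded into the TARGET_CMOVE macro itself.  */
  if (target_sse)
    ix86_arch_features[X86_ARCH_CMOVE] = 1;

  /* sahf stays disabled for 64-bit, as the old TARGET_USE_SAHF did.  */
  if (target_64bit)
    ix86_tune_features[X86_TUNE_USE_SAHF] = 0;
}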
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4430d04..bc27cf5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,83 @@
+2007-03-06 Richard Henderson <rth@redhat.com>
+
+ * config/i386/i386.c (x86_use_leave, x86_push_memory,
+ x86_zero_extend_with_and, x86_movx, x86_double_with_add,
+ x86_use_bit_test, x86_unroll_strlen, x86_deep_branch,
+ x86_branch_hints, x86_use_sahf, x86_partial_reg_stall,
+ x86_partial_flag_reg_stall, x86_use_himode_fiop, x86_use_simode_fiop,
+ x86_use_mov0, x86_use_cltd, x86_read_modify_write, x86_read_modify,
+ x86_split_long_moves, x86_promote_QImode, x86_fast_prefix,
+ x86_single_stringop, x86_qimode_math, x86_promote_qi_regs,
+ x86_himode_math, x86_promote_hi_regs, x86_sub_esp_4, x86_sub_esp_8,
+ x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves,
+ x86_partial_reg_dependency, x86_memory_mismatch_stall,
+ x86_prologue_using_move, x86_epilogue_using_move, x86_shift1,
+ x86_sse_partial_reg_dependency, x86_sse_split_regs,
+ x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
+ x86_sse_load0_by_pxor, x86_use_ffreep, x86_use_incdec,
+ x86_inter_unit_moves, x86_ext_80387_constants, x86_four_jump_limit,
+ x86_schedule, x86_use_bt, x86_pad_returns): Merge into ...
+ (ix86_tune_features): ... here. New array.
+ (x86_cmove, x86_use_xchgb, x86_cmpxchg, x86_cmpxchg8b,
+ x86_xadd, x86_bswap): Merge into ...
+ (ix86_arch_features): ... here. New array.
+ (x86_3dnow_a): Remove.
+ (x86_accumulate_outgoing_args): Make static.
+ (x86_arch_always_fancy_math_387): Make static.
+ (ix86_tune_mask, ix86_arch_mask): Move ...
+ (override_options): ... to local variables here. Apply the
+ appropriate mask to each element of ix86_arch_features and
+ ix86_tune_features. Adjust TARGET_CMOVE and TARGET_USE_SAHF
+ as were done in the old macros.
+ (standard_80387_constant_p): Use TARGET_EXT_80387_CONSTANTS.
+ * config/i386/i386.h (x86_use_leave, x86_push_memory,
+ x86_zero_extend_with_and, x86_use_bit_test, x86_cmove, x86_deep_branch,
+ x86_branch_hints, x86_unroll_strlen, x86_double_with_add,
+ x86_partial_reg_stall, x86_movx, x86_use_himode_fiop,
+ x86_use_simode_fiop, x86_use_mov0, x86_use_cltd, x86_use_xchgb,
+ x86_read_modify_write, x86_read_modify, x86_split_long_moves,
+ x86_promote_QImode, x86_single_stringop, x86_fast_prefix,
+ x86_himode_math, x86_qimode_math, x86_promote_qi_regs,
+ x86_promote_hi_regs, x86_integer_DFmode_moves, x86_add_esp_4,
+ x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8,
+ x86_partial_reg_dependency, x86_memory_mismatch_stall,
+ x86_accumulate_outgoing_args, x86_prologue_using_move,
+ x86_epilogue_using_move, x86_decompose_lea,
+ x86_arch_always_fancy_math_387, x86_shift1,
+ x86_sse_partial_reg_dependency, x86_sse_split_regs,
+ x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
+ x86_sse_load0_by_pxor, x86_use_ffreep, x86_inter_unit_moves,
+ x86_schedule, x86_use_bt, x86_cmpxchg, x86_cmpxchg8b, x86_xadd,
+ x86_use_incdec, x86_pad_returns, x86_bswap,
+ x86_partial_flag_reg_stall): Remove.
+ (enum ix86_tune_indices): New.
+ (ix86_tune_features): New.
+ (TARGET_USE_LEAVE, TARGET_PUSH_MEMORY, TARGET_ZERO_EXTEND_WITH_AND,
+ TARGET_USE_BIT_TEST, TARGET_UNROLL_STRLEN,
+ TARGET_DEEP_BRANCH_PREDICTION, TARGET_BRANCH_PREDICTION_HINTS,
+ TARGET_DOUBLE_WITH_ADD, TARGET_USE_SAHF, TARGET_MOVX,
+ TARGET_PARTIAL_REG_STALL, TARGET_PARTIAL_FLAG_REG_STALL,
+ TARGET_USE_HIMODE_FIOP, TARGET_USE_SIMODE_FIOP, TARGET_USE_MOV0,
+ TARGET_USE_CLTD, TARGET_USE_XCHGB, TARGET_SPLIT_LONG_MOVES,
+ TARGET_READ_MODIFY_WRITE, TARGET_READ_MODIFY, TARGET_PROMOTE_QImode,
+ TARGET_FAST_PREFIX, TARGET_SINGLE_STRINGOP, TARGET_QIMODE_MATH,
+ TARGET_HIMODE_MATH, TARGET_PROMOTE_QI_REGS, TARGET_PROMOTE_HI_REGS,
+ TARGET_ADD_ESP_4, TARGET_ADD_ESP_8, TARGET_SUB_ESP_4,
+ TARGET_SUB_ESP_8, TARGET_INTEGER_DFMODE_MOVES,
+ TARGET_PARTIAL_REG_DEPENDENCY, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
+ TARGET_SSE_UNALIGNED_MOVE_OPTIMAL, TARGET_SSE_SPLIT_REGS,
+ TARGET_SSE_TYPELESS_STORES, TARGET_SSE_LOAD0_BY_PXOR,
+ TARGET_MEMORY_MISMATCH_STALL, TARGET_PROLOGUE_USING_MOVE,
+ TARGET_EPILOGUE_USING_MOVE, TARGET_SHIFT1, TARGET_USE_FFREEP,
+ TARGET_INTER_UNIT_MOVES, TARGET_FOUR_JUMP_LIMIT, TARGET_SCHEDULE,
+ TARGET_USE_BT, TARGET_USE_INCDEC, TARGET_PAD_RETURNS,
+ TARGET_EXT_80387_CONSTANTS): Use it.
+ (enum ix86_arch_indices): New.
+ (ix86_arch_features): New.
+ (TARGET_CMOVE, TARGET_CMPXCHG, TARGET_CMPXCHG8B, TARGET_XADD,
+ TARGET_BSWAP): Use it.
+ (ix86_tune_mask, ix86_arch_mask): Remove.
+
2007-03-06 Joseph Myers <joseph@codesourcery.com>
PR bootstrap/31020
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ac36887..cf3b3ff 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1004,187 +1004,221 @@ const struct processor_costs *ix86_cost = &pentium_cost;
(PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
-/* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
- Generic64 seems like good code size tradeoff. We can't enable it for 32bit
- generic because it is not working well with PPro base chips. */
-const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
- | m_GENERIC64;
-const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_zero_extend_with_and = m_486 | m_PENT;
-/* Enable to zero extend integer registers to avoid partial dependencies */
-const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
-const int x86_double_with_add = ~m_386;
-const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
- | m_K6 | m_CORE2 | m_GENERIC;
-const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
- | m_NOCONA;
-const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
-const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
- | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-/* Branch hints were put in P4 based on simulation result. But
- after P4 was made, no performance benefit was observed with
- branch hints. It also increases the code size. As the result,
- icc never generates branch hints. */
-const int x86_branch_hints = 0;
-const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
- /*m_GENERIC | m_ATHLON_K8 ? */
-/* We probably ought to watch for partial register stalls on Generic32
- compilation setting as well. However in current implementation the
- partial register stalls are not eliminated very well - they can
- be introduced via subregs synthesized by combine and can happen
- in caller/callee saving sequences.
- Because this option pays back little on PPro based chips and is in conflict
- with partial reg. dependencies used by Athlon/P4 based chips, it is better
- to leave it off for generic32 for now. */
-const int x86_partial_reg_stall = m_PPRO;
-const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
-const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
-const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
- | m_CORE2 | m_GENERIC);
-const int x86_use_mov0 = m_K6;
-const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
-/* Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
-const int x86_use_xchgb = m_PENT4;
-const int x86_read_modify_write = ~m_PENT;
-const int x86_read_modify = ~(m_PENT | m_PPRO);
-const int x86_split_long_moves = m_PPRO;
-const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
- | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
- /* m_PENT4 ? */
-const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
-const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
-const int x86_qimode_math = ~(0);
-const int x86_promote_qi_regs = 0;
-/* On PPro this flag is meant to avoid partial register stalls. Just like
- the x86_partial_reg_stall this option might be considered for Generic32
- if our scheme for avoiding partial stalls was more effective. */
-const int x86_himode_math = ~(m_PPRO);
-const int x86_promote_hi_regs = m_PPRO;
-/* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
-const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC;
-const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
- | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC;
-const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
- | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-/* Enable if integer moves are preferred for DFmode copies */
-const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
- | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
-const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC;
-const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
- | m_CORE2 | m_GENERIC;
-/* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
- for outgoing arguments will be computed and placed into the variable
- `current_function_outgoing_args_size'. No space will be pushed onto the stack
- for each call; instead, the function prologue should increase the stack frame
- size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
- not proper. */
-const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
- | m_NOCONA | m_PPRO | m_CORE2
- | m_GENERIC;
-const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
-const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
-const int x86_shift1 = ~m_486;
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
- | m_ATHLON_K8_AMDFAM10 | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC;
-/* In Generic model we have an conflict here in between PPro/Pentium4 based chips
- that thread 128bit SSE registers as single units versus K8 based chips that
- divide SSE registers to two 64bit halves.
- x86_sse_partial_reg_dependency promote all store destinations to be 128bit
- to allow register renaming on 128bit SSE units, but usually results in one
- extra microop on 64bit SSE units. Experimental results shows that disabling
- this option on P4 brings over 20% SPECfp regression, while enabling it on
- K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
- of moves. */
-const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
- | m_GENERIC | m_AMDFAM10;
-/* Set for machines where the type and dependencies are resolved on SSE
- register parts instead of whole registers, so we may maintain just
- lower part of scalar values in proper format leaving the upper part
- undefined. */
-const int x86_sse_split_regs = m_ATHLON_K8;
-/* Code generation for scalar reg-reg moves of single and double precision data:
- if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
- movaps reg, reg
- else
- movss reg, reg
- if (x86_sse_partial_reg_dependency == true)
- movapd reg, reg
- else
- movsd reg, reg
+/* Feature tests against the various tunings. */
+unsigned int ix86_tune_features[X86_TUNE_LAST] = {
+ /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
+ negatively, so enabling for Generic64 seems like good code size
+ tradeoff. We can't enable it for 32bit generic because it does not
+ work well with PPro base chips. */
+ m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
+
+ /* X86_TUNE_PUSH_MEMORY */
+ m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
+ | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_ZERO_EXTEND_WITH_AND */
+ m_486 | m_PENT,
+
+ /* X86_TUNE_USE_BIT_TEST */
+ m_386,
+
+ /* X86_TUNE_UNROLL_STRLEN */
+ m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_DEEP_BRANCH_PREDICTION */
+ m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
+ | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
+ on simulation result. But after P4 was made, no performance benefit
+ was observed with branch hints. It also increases the code size.
+ As a result, icc never generates branch hints. */
+ 0,
+
+ /* X86_TUNE_DOUBLE_WITH_ADD */
+ ~m_386,
+
+ /* X86_TUNE_USE_SAHF */
+ m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32,
+ /* | m_GENERIC | m_ATHLON_K8 ? */
+
+ /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
+ partial dependencies */
+ m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
+ | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
+
+ /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
+ register stalls on Generic32 compilation setting as well. However
+ in current implementation the partial register stalls are not eliminated
+ very well - they can be introduced via subregs synthesized by combine
+ and can happen in caller/callee saving sequences. Because this option
+ pays back little on PPro based chips and is in conflict with partial reg
+ dependencies used by Athlon/P4 based chips, it is better to leave it off
+ for generic32 for now. */
+ m_PPRO,
+
+ /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
+ m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_USE_HIMODE_FIOP */
+ m_386 | m_486 | m_K6_GEODE,
- Code generation for scalar loads of double precision data:
- if (x86_sse_split_regs == true)
- movlpd mem, reg (gas syntax)
- else
- movsd mem, reg
-
- Code generation for unaligned packed loads of single precision data
- (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
- if (x86_sse_unaligned_move_optimal)
- movups mem, reg
+ /* X86_TUNE_USE_SIMODE_FIOP */
+ ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
- if (x86_sse_partial_reg_dependency == true)
- {
- xorps reg, reg
- movlps mem, reg
- movhps mem+8, reg
- }
- else
- {
- movlps mem, reg
- movhps mem+8, reg
- }
+ /* X86_TUNE_USE_MOV0 */
+ m_K6,
+
+ /* X86_TUNE_USE_CLTD */
+ ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
- Code generation for unaligned packed loads of double precision data
- (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
- if (x86_sse_unaligned_move_optimal)
- movupd mem, reg
+ /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
+ m_PENT4,
- if (x86_sse_split_regs == true)
- {
- movlpd mem, reg
- movhpd mem+8, reg
- }
- else
- {
- movsd mem, reg
- movhpd mem+8, reg
- }
- */
-const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
-const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
-const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
-const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
-const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
-
-const int x86_inter_unit_moves = ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC);
-
-const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
- | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
-/* Some CPU cores are not able to predict more than 4 branch instructions in
- the 16 byte window. */
-const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
- | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
- | m_CORE2 | m_GENERIC;
-const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
-/* Compare and exchange was added for 80486. */
-const int x86_cmpxchg = ~m_386;
-/* Compare and exchange 8 bytes was added for pentium. */
-const int x86_cmpxchg8b = ~(m_386 | m_486);
-/* Exchange and add was added for 80486. */
-const int x86_xadd = ~m_386;
-/* Byteswap was added for 80486. */
-const int x86_bswap = ~m_386;
-const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
+ /* X86_TUNE_SPLIT_LONG_MOVES */
+ m_PPRO,
+
+ /* X86_TUNE_READ_MODIFY_WRITE */
+ ~m_PENT,
+
+ /* X86_TUNE_READ_MODIFY */
+ ~(m_PENT | m_PPRO),
+
+ /* X86_TUNE_PROMOTE_QIMODE */
+ m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
+ | m_GENERIC /* | m_PENT4 ? */,
+
+ /* X86_TUNE_FAST_PREFIX */
+ ~(m_PENT | m_486 | m_386),
+
+ /* X86_TUNE_SINGLE_STRINGOP */
+ m_386 | m_PENT4 | m_NOCONA,
+
+ /* X86_TUNE_QIMODE_MATH */
+ ~0,
+
+ /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
+ register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
+ might be considered for Generic32 if our scheme for avoiding partial
+ stalls was more effective. */
+ ~m_PPRO,
+
+ /* X86_TUNE_PROMOTE_QI_REGS */
+ 0,
+
+ /* X86_TUNE_PROMOTE_HI_REGS */
+ m_PPRO,
+
+ /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
+ m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_ADD_ESP_8 */
+ m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
+ | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_SUB_ESP_4 */
+ m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_SUB_ESP_8 */
+ m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
+ | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
+ for DFmode copies */
+ ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+ | m_GENERIC | m_GEODE),
+
+ /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
+ m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
+ conflict here in between PPro/Pentium4 based chips that thread 128bit
+ SSE registers as single units versus K8 based chips that divide SSE
+ registers to two 64bit halves. This knob promotes all store destinations
+ to be 128bit to allow register renaming on 128bit SSE units, but usually
+ results in one extra microop on 64bit SSE units. Experimental results
+ shows that disabling this option on P4 brings over 20% SPECfp regression,
+ while enabling it on K8 brings roughly 2.4% regression that can be partly
+ masked by careful scheduling of moves. */
+ m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
+
+ /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
+ m_AMDFAM10,
+
+ /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
+ are resolved on SSE register parts instead of whole registers, so we may
+ maintain just lower part of scalar values in proper format leaving the
+ upper part undefined. */
+ m_ATHLON_K8,
+
+ /* X86_TUNE_SSE_TYPELESS_STORES */
+ m_ATHLON_K8_AMDFAM10,
+
+ /* X86_TUNE_SSE_LOAD0_BY_PXOR */
+ m_PPRO | m_PENT4 | m_NOCONA,
+
+ /* X86_TUNE_MEMORY_MISMATCH_STALL */
+ m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_PROLOGUE_USING_MOVE */
+ m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_EPILOGUE_USING_MOVE */
+ m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_SHIFT1 */
+ ~m_486,
+
+ /* X86_TUNE_USE_FFREEP */
+ m_ATHLON_K8_AMDFAM10,
+
+ /* X86_TUNE_INTER_UNIT_MOVES */
+ ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
+
+ /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
+ than 4 branch instructions in the 16 byte window. */
+ m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_SCHEDULE */
+ m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_USE_BT */
+ m_ATHLON_K8_AMDFAM10,
+
+ /* X86_TUNE_USE_INCDEC */
+ ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC),
+
+ /* X86_TUNE_PAD_RETURNS */
+ m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
+
+ /* X86_TUNE_EXT_80387_CONSTANTS */
+ m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
+};
+
+/* Feature tests against the various architecture variations. */
+unsigned int ix86_arch_features[X86_ARCH_LAST] = {
+ /* X86_ARCH_CMOVE */
+ m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA,
+
+ /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
+ ~m_386,
+
+ /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
+ ~(m_386 | m_486),
+
+ /* X86_ARCH_XADD: Exchange and add was added for 80486. */
+ ~m_386,
+
+ /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
+ ~m_386,
+};
+
+static const unsigned int x86_accumulate_outgoing_args
+ = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
+
+static const unsigned int x86_arch_always_fancy_math_387
+ = m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
+ | m_NOCONA | m_CORE2 | m_GENERIC;
static enum stringop_alg stringop_alg = no_stringop;
@@ -1397,11 +1431,9 @@ enum fpmath_unit ix86_fpmath;
/* Which cpu are we scheduling for. */
enum processor_type ix86_tune;
-int ix86_tune_mask;
/* Which instruction set architecture to use. */
enum processor_type ix86_arch;
-int ix86_arch_mask;
/* true if sse prefetch instruction is not NOOP. */
int x86_prefetch_sse;
@@ -1811,6 +1843,7 @@ override_options (void)
{
int i;
int ix86_tune_defaulted = 0;
+ unsigned int ix86_arch_mask, ix86_tune_mask;
/* Comes from final.c -- no real reason to change it. */
#define MAX_CODE_ALIGN 16
@@ -2124,6 +2157,10 @@ override_options (void)
if (i == pta_size)
error ("bad value (%s) for -march= switch", ix86_arch_string);
+ ix86_arch_mask = 1u << ix86_arch;
+ for (i = 0; i < X86_ARCH_LAST; ++i)
+ ix86_arch_features[i] &= ix86_arch_mask;
+
for (i = 0; i < pta_size; i++)
if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
{
@@ -2155,8 +2192,9 @@ override_options (void)
if (i == pta_size)
error ("bad value (%s) for -mtune= switch", ix86_tune_string);
- ix86_arch_mask = 1 << ix86_arch;
- ix86_tune_mask = 1 << ix86_tune;
+ ix86_tune_mask = 1u << ix86_tune;
+ for (i = 0; i < X86_TUNE_LAST; ++i)
+ ix86_tune_features[i] &= ix86_tune_mask;
if (optimize_size)
ix86_cost = &size_cost;
@@ -2366,7 +2404,6 @@ override_options (void)
error ("-msseregparm used without SSE enabled");
ix86_fpmath = TARGET_FPMATH_DEFAULT;
-
if (ix86_fpmath_string != 0)
{
if (! strcmp (ix86_fpmath_string, "387"))
@@ -2425,6 +2462,15 @@ override_options (void)
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
}
+ /* For sane SSE instruction set generation we need fcomi instruction.
+ It is safe to enable all CMOVE instructions. */
+ if (TARGET_SSE)
+ TARGET_CMOVE = 1;
+
+ /* ??? Any idea why this is unconditionally disabled for 64-bit? */
+ if (TARGET_64BIT)
+ TARGET_USE_SAHF = 0;
+
/* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
{
char *p;
@@ -4999,7 +5045,7 @@ standard_80387_constant_p (rtx x)
/* For XFmode constants, try to find a special 80387 instruction when
optimizing for size or on those CPUs that benefit from them. */
if (GET_MODE (x) == XFmode
- && (optimize_size || x86_ext_80387_constants & ix86_tune_mask))
+ && (optimize_size || TARGET_EXT_80387_CONSTANTS))
{
int i;
@@ -9499,6 +9545,55 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
/* Implement the movmisalign patterns for SSE. Non-SSE modes go
straight to ix86_expand_vector_move. */
+/* Code generation for scalar reg-reg moves of single and double precision data:
+ if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
+ movaps reg, reg
+ else
+ movss reg, reg
+ if (x86_sse_partial_reg_dependency == true)
+ movapd reg, reg
+ else
+ movsd reg, reg
+
+ Code generation for scalar loads of double precision data:
+ if (x86_sse_split_regs == true)
+ movlpd mem, reg (gas syntax)
+ else
+ movsd mem, reg
+
+ Code generation for unaligned packed loads of single precision data
+ (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
+ if (x86_sse_unaligned_move_optimal)
+ movups mem, reg
+
+ if (x86_sse_partial_reg_dependency == true)
+ {
+ xorps reg, reg
+ movlps mem, reg
+ movhps mem+8, reg
+ }
+ else
+ {
+ movlps mem, reg
+ movhps mem+8, reg
+ }
+
+ Code generation for unaligned packed loads of double precision data
+ (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
+ if (x86_sse_unaligned_move_optimal)
+ movupd mem, reg
+
+ if (x86_sse_split_regs == true)
+ {
+ movlpd mem, reg
+ movhpd mem+8, reg
+ }
+ else
+ {
+ movsd mem, reg
+ movhpd mem+8, reg
+ }
+ */
void
ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f77fc76..8e3032c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -179,110 +179,165 @@ extern const struct processor_costs *ix86_cost;
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
-extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
-extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
-extern const int x86_branch_hints, x86_unroll_strlen;
-extern const int x86_double_with_add, x86_partial_reg_stall, x86_movx;
-extern const int x86_use_himode_fiop, x86_use_simode_fiop;
-extern const int x86_use_mov0, x86_use_cltd, x86_use_xchgb;
-extern const int x86_read_modify_write, x86_read_modify, x86_split_long_moves;
-extern const int x86_promote_QImode, x86_single_stringop, x86_fast_prefix;
-extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
-extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
-extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
-extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
-extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
-extern const int x86_epilogue_using_move, x86_decompose_lea;
-extern const int x86_arch_always_fancy_math_387, x86_shift1;
-extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs;
-extern const int x86_sse_unaligned_move_optimal;
-extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor;
-extern const int x86_use_ffreep;
-extern const int x86_inter_unit_moves, x86_schedule;
-extern const int x86_use_bt;
-extern const int x86_cmpxchg, x86_cmpxchg8b, x86_xadd;
-extern const int x86_use_incdec;
-extern const int x86_pad_returns;
-extern const int x86_bswap;
-extern const int x86_partial_flag_reg_stall;
-extern int x86_prefetch_sse, x86_cmpxchg16b;
-
-#define TARGET_USE_LEAVE (x86_use_leave & ix86_tune_mask)
-#define TARGET_PUSH_MEMORY (x86_push_memory & ix86_tune_mask)
-#define TARGET_ZERO_EXTEND_WITH_AND (x86_zero_extend_with_and & ix86_tune_mask)
-#define TARGET_USE_BIT_TEST (x86_use_bit_test & ix86_tune_mask)
-#define TARGET_UNROLL_STRLEN (x86_unroll_strlen & ix86_tune_mask)
-/* For sane SSE instruction set generation we need fcomi instruction. It is
- safe to enable all CMOVE instructions. */
-#define TARGET_CMOVE ((x86_cmove & ix86_arch_mask) || TARGET_SSE)
-#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
-#define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & ix86_tune_mask)
-#define TARGET_BRANCH_PREDICTION_HINTS (x86_branch_hints & ix86_tune_mask)
-#define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & ix86_tune_mask)
-#define TARGET_USE_SAHF ((x86_use_sahf & ix86_tune_mask) && !TARGET_64BIT)
-#define TARGET_MOVX (x86_movx & ix86_tune_mask)
-#define TARGET_PARTIAL_REG_STALL (x86_partial_reg_stall & ix86_tune_mask)
-#define TARGET_PARTIAL_FLAG_REG_STALL \
- (x86_partial_flag_reg_stall & ix86_tune_mask)
-#define TARGET_USE_HIMODE_FIOP (x86_use_himode_fiop & ix86_tune_mask)
-#define TARGET_USE_SIMODE_FIOP (x86_use_simode_fiop & ix86_tune_mask)
-#define TARGET_USE_MOV0 (x86_use_mov0 & ix86_tune_mask)
-#define TARGET_USE_CLTD (x86_use_cltd & ix86_tune_mask)
-#define TARGET_USE_XCHGB (x86_use_xchgb & ix86_tune_mask)
-#define TARGET_SPLIT_LONG_MOVES (x86_split_long_moves & ix86_tune_mask)
-#define TARGET_READ_MODIFY_WRITE (x86_read_modify_write & ix86_tune_mask)
-#define TARGET_READ_MODIFY (x86_read_modify & ix86_tune_mask)
-#define TARGET_PROMOTE_QImode (x86_promote_QImode & ix86_tune_mask)
-#define TARGET_FAST_PREFIX (x86_fast_prefix & ix86_tune_mask)
-#define TARGET_SINGLE_STRINGOP (x86_single_stringop & ix86_tune_mask)
-#define TARGET_QIMODE_MATH (x86_qimode_math & ix86_tune_mask)
-#define TARGET_HIMODE_MATH (x86_himode_math & ix86_tune_mask)
-#define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & ix86_tune_mask)
-#define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & ix86_tune_mask)
-#define TARGET_ADD_ESP_4 (x86_add_esp_4 & ix86_tune_mask)
-#define TARGET_ADD_ESP_8 (x86_add_esp_8 & ix86_tune_mask)
-#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & ix86_tune_mask)
-#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & ix86_tune_mask)
-#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & ix86_tune_mask)
-#define TARGET_PARTIAL_REG_DEPENDENCY \
- (x86_partial_reg_dependency & ix86_tune_mask)
-#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
- (x86_sse_partial_reg_dependency & ix86_tune_mask)
-#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
- (x86_sse_unaligned_move_optimal & ix86_tune_mask)
-#define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & ix86_tune_mask)
-#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & ix86_tune_mask)
-#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & ix86_tune_mask)
-#define TARGET_MEMORY_MISMATCH_STALL \
- (x86_memory_mismatch_stall & ix86_tune_mask)
-#define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & ix86_tune_mask)
-#define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & ix86_tune_mask)
-#define TARGET_PREFETCH_SSE (x86_prefetch_sse)
-#define TARGET_SHIFT1 (x86_shift1 & ix86_tune_mask)
-#define TARGET_USE_FFREEP (x86_use_ffreep & ix86_tune_mask)
-#define TARGET_INTER_UNIT_MOVES (x86_inter_unit_moves & ix86_tune_mask)
-#define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & ix86_tune_mask)
-#define TARGET_SCHEDULE (x86_schedule & ix86_tune_mask)
-#define TARGET_USE_BT (x86_use_bt & ix86_tune_mask)
-#define TARGET_USE_INCDEC (x86_use_incdec & ix86_tune_mask)
-#define TARGET_PAD_RETURNS (x86_pad_returns & ix86_tune_mask)
-
-#define ASSEMBLER_DIALECT (ix86_asm_dialect)
-
-#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0)
-#define TARGET_MIX_SSE_I387 ((ix86_fpmath & FPMATH_SSE) \
- && (ix86_fpmath & FPMATH_387))
-
-#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
-#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
-#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
-#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
-
-#define TARGET_CMPXCHG (x86_cmpxchg & ix86_arch_mask)
-#define TARGET_CMPXCHG8B (x86_cmpxchg8b & ix86_arch_mask)
-#define TARGET_CMPXCHG16B (x86_cmpxchg16b)
-#define TARGET_XADD (x86_xadd & ix86_arch_mask)
-#define TARGET_BSWAP (x86_bswap & ix86_arch_mask)
+/* Feature tests against the various tunings. */
+enum ix86_tune_indices {
+ X86_TUNE_USE_LEAVE,
+ X86_TUNE_PUSH_MEMORY,
+ X86_TUNE_ZERO_EXTEND_WITH_AND,
+ X86_TUNE_USE_BIT_TEST,
+ X86_TUNE_UNROLL_STRLEN,
+ X86_TUNE_DEEP_BRANCH_PREDICTION,
+ X86_TUNE_BRANCH_PREDICTION_HINTS,
+ X86_TUNE_DOUBLE_WITH_ADD,
+ X86_TUNE_USE_SAHF, /* && !TARGET_64BIT */
+ X86_TUNE_MOVX,
+ X86_TUNE_PARTIAL_REG_STALL,
+ X86_TUNE_PARTIAL_FLAG_REG_STALL,
+ X86_TUNE_USE_HIMODE_FIOP,
+ X86_TUNE_USE_SIMODE_FIOP,
+ X86_TUNE_USE_MOV0,
+ X86_TUNE_USE_CLTD,
+ X86_TUNE_USE_XCHGB,
+ X86_TUNE_SPLIT_LONG_MOVES,
+ X86_TUNE_READ_MODIFY_WRITE,
+ X86_TUNE_READ_MODIFY,
+ X86_TUNE_PROMOTE_QIMODE,
+ X86_TUNE_FAST_PREFIX,
+ X86_TUNE_SINGLE_STRINGOP,
+ X86_TUNE_QIMODE_MATH,
+ X86_TUNE_HIMODE_MATH,
+ X86_TUNE_PROMOTE_QI_REGS,
+ X86_TUNE_PROMOTE_HI_REGS,
+ X86_TUNE_ADD_ESP_4,
+ X86_TUNE_ADD_ESP_8,
+ X86_TUNE_SUB_ESP_4,
+ X86_TUNE_SUB_ESP_8,
+ X86_TUNE_INTEGER_DFMODE_MOVES,
+ X86_TUNE_PARTIAL_REG_DEPENDENCY,
+ X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY,
+ X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL,
+ X86_TUNE_SSE_SPLIT_REGS,
+ X86_TUNE_SSE_TYPELESS_STORES,
+ X86_TUNE_SSE_LOAD0_BY_PXOR,
+ X86_TUNE_MEMORY_MISMATCH_STALL,
+ X86_TUNE_PROLOGUE_USING_MOVE,
+ X86_TUNE_EPILOGUE_USING_MOVE,
+ X86_TUNE_SHIFT1,
+ X86_TUNE_USE_FFREEP,
+ X86_TUNE_INTER_UNIT_MOVES,
+ X86_TUNE_FOUR_JUMP_LIMIT,
+ X86_TUNE_SCHEDULE,
+ X86_TUNE_USE_BT,
+ X86_TUNE_USE_INCDEC,
+ X86_TUNE_PAD_RETURNS,
+ X86_TUNE_EXT_80387_CONSTANTS,
+
+ X86_TUNE_LAST
+};
+
+extern unsigned int ix86_tune_features[X86_TUNE_LAST];
+
+#define TARGET_USE_LEAVE ix86_tune_features[X86_TUNE_USE_LEAVE]
+#define TARGET_PUSH_MEMORY ix86_tune_features[X86_TUNE_PUSH_MEMORY]
+#define TARGET_ZERO_EXTEND_WITH_AND \
+ ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
+#define TARGET_USE_BIT_TEST ix86_tune_features[X86_TUNE_USE_BIT_TEST]
+#define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
+#define TARGET_DEEP_BRANCH_PREDICTION \
+ ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
+#define TARGET_BRANCH_PREDICTION_HINTS \
+ ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
+#define TARGET_USE_SAHF ix86_tune_features[X86_TUNE_USE_SAHF]
+#define TARGET_MOVX ix86_tune_features[X86_TUNE_MOVX]
+#define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
+#define TARGET_PARTIAL_FLAG_REG_STALL \
+ ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
+#define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
+#define TARGET_USE_MOV0 ix86_tune_features[X86_TUNE_USE_MOV0]
+#define TARGET_USE_CLTD ix86_tune_features[X86_TUNE_USE_CLTD]
+#define TARGET_USE_XCHGB ix86_tune_features[X86_TUNE_USE_XCHGB]
+#define TARGET_SPLIT_LONG_MOVES ix86_tune_features[X86_TUNE_SPLIT_LONG_MOVES]
+#define TARGET_READ_MODIFY_WRITE ix86_tune_features[X86_TUNE_READ_MODIFY_WRITE]
+#define TARGET_READ_MODIFY ix86_tune_features[X86_TUNE_READ_MODIFY]
+#define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
+#define TARGET_FAST_PREFIX ix86_tune_features[X86_TUNE_FAST_PREFIX]
+#define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
+#define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH]
+#define TARGET_HIMODE_MATH ix86_tune_features[X86_TUNE_HIMODE_MATH]
+#define TARGET_PROMOTE_QI_REGS ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
+#define TARGET_PROMOTE_HI_REGS ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS]
+#define TARGET_ADD_ESP_4 ix86_tune_features[X86_TUNE_ADD_ESP_4]
+#define TARGET_ADD_ESP_8 ix86_tune_features[X86_TUNE_ADD_ESP_8]
+#define TARGET_SUB_ESP_4 ix86_tune_features[X86_TUNE_SUB_ESP_4]
+#define TARGET_SUB_ESP_8 ix86_tune_features[X86_TUNE_SUB_ESP_8]
+#define TARGET_INTEGER_DFMODE_MOVES \
+ ix86_tune_features[X86_TUNE_INTEGER_DFMODE_MOVES]
+#define TARGET_PARTIAL_REG_DEPENDENCY \
+ ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
+ ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
+ ix86_tune_features[X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL]
+#define TARGET_SSE_SPLIT_REGS ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
+#define TARGET_SSE_TYPELESS_STORES \
+ ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
+#define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
+#define TARGET_MEMORY_MISMATCH_STALL \
+ ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
+#define TARGET_PROLOGUE_USING_MOVE \
+ ix86_tune_features[X86_TUNE_PROLOGUE_USING_MOVE]
+#define TARGET_EPILOGUE_USING_MOVE \
+ ix86_tune_features[X86_TUNE_EPILOGUE_USING_MOVE]
+#define TARGET_SHIFT1 ix86_tune_features[X86_TUNE_SHIFT1]
+#define TARGET_USE_FFREEP ix86_tune_features[X86_TUNE_USE_FFREEP]
+#define TARGET_INTER_UNIT_MOVES ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES]
+#define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
+#define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
+#define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT]
+#define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC]
+#define TARGET_PAD_RETURNS ix86_tune_features[X86_TUNE_PAD_RETURNS]
+#define TARGET_EXT_80387_CONSTANTS \
+ ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
+
+/* Feature tests against the various architecture variations. */
+enum ix86_arch_indices {
+ X86_ARCH_CMOVE, /* || TARGET_SSE */
+ X86_ARCH_CMPXCHG,
+ X86_ARCH_CMPXCHG8B,
+ X86_ARCH_XADD,
+ X86_ARCH_BSWAP,
+
+ X86_ARCH_LAST
+};
+
+extern unsigned int ix86_arch_features[X86_ARCH_LAST];
+
+#define TARGET_CMOVE ix86_arch_features[X86_ARCH_CMOVE]
+#define TARGET_CMPXCHG ix86_arch_features[X86_ARCH_CMPXCHG]
+#define TARGET_CMPXCHG8B ix86_arch_features[X86_ARCH_CMPXCHG8B]
+#define TARGET_XADD ix86_arch_features[X86_ARCH_XADD]
+#define TARGET_BSWAP ix86_arch_features[X86_ARCH_BSWAP]
+
+#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
+
+extern int x86_prefetch_sse;
+#define TARGET_PREFETCH_SSE x86_prefetch_sse
+
+extern int x86_cmpxchg16b;
+#define TARGET_CMPXCHG16B x86_cmpxchg16b
+
+#define ASSEMBLER_DIALECT (ix86_asm_dialect)
+
+#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0)
+#define TARGET_MIX_SSE_I387 \
+ ((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387))
+
+#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
+#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
+#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
+#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
#ifndef TARGET_64BIT_DEFAULT
#define TARGET_64BIT_DEFAULT 0
@@ -2132,10 +2187,7 @@ enum processor_type
};
extern enum processor_type ix86_tune;
-extern int ix86_tune_mask;
-
extern enum processor_type ix86_arch;
-extern int ix86_arch_mask;
enum fpmath_unit
{