aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorJan Hubicka <jh@suse.cz>2013-10-19 14:11:14 +0200
committerJan Hubicka <hubicka@gcc.gnu.org>2013-10-19 12:11:14 +0000
commit41ee845b75a5025e4d376d8df8661e1340b59d0a (patch)
tree432272133578a944dd10eac6df2977c235104f3b /gcc
parent322cb62ac5c93e21a859cce27c0d8e8b1b6c1c01 (diff)
downloadgcc-41ee845b75a5025e4d376d8df8661e1340b59d0a.zip
gcc-41ee845b75a5025e4d376d8df8661e1340b59d0a.tar.gz
gcc-41ee845b75a5025e4d376d8df8661e1340b59d0a.tar.bz2
i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation for cold functions.
* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation for cold functions. * x86-tune.def (X86_TUNE_USE_LEAVE): Update comment. (X86_TUNE_PUSH_MEMORY): Likewise. (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New. (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New. * i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store): Remove. (ix86_option_override_internal): Update to use tune features instead of variables. From-SVN: r203855
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog15
-rw-r--r--gcc/config/i386/i386.c23
-rw-r--r--gcc/config/i386/i386.h19
-rw-r--r--gcc/config/i386/x86-tune.def39
4 files changed, 70 insertions, 26 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index bdb9e01..086b5b2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-10-18 Jan Hubicka <jh@suse.cz>
+
+ * config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
+ for cold functions.
+ * x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
+ (X86_TUNE_PUSH_MEMORY): Likewise.
+ (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
+ X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
+ (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
+ * i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
+ x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
+ Remove.
+ (ix86_option_override_internal): Update to use tune features instead
+ of variables.
+
2013-10-18 Cong Hou <congh@google.com>
PR tree-optimization/58508
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b8c3c1d..91e6510 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1897,18 +1897,6 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
~m_386,
};
-static const unsigned int x86_accumulate_outgoing_args
- = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
- = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
- = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
- = m_COREI7 | m_BDVER | m_GENERIC;
-
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
epilogue code. */
@@ -2925,7 +2913,7 @@ ix86_option_override_internal (bool main_args_p,
struct gcc_options *opts_set)
{
int i;
- unsigned int ix86_arch_mask, ix86_tune_mask;
+ unsigned int ix86_arch_mask;
const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
const char *prefix;
const char *suffix;
@@ -3693,7 +3681,7 @@ ix86_option_override_internal (bool main_args_p,
/* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
since the insns won't need emulation. */
- if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+ if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
/* Likewise, if the target doesn't have a 387, or we've specified
@@ -3835,8 +3823,7 @@ ix86_option_override_internal (bool main_args_p,
gcc_unreachable ();
}
- ix86_tune_mask = 1u << ix86_tune;
- if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+ if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
&& !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
&& !opts->x_optimize_size)
opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3976,10 +3963,10 @@ ix86_option_override_internal (bool main_args_p,
if (flag_expensive_optimizations
&& !(opts_set->x_target_flags & MASK_VZEROUPPER))
opts->x_target_flags |= MASK_VZEROUPPER;
- if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
&& !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
- if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
&& !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
/* Enable 128-bit AVX instruction generation
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 10f7ff0..63e4903 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1544,13 +1544,26 @@ enum reg_class
will be computed and placed into the variable `crtl->outgoing_args_size'.
No space will be pushed onto the stack for each call; instead, the
function prologue should increase the stack frame size by this amount.
+
+ In 32bit mode enabling argument accumulation results in about 5% code size
+ growth because move instructions are less compact than push. In 64bit
+ mode the difference is less drastic but visible.
+
+ FIXME: Unlike earlier implementations, the size of unwind info seems to
+ actually grow with accumulation. Is that because accumulated args
+ unwind info became unnecessarily bloated?
64-bit MS ABI seem to require 16 byte alignment everywhere except for
- function prologue and apilogue. This is not possible without
- ACCUMULATE_OUTGOING_ARGS. */
+ function prologue and epilogue. This is not possible without
+ ACCUMULATE_OUTGOING_ARGS.
+
+ If stack probes are required, the space used for large function
+ arguments on the stack must also be probed, so enable
+ -maccumulate-outgoing-args so this happens in the prologue. */
#define ACCUMULATE_OUTGOING_ARGS \
- (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+ ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+ || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
/* If defined, a C expression whose value is nonzero when we want to use PUSH
instructions to pass outgoing arguments. */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 34484a2..42eee33 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
- negatively, so enabling for Generic64 seems like good code size
- tradeoff. We can't enable it for 32bit generic because it does not
- work well with PPro base chips. */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
- Some chips, like 486 and Pentium have problems with these sequences. */
+ Some chips, like 486 and Pentium work faster with separate load
+ and push instructions. */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
| m_GENERIC)
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+ split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+ ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+ split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+ ~(m_COREI7 | m_BDVER | m_GENERIC))
+
/* Use packed single precision instructions where posisble. I.e. movups instead
of movupd. */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
fp converts to destination register. */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+ arguments in prologue/epilogue instead of separately for each call
+ by push/pop instructions.
+ This increases code size by about 5% in 32bit mode, less so in 64bit mode
+ because parameters are passed in registers. It is a considerable
+ win for targets without a stack engine, which prevents multiple push operations
+ from happening in parallel.
+
+ FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+ Bobcat and Generic. This is because disabling it causes a large
+ regression on mgrid due to an IRA limitation leading to unnecessary
+ use of the frame pointer in 32bit mode. */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+ m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+ such as fsqrt, fprem, fsin, fcos, fsincos etc.
+ Should be enabled for all targets that always have a coprocessor. */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+ ~(m_386 | m_486))