diff options
author | Philipp Tomsich <philipp.tomsich@vrull.eu> | 2023-03-23 19:47:57 +0100 |
---|---|---|
committer | Philipp Tomsich <philipp.tomsich@vrull.eu> | 2023-04-17 12:15:04 +0200 |
commit | f200c56787f2c6f93ffb739d57d01a294ab72f68 (patch) | |
tree | f46e03114871217a86a231c7ee9e5d20d9e203e4 /gcc | |
parent | 7ec03c41de320fb747fa2a90f5d3b6db3aa4dde1 (diff) | |
download | gcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.zip gcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.tar.gz gcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.tar.bz2 |
aarch64: disable LDP via tuning structure for -mcpu=ampere1
AmpereOne (-mcpu=ampere1) breaks LDP instructions into two uops.
Given the chance that this causes instructions to slip into the next
decoding cycle and the additional overheads when handling
cacheline-crossing LDP instructions, we disable the generation of LDP
isntructions through the tuning structure from instruction combining
(such as in peephole2).
Given the code-density benefits in builtins and prologue/epilogue
expansion, we allow LDPs there.
This commit:
* adds a new tuning option AARCH64_EXTRA_TUNE_NO_LDP_COMBINE
* allows -moverride=tune=... to override this
These changes are benchmark-driven, yielding the following changes
(with a net-overall improvement):
503.bwaves_r. -0.88%
507.cactuBSSN_r 0.35%
508.namd_r 3.09%
510.parest_r -2.99%
511.povray_r 5.54%
519.lbm_r 15.83%
521.wrf_r 0.56%
526.blender_r 2.47%
527.cam4_r 0.70%
538.imagick_r 0.00%
544.nab_r -0.33%
549.fotonik3d_r. -0.42%
554.roms_r 0.00%
-------------------------
= total 1.79%
Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Co-Authored-By: Di Zhao <di.zhao@amperecomputing.com>
gcc/ChangeLog:
* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
Add AARCH64_EXTRA_TUNE_NO_LDP_COMBINE.
* config/aarch64/aarch64.cc (aarch64_operands_ok_for_ldpstp):
Check for the above tuning option when processing loads.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/aarch64/aarch64-tuning-flags.def | 3 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.cc | 18 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c | 11 |
3 files changed, 30 insertions, 2 deletions
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 712895a..52112ba 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -44,6 +44,9 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) /* Disallow load/store pair instructions on Q-registers. */ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS) +/* Disallow load-pair instructions to be formed in combine/peephole. */ +AARCH64_EXTRA_TUNING_OPTION ("no_ldp_combine", NO_LDP_COMBINE) + AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f4ef22c..0f04ab9 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1933,7 +1933,7 @@ static const struct tune_params ampere1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */ &ere1_prefetch_tune }; @@ -1971,7 +1971,7 @@ static const struct tune_params ampere1a_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */ &ere1_prefetch_tune }; @@ -26053,6 +26053,20 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, enum reg_class rclass_1, rclass_2; rtx mem_1, mem_2, reg_1, reg_2; + /* Allow the tuning structure to disable LDP instruction formation + from combining instructions (e.g., in peephole2). + TODO: Implement fine-grained tuning control for LDP and STP: + 1. control policies for load and store separately; + 2. support the following policies: + - default (use what is in the tuning structure) + - always + - never + - aligned (only if the compiler can prove that the + load will be aligned to 2 * element_size) */ + if (load && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE)) + return false; + if (load) { mem_1 = operands[1]; diff --git a/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c new file mode 100644 index 0000000..bc871f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c @@ -0,0 +1,11 @@ +/* { dg-options "-O3 -mtune=ampere1" } */ + +long +foo (long a[]) +{ + return a[0] + a[1]; +} + +/* We should see two ldrs instead of one ldp. */ +/* { dg-final { scan-assembler {\tldr\t} } } */ +/* { dg-final { scan-assembler-not {\tldp\t} } } */ |