aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author Philipp Tomsich <philipp.tomsich@vrull.eu> 2023-03-23 19:47:57 +0100
committer Philipp Tomsich <philipp.tomsich@vrull.eu> 2023-04-17 12:15:04 +0200
commit f200c56787f2c6f93ffb739d57d01a294ab72f68 (patch)
tree f46e03114871217a86a231c7ee9e5d20d9e203e4 /gcc
parent 7ec03c41de320fb747fa2a90f5d3b6db3aa4dde1 (diff)
downloadgcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.zip
gcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.tar.gz
gcc-f200c56787f2c6f93ffb739d57d01a294ab72f68.tar.bz2
aarch64: disable LDP via tuning structure for -mcpu=ampere1
AmpereOne (-mcpu=ampere1) breaks LDP instructions into two uops. Given the chance that this causes instructions to slip into the next decoding cycle and the additional overheads when handling cacheline-crossing LDP instructions, we disable the generation of LDP instructions through the tuning structure from instruction combining (such as in peephole2). Given the code-density benefits in builtins and prologue/epilogue expansion, we allow LDPs there. This commit: * adds a new tuning option AARCH64_EXTRA_TUNE_NO_LDP_COMBINE * allows -moverride=tune=... to override this These changes are benchmark-driven, yielding the following changes (with a net-overall improvement): 503.bwaves_r. -0.88% 507.cactuBSSN_r 0.35% 508.namd_r 3.09% 510.parest_r -2.99% 511.povray_r 5.54% 519.lbm_r 15.83% 521.wrf_r 0.56% 526.blender_r 2.47% 527.cam4_r 0.70% 538.imagick_r 0.00% 544.nab_r -0.33% 549.fotonik3d_r. -0.42% 554.roms_r 0.00% ------------------------- = total 1.79% Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> Co-Authored-By: Di Zhao <di.zhao@amperecomputing.com> gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION): Add AARCH64_EXTRA_TUNE_NO_LDP_COMBINE. * config/aarch64/aarch64.cc (aarch64_operands_ok_for_ldpstp): Check for the above tuning option when processing loads. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ampere1-no_ldp_combine.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/aarch64/aarch64-tuning-flags.def3
-rw-r--r--gcc/config/aarch64/aarch64.cc18
-rw-r--r--gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c11
3 files changed, 30 insertions, 2 deletions
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 712895a..52112ba 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -44,6 +44,9 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
/* Disallow load/store pair instructions on Q-registers. */
AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
+/* Disallow load-pair instructions to be formed in combine/peephole. */
+AARCH64_EXTRA_TUNING_OPTION ("no_ldp_combine", NO_LDP_COMBINE)
+
AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f4ef22c..0f04ab9 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1933,7 +1933,7 @@ static const struct tune_params ampere1_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
&ampere1_prefetch_tune
};
@@ -1971,7 +1971,7 @@ static const struct tune_params ampere1a_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
&ampere1_prefetch_tune
};
@@ -26053,6 +26053,20 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
enum reg_class rclass_1, rclass_2;
rtx mem_1, mem_2, reg_1, reg_2;
+ /* Allow the tuning structure to disable LDP instruction formation
+ from combining instructions (e.g., in peephole2).
+ TODO: Implement fine-grained tuning control for LDP and STP:
+ 1. control policies for load and store separately;
+ 2. support the following policies:
+ - default (use what is in the tuning structure)
+ - always
+ - never
+ - aligned (only if the compiler can prove that the
+ load will be aligned to 2 * element_size) */
+ if (load && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE))
+ return false;
+
if (load)
{
mem_1 = operands[1];
diff --git a/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
new file mode 100644
index 0000000..bc871f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
@@ -0,0 +1,11 @@
+/* { dg-options "-O3 -mtune=ampere1" } */
+
+long
+foo (long a[])
+{
+ return a[0] + a[1];
+}
+
+/* We should see two ldrs instead of one ldp. */
+/* { dg-final { scan-assembler {\tldr\t} } } */
+/* { dg-final { scan-assembler-not {\tldp\t} } } */