diff options
author | Wilco Dijkstra <wdijkstr@arm.com> | 2017-05-05 09:40:01 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco@gcc.gnu.org> | 2017-05-05 09:40:01 +0000 |
commit | 9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc (patch) | |
tree | 8e1a45a11552c8773d1770494a16e028b354522b /gcc | |
parent | dfae9048a0ce06a8f240dd17c282cb1e1eaf2097 (diff) | |
download | gcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.zip gcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.tar.gz gcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.tar.bz2 |
Code scheduling for Cortex-A53 isn't as good as it could be.
Code scheduling for Cortex-A53 isn't as good as it could be. It turns out
code runs faster overall if we place loads and stores with a dependency
closer together. To achieve this effect, this patch adds a bypass between
cortex_a53_load1 and cortex_a53_load*/cortex_a53_store* if the result of an
earlier load is used in an address calculation. This significantly improved
benchmark scores in a proprietary benchmark suite.
gcc/
* config/arm/aarch-common.c (arm_early_load_addr_dep_ptr):
New function.
(arm_early_store_addr_dep_ptr): Likewise.
* config/arm/aarch-common-protos.h
(arm_early_load_addr_dep_ptr): Add prototype.
(arm_early_store_addr_dep_ptr): Likewise.
* config/arm/cortex-a53.md: Add new bypasses.
From-SVN: r247631
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 10 | ||||
-rw-r--r-- | gcc/config/arm/aarch-common-protos.h | 2 | ||||
-rw-r--r-- | gcc/config/arm/aarch-common.c | 36 | ||||
-rw-r--r-- | gcc/config/arm/cortex-a53.md | 10 |
4 files changed, 58 insertions, 0 deletions
```diff
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 44df0d7..910658a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2017-05-05 Wilco Dijkstra <wdijkstr@arm.com>
+
+    * config/arm/aarch-common.c (arm_early_load_addr_dep_ptr):
+    New function.
+    (arm_early_store_addr_dep_ptr): Likewise.
+    * config/arm/aarch-common-protos.h
+    (arm_early_load_addr_dep_ptr): Add prototype.
+    (arm_early_store_addr_dep_ptr): Likewise.
+    * config/arm/cortex-a53.md: Add new bypasses.
+
 2017-05-05 Jakub Jelinek <jakub@redhat.com>
 
     * tree.c (next_type_uid): Change type to unsigned.
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 7c2bb4c..35d2d96 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -30,7 +30,9 @@ extern bool aarch_rev16_p (rtx);
 extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
 extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
 extern int arm_early_load_addr_dep (rtx, rtx);
+extern int arm_early_load_addr_dep_ptr (rtx, rtx);
 extern int arm_early_store_addr_dep (rtx, rtx);
+extern int arm_early_store_addr_dep_ptr (rtx, rtx);
 extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
 extern int arm_mac_accumulator_is_result (rtx, rtx);
 extern int arm_no_early_alu_shift_dep (rtx, rtx);
diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 742d2ff..6a04711 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -241,6 +241,24 @@ arm_early_load_addr_dep (rtx producer, rtx consumer)
   return reg_overlap_mentioned_p (value, addr);
 }
 
+/* Return nonzero if the CONSUMER instruction (a load) does need
+   a Pmode PRODUCER's value to calculate the address.  */
+
+int
+arm_early_load_addr_dep_ptr (rtx producer, rtx consumer)
+{
+  rtx value = arm_find_sub_rtx_with_code (PATTERN (producer), SET, false);
+  rtx addr = arm_find_sub_rtx_with_code (PATTERN (consumer), SET, false);
+
+  if (!value || !addr || !MEM_P (SET_SRC (value)))
+    return 0;
+
+  value = SET_DEST (value);
+  addr = SET_SRC (addr);
+
+  return GET_MODE (value) == Pmode && reg_overlap_mentioned_p (value, addr);
+}
+
 /* Return nonzero if the CONSUMER instruction (an ALU op) does not
    have an early register shift value or amount dependency on the
    result of PRODUCER.  */
@@ -336,6 +354,24 @@ arm_early_store_addr_dep (rtx producer, rtx consumer)
   return !arm_no_early_store_addr_dep (producer, consumer);
 }
 
+/* Return nonzero if the CONSUMER instruction (a store) does need
+   a Pmode PRODUCER's value to calculate the address.  */
+
+int
+arm_early_store_addr_dep_ptr (rtx producer, rtx consumer)
+{
+  rtx value = arm_find_sub_rtx_with_code (PATTERN (producer), SET, false);
+  rtx addr = arm_find_sub_rtx_with_code (PATTERN (consumer), SET, false);
+
+  if (!value || !addr || !MEM_P (SET_SRC (value)))
+    return 0;
+
+  value = SET_DEST (value);
+  addr = SET_DEST (addr);
+
+  return GET_MODE (value) == Pmode && reg_overlap_mentioned_p (value, addr);
+}
+
 /* Return non-zero iff the consumer (a multiply-accumulate or a
    multiple-subtract instruction) has an accumulator dependency on the
    result of the producer and no other dependency on that result.  It
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index 7cf5fc5..b7e0c92 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -254,6 +254,16 @@
                  "cortex_a53_store*"
                  "arm_no_early_store_addr_dep")
 
+;; Model a bypass for load to load/store address.
+
+(define_bypass 3 "cortex_a53_load1"
+                 "cortex_a53_load*"
+                 "arm_early_load_addr_dep_ptr")
+
+(define_bypass 3 "cortex_a53_load1"
+                 "cortex_a53_store*"
+                 "arm_early_store_addr_dep_ptr")
+
 ;; Model a GP->FP register move as similar to stores.
 
 (define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
```