aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilco Dijkstra <wdijkstr@arm.com>2017-05-05 09:40:01 +0000
committerWilco Dijkstra <wilco@gcc.gnu.org>2017-05-05 09:40:01 +0000
commit9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc (patch)
tree8e1a45a11552c8773d1770494a16e028b354522b
parentdfae9048a0ce06a8f240dd17c282cb1e1eaf2097 (diff)
downloadgcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.zip
gcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.tar.gz
gcc-9d29ae83ed27a06cdd729cd7f0cceb12c28b91dc.tar.bz2
Code scheduling for Cortex-A53 isn't as good as it could be.
Code scheduling for Cortex-A53 isn't as good as it could be. It turns out code runs faster overall if we place loads and stores with a dependency closer together. To achieve this effect, this patch adds a bypass between cortex_a53_load1 and cortex_a53_load*/cortex_a53_store* if the result of an earlier load is used in an address calculation. This significantly improved benchmark scores in a proprietary benchmark suite.

gcc/
 * config/arm/aarch-common.c (arm_early_load_addr_dep_ptr): New function.
 (arm_early_store_addr_dep_ptr): Likewise.
 * config/arm/aarch-common-protos.h (arm_early_load_addr_dep_ptr): Add prototype.
 (arm_early_store_addr_dep_ptr): Likewise.
 * config/arm/cortex-a53.md: Add new bypasses.

From-SVN: r247631
-rw-r--r--gcc/ChangeLog10
-rw-r--r--gcc/config/arm/aarch-common-protos.h2
-rw-r--r--gcc/config/arm/aarch-common.c36
-rw-r--r--gcc/config/arm/cortex-a53.md10
4 files changed, 58 insertions, 0 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 44df0d7..910658a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2017-05-05 Wilco Dijkstra <wdijkstr@arm.com>
+
+ * config/arm/aarch-common.c (arm_early_load_addr_dep_ptr):
+ New function.
+ (arm_early_store_addr_dep_ptr): Likewise.
+ * config/arm/aarch-common-protos.h
+ (arm_early_load_addr_dep_ptr): Add prototype.
+ (arm_early_store_addr_dep_ptr): Likewise.
+ * config/arm/cortex-a53.md: Add new bypasses.
+
2017-05-05 Jakub Jelinek <jakub@redhat.com>
* tree.c (next_type_uid): Change type to unsigned.
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 7c2bb4c..35d2d96 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -30,7 +30,9 @@ extern bool aarch_rev16_p (rtx);
extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
extern int arm_early_load_addr_dep (rtx, rtx);
+extern int arm_early_load_addr_dep_ptr (rtx, rtx);
extern int arm_early_store_addr_dep (rtx, rtx);
+extern int arm_early_store_addr_dep_ptr (rtx, rtx);
extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
extern int arm_mac_accumulator_is_result (rtx, rtx);
extern int arm_no_early_alu_shift_dep (rtx, rtx);
diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 742d2ff..6a04711 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -241,6 +241,24 @@ arm_early_load_addr_dep (rtx producer, rtx consumer)
return reg_overlap_mentioned_p (value, addr);
}
+/* Return nonzero if PRODUCER is a load (its SET source is a MEM) whose Pmode
+ result is mentioned in CONSUMER's SET source (for a load, its address). */
+
+int
+arm_early_load_addr_dep_ptr (rtx producer, rtx consumer)
+{
+ rtx value = arm_find_sub_rtx_with_code (PATTERN (producer), SET, false);
+ rtx addr = arm_find_sub_rtx_with_code (PATTERN (consumer), SET, false);
+
+ if (!value || !addr || !MEM_P (SET_SRC (value)))
+ return 0; /* No SET found, or PRODUCER's source is not a MEM. */
+
+ value = SET_DEST (value); /* What PRODUCER writes. */
+ addr = SET_SRC (addr); /* What CONSUMER reads. */
+
+ return GET_MODE (value) == Pmode && reg_overlap_mentioned_p (value, addr);
+}
+
/* Return nonzero if the CONSUMER instruction (an ALU op) does not
have an early register shift value or amount dependency on the
result of PRODUCER. */
@@ -336,6 +354,24 @@ arm_early_store_addr_dep (rtx producer, rtx consumer)
return !arm_no_early_store_addr_dep (producer, consumer);
}
+/* Return nonzero if PRODUCER is a load (its SET source is a MEM) whose Pmode
+ result is mentioned in CONSUMER's SET destination (for a store, its address). */
+
+int
+arm_early_store_addr_dep_ptr (rtx producer, rtx consumer)
+{
+ rtx value = arm_find_sub_rtx_with_code (PATTERN (producer), SET, false);
+ rtx addr = arm_find_sub_rtx_with_code (PATTERN (consumer), SET, false);
+
+ if (!value || !addr || !MEM_P (SET_SRC (value)))
+ return 0; /* No SET found, or PRODUCER's source is not a MEM. */
+
+ value = SET_DEST (value); /* What PRODUCER writes. */
+ addr = SET_DEST (addr); /* What CONSUMER writes. */
+
+ return GET_MODE (value) == Pmode && reg_overlap_mentioned_p (value, addr);
+}
+
/* Return non-zero iff the consumer (a multiply-accumulate or a
multiple-subtract instruction) has an accumulator dependency on the
result of the producer and no other dependency on that result. It
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index 7cf5fc5..b7e0c92 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -254,6 +254,16 @@
"cortex_a53_store*"
"arm_no_early_store_addr_dep")
+;; Model a bypass from a load's result to the address of a dependent load/store.
+
+(define_bypass 3 "cortex_a53_load1"
+ "cortex_a53_load*"
+ "arm_early_load_addr_dep_ptr")
+
+(define_bypass 3 "cortex_a53_load1"
+ "cortex_a53_store*"
+ "arm_early_store_addr_dep_ptr")
+
;; Model a GP->FP register move as similar to stores.
(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"