Improve Cortex-A53 FP scheduler

The Cortex-A53 scheduler model of the FMAC bypass is not quite right for FMAC to FMAC forwarding. Experiments also show that the latencies of FP operations are too high. Rather than adding more bypasses, adjust the latencies of FP instructions to get a better schedule on average. As a result, SPECFP2006 is 1.1% faster. gcc/ * config/arm/cortex-a53.md (cortex_a53_fpalu) Adjust latency. (cortex_a53_fconst): Likewise. (cortex_a53_fpmul): Likewise. (cortex_a53_f_load_64): Likewise. (cortex_a53_f_load_many): Likewise. (cortex_a53_advsimd_alu): Likewise. (cortex_a53_advsimd_alu_q): Likewise. (cortex_a53_advsimd_mul): Likewise. (cortex_a53_advsimd_mul_q): Likewise. (fpmac bypass): Add new bypass for fpmac-fpmac case. Add missing fmul, r2f_cvt and fconst cases. From-SVN: r249200
author: Wilco Dijkstra <wdijkstr@arm.com> 2017-06-14 14:51:46 +0000
committer: Wilco Dijkstra <wilco@gcc.gnu.org> 2017-06-14 14:51:46 +0000
commit: 4524009c09ac5148643c1226f6868ef38cc44e6c (patch)
tree: 536bd4a183e5012b4b57c4eaa923a6d29d94654b
parent: ea9a08f5dfbab28de2eabfeb684b42c0615cd99d (diff)
download: gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.zip
gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.tar.gz
gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.tar.bz2
2 files changed, 30 insertions, 18 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 09a1b98..59ca506 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@
+2017-06-14  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* config/arm/cortex-a53.md (cortex_a53_fpalu) Adjust latency.
+	(cortex_a53_fconst): Likewise.
+	(cortex_a53_fpmul): Likewise.
+	(cortex_a53_f_load_64): Likewise.
+	(cortex_a53_f_load_many): Likewise.
+	(cortex_a53_advsimd_alu): Likewise.
+	(cortex_a53_advsimd_alu_q): Likewise.
+	(cortex_a53_advsimd_mul): Likewise.
+	(cortex_a53_advsimd_mul_q): Likewise.
+	(fpmac bypass): Add new bypass for fpmac-fpmac case.
+	Add missing fmul, r2f_cvt and fconst cases.
+
 2017-06-14  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/81088
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index b7e0c92..ff16e36 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -511,19 +511,19 @@
 ;; Floating-point arithmetic.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_insn_reservation "cortex_a53_fpalu" 5
+(define_insn_reservation "cortex_a53_fpalu" 4
   (and (eq_attr "tune" "cortexa53")
 	(eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov,
 			f_cvt, fcmps, fcmpd, fccmps, fccmpd, fcsel,
 			f_rints, f_rintd, f_minmaxs, f_minmaxd"))
   "cortex_a53_slot_any,cortex_a53_fp_alu")
 
-(define_insn_reservation "cortex_a53_fconst" 3
+(define_insn_reservation "cortex_a53_fconst" 2
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "type" "fconsts,fconstd"))
   "cortex_a53_slot_any,cortex_a53_fp_alu")
 
-(define_insn_reservation "cortex_a53_fpmul" 5
+(define_insn_reservation "cortex_a53_fpmul" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "type" "fmuls,fmuld"))
   "cortex_a53_slot_any,cortex_a53_fp_mul")
@@ -574,7 +574,7 @@
 ;; Floating-point load/store.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(define_insn_reservation "cortex_a53_f_load_64" 4
+(define_insn_reservation "cortex_a53_f_load_64" 3
   (and (eq_attr "tune" "cortexa53")
        (ior (eq_attr "type" "f_loads,f_loadd")
 	    (eq_attr "cortex_a53_advsimd_type"
@@ -582,7 +582,7 @@
   "cortex_a53_slot_any+cortex_a53_ls_agen,
    cortex_a53_load")
 
-(define_insn_reservation "cortex_a53_f_load_many" 5
+(define_insn_reservation "cortex_a53_f_load_many" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "cortex_a53_advsimd_type"
 		"advsimd_load_128,advsimd_load_lots"))
@@ -616,22 +616,22 @@
 ;; or a 128-bit operation in which case we require in our model that we
 ;; issue from slot 0.
 
-(define_insn_reservation "cortex_a53_advsimd_alu" 5
+(define_insn_reservation "cortex_a53_advsimd_alu" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "cortex_a53_advsimd_type" "advsimd_alu"))
   "cortex_a53_slot_any,cortex_a53_fp_alu")
 
-(define_insn_reservation "cortex_a53_advsimd_alu_q" 5
+(define_insn_reservation "cortex_a53_advsimd_alu_q" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "cortex_a53_advsimd_type" "advsimd_alu_q"))
   "cortex_a53_slot0,cortex_a53_fp_alu_q")
 
-(define_insn_reservation "cortex_a53_advsimd_mul" 5
+(define_insn_reservation "cortex_a53_advsimd_mul" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "cortex_a53_advsimd_type" "advsimd_mul"))
   "cortex_a53_slot_any,cortex_a53_fp_mul")
 
-(define_insn_reservation "cortex_a53_advsimd_mul_q" 5
+(define_insn_reservation "cortex_a53_advsimd_mul_q" 4
   (and (eq_attr "tune" "cortexa53")
        (eq_attr "cortex_a53_advsimd_type" "advsimd_mul_q"))
   "cortex_a53_slot0,cortex_a53_fp_mul_q")
@@ -710,20 +710,18 @@
 ;; multiply-accumulate operations as a bypass reducing the latency
 ;; of producing instructions to near zero.
 
-(define_bypass 1 "cortex_a53_fp*,
+(define_bypass 1 "cortex_a53_fpalu,
+		  cortex_a53_fpmul,
 		  cortex_a53_r2f,
+		  cortex_a53_r2f_cvt,
+		  cortex_a53_fconst,
 		  cortex_a53_f_load*"
 		 "cortex_a53_fpmac"
 		 "aarch_accumulator_forwarding")
 
-;; Model a bypass from the result of an FP operation to a use.
-
-(define_bypass 4 "cortex_a53_fpalu,
-		  cortex_a53_fpmul"
-		 "cortex_a53_fpalu,
-		  cortex_a53_fpmul,
-		  cortex_a53_fpmac,
-		  cortex_a53_advsimd_div*")
+(define_bypass 4 "cortex_a53_fpmac"
+		 "cortex_a53_fpmac"
+		 "aarch_accumulator_forwarding")
 
 ;; We want AESE and AESMC to end up consecutive to one another.
author	Wilco Dijkstra <wdijkstr@arm.com>	2017-06-14 14:51:46 +0000
committer	Wilco Dijkstra <wilco@gcc.gnu.org>	2017-06-14 14:51:46 +0000
commit	4524009c09ac5148643c1226f6868ef38cc44e6c (patch)
tree	536bd4a183e5012b4b57c4eaa923a6d29d94654b
parent	ea9a08f5dfbab28de2eabfeb684b42c0615cd99d (diff)
download	gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.zip gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.tar.gz gcc-4524009c09ac5148643c1226f6868ef38cc44e6c.tar.bz2