diff options
author | Paul Brook <paul@codesourcery.com> | 2008-03-10 13:39:56 +0000 |
---|---|---|
committer | Paul Brook <pbrook@gcc.gnu.org> | 2008-03-10 13:39:56 +0000 |
commit | bd4dc3cd971c7953f458c392aa2fa0d468e3b263 (patch) | |
tree | 6a3081ec5b897b6ef344f1cf4f522e14b0ecce7e /gcc | |
parent | 3b509e336185d75d64ddba6e3866f1cfbc986111 (diff) | |
download | gcc-bd4dc3cd971c7953f458c392aa2fa0d468e3b263.zip gcc-bd4dc3cd971c7953f458c392aa2fa0d468e3b263.tar.gz gcc-bd4dc3cd971c7953f458c392aa2fa0d468e3b263.tar.bz2 |
cortex-r4.md: New.
2008-03-10 Paul Brook <paul@codesourcery.com>
Mark Shinwell <shinwell@codesourcery.com>
gcc/
* config/arm/cortex-r4.md: New.
* config/arm/thumb2.md (divsi3, udivsi3): Annotate with
insn attributes.
* config/arm/arm.md: Include cortex-r4.md.
(insn): Add smmls, sdiv and udiv values.
(generic_sched): Don't use generic scheduling for Cortex-R4.
(arm_issue_rate): New function.
(TARGET_SCHED_ISSUE_RATE): Define.
Co-Authored-By: Mark Shinwell <shinwell@codesourcery.com>
From-SVN: r133078
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 13 | ||||
-rw-r--r-- | gcc/config/arm/arm.c | 20 | ||||
-rw-r--r-- | gcc/config/arm/arm.md | 8 | ||||
-rw-r--r-- | gcc/config/arm/cortex-r4.md | 288 | ||||
-rw-r--r-- | gcc/config/arm/thumb2.md | 8 |
5 files changed, 331 insertions, 6 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 052d616..5eab262 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,16 @@ +2008-03-10 Paul Brook <paul@codesourcery.com> + Mark Shinwell <shinwell@codesourcery.com> + + gcc/ + * config/arm/cortex-r4.md: New. + * config/arm/thumb2.md (divsi3, udivsi3): Annotate with + insn attributes. + * config/arm/arm.md: Include cortex-r4.md. + (insn): Add smmls, sdiv and udiv values. + (generic_sched): Don't use generic scheduling for Cortex-R4. + (arm_issue_rate): New function. + (TARGET_SCHED_ISSUE_RATE): Define. + 2008-03-10 Sebastian Pop <sebastian.pop@amd.com> * doc/invoke.texi (-ftree-loop-distribution): Add an example. diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index fc11cb1..bddb0e2 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -188,6 +188,7 @@ static void arm_target_help (void); static unsigned HOST_WIDE_INT arm_shift_truncation_mask (enum machine_mode); static bool arm_cannot_copy_insn_p (rtx); static bool arm_tls_symbol_p (rtx x); +static int arm_issue_rate (void); static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; @@ -358,6 +359,9 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem +#undef TARGET_SCHED_ISSUE_RATE +#define TARGET_SCHED_ISSUE_RATE arm_issue_rate + #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE arm_mangle_type @@ -18710,6 +18714,22 @@ thumb2_output_casesi (rtx *operands) } } +/* Most ARM cores are single issue, but some newer ones can dual issue. + The scheduler descriptions rely on this being correct. */ +static int +arm_issue_rate (void) +{ + switch (arm_tune) + { + case cortexr4: + case cortexa8: + return 2; + + default: + return 1; + } +} + /* A table and a function to perform ARM-specific name mangling for NEON vector types in order to conform to the AAPCS (see "Procedure Call Standard for the ARM Architecture", Appendix A). To qualify diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 765b89b..6eef6502 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -1,6 +1,7 @@ ;;- Machine description for ARM for GNU compiler ;; Copyright 1991, 1993, 1994, 1995, 1996, 1996, 1997, 1998, 1999, 2000, -;; 2001, 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. +;; 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 +;; Free Software Foundation, Inc. ;; Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) ;; and Martin Simmons (@harleqn.co.uk). ;; More major hacks by Richard Earnshaw (rearnsha@arm.com). @@ -183,7 +184,7 @@ ;; scheduling information. (define_attr "insn" - "mov,mvn,smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,smmla,umaal,smlald,smlsld,clz,mrs,msr,xtab,other" + "mov,mvn,smulxy,smlaxy,smlalxy,smulwy,smlawx,mul,muls,mla,mlas,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx,smmul,smmulr,smmla,umaal,smlald,smlsld,clz,mrs,msr,xtab,sdiv,udiv,other" (const_string "other")) ; TYPE attribute is used to detect floating point instructions which, if @@ -332,7 +333,7 @@ (define_attr "generic_sched" "yes,no" (const (if_then_else - (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa8") + (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa8,cortexr4") (const_string "no") (const_string "yes")))) @@ -349,6 +350,7 @@ (include "arm1026ejs.md") (include "arm1136jfs.md") (include "cortex-a8.md") +(include "cortex-r4.md") ;;--------------------------------------------------------------------------- diff --git a/gcc/config/arm/cortex-r4.md b/gcc/config/arm/cortex-r4.md new file mode 100644 index 0000000..d912f20 --- /dev/null +++ b/gcc/config/arm/cortex-r4.md @@ -0,0 +1,288 @@ +;; ARM Cortex-R4 scheduling description. +;; Copyright (C) 2007, 2008 Free Software Foundation, Inc. +;; Contributed by CodeSourcery. + +;; This file is part of GCC. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING. If not, write to +;; the Free Software Foundation, 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. + +(define_automaton "cortex_r4") + +;; We approximate the dual-issue constraints of this core using four +;; "issue units" and a reservation matrix as follows. The numbers indicate +;; the instruction groups' preferences in order. Multiple entries for +;; the same numbered preference indicate units that must be reserved +;; together. +;; +;; Issue unit: A B C ALU +;; +;; ALU w/o reg shift 1st 2nd 1st and 2nd +;; ALU w/ reg shift 1st 2nd 2nd 1st and 2nd +;; Moves 1st 2nd 2nd +;; Multiplication 1st 1st +;; Division 1st 1st +;; Load/store single 1st 1st +;; Other load/store 1st 1st +;; Branches 1st + +(define_cpu_unit "cortex_r4_issue_a" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_b" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_c" "cortex_r4") +(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4") + +(define_reservation "cortex_r4_alu" + "(cortex_r4_issue_a+cortex_r4_issue_alu)|\ + (cortex_r4_issue_b+cortex_r4_issue_alu)") +(define_reservation "cortex_r4_alu_shift_reg" + "(cortex_r4_issue_a+cortex_r4_issue_alu)|\ + (cortex_r4_issue_b+cortex_r4_issue_c+\ + cortex_r4_issue_alu)") +(define_reservation "cortex_r4_mov" + "cortex_r4_issue_a|(cortex_r4_issue_b+\ + cortex_r4_issue_alu)") +(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_mul_2" + "(cortex_r4_issue_a+cortex_r4_issue_alu)*2") +;; Division instructions execute out-of-order with respect to the +;; rest of the pipeline and only require reservations on their first and +;; final cycles. +(define_reservation "cortex_r4_div_9" + "cortex_r4_issue_a+cortex_r4_issue_alu,\ + nothing*7,\ + cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_div_10" + "cortex_r4_issue_a+cortex_r4_issue_alu,\ + nothing*8,\ + cortex_r4_issue_a+cortex_r4_issue_alu") +(define_reservation "cortex_r4_load_store" + "cortex_r4_issue_a+cortex_r4_issue_c") +(define_reservation "cortex_r4_load_store_2" + "(cortex_r4_issue_a+cortex_r4_issue_b)*2") +(define_reservation "cortex_r4_branch" "cortex_r4_issue_b") + +;; We assume that all instructions are unconditional. + +;; Data processing instructions. Moves without shifts are kept separate +;; for the purposes of the dual-issue constraints above. +(define_insn_reservation "cortex_r4_alu" 2 + (and (eq_attr "tune" "cortexr4") + (and (eq_attr "type" "alu") + (not (eq_attr "insn" "mov")))) + "cortex_r4_alu") + +(define_insn_reservation "cortex_r4_mov" 2 + (and (eq_attr "tune" "cortexr4") + (and (eq_attr "type" "alu") + (eq_attr "insn" "mov"))) + "cortex_r4_mov") + +(define_insn_reservation "cortex_r4_alu_shift" 2 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "alu_shift")) + "cortex_r4_alu") + +(define_insn_reservation "cortex_r4_alu_shift_reg" 2 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "alu_shift_reg")) + "cortex_r4_alu_shift_reg") + +;; An ALU instruction followed by an ALU instruction with no early dep. +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu") +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; In terms of availabilities, a consumer mov could theoretically be +;; issued together with a producer ALU instruction, without stalls. +;; In practice this cannot happen because mov;add (in that order) is not +;; eligible for dual issue and furthermore dual issue is not permitted +;; when a dependency is involved. We therefore note it as latency one. +;; A mov followed by another of the same is also latency one. +(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\ + cortex_r4_mov" + "cortex_r4_mov") + +;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are +;; media data processing instructions nor sad instructions. + +;; Multiplication instructions. + +(define_insn_reservation "cortex_r4_mul_4" 4 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "mul,smmul")) + "cortex_r4_mul_2") + +(define_insn_reservation "cortex_r4_mul_3" 3 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "smulxy,smulwy,smuad,smusd")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_mla_4" 4 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "mla,smmla")) + "cortex_r4_mul_2") + +(define_insn_reservation "cortex_r4_mla_3" 3 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "smlaxy,smlawy,smlad,smlsd")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_smlald" 3 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "smlald,smlsld")) + "cortex_r4_mul") + +(define_insn_reservation "cortex_r4_mull" 4 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "smull,umull,umlal,umaal")) + "cortex_r4_mul_2") + +;; A multiply or an MLA with a single-register result, followed by an +;; MLA with an accumulator dependency, has its result forwarded. +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3" + "cortex_r4_mla_3,cortex_r4_mla_4" + "arm_mac_accumulator_is_mul_result") + +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4" + "cortex_r4_mla_3,cortex_r4_mla_4" + "arm_mac_accumulator_is_mul_result") + +;; A multiply followed by an ALU instruction needing the multiply +;; result only at ALU has lower latency than one needing it at Shift. +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu") +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; A multiply followed by a mov has one cycle lower latency again. +(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_mov") +(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_mov") + +;; We guess that division of A/B using sdiv or udiv, on average, +;; is performed with B having ten more leading zeros than A. +;; This gives a latency of nine for udiv and ten for sdiv. +(define_insn_reservation "cortex_r4_udiv" 9 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "udiv")) + "cortex_r4_div_9") + +(define_insn_reservation "cortex_r4_sdiv" 10 + (and (eq_attr "tune" "cortexr4") + (eq_attr "insn" "sdiv")) + "cortex_r4_div_10") + +;; Branches. We assume correct prediction. + +(define_insn_reservation "cortex_r4_branch" 0 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "branch")) + "cortex_r4_branch") + +;; Call latencies are not predictable. A semi-arbitrary very large +;; number is used as "positive infinity" so that everything should be +;; finished by the time of return. +(define_insn_reservation "cortex_r4_call" 32 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "call")) + "nothing") + +;; Status register access instructions are not currently emitted. + +;; Load instructions. +;; We do not model the "addr_md_3cycle" cases and assume that +;; accesses following are correctly aligned. + +(define_insn_reservation "cortex_r4_load_1_2" 3 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "load1,load2")) + "cortex_r4_load_store") + +(define_insn_reservation "cortex_r4_load_3_4" 4 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "load3,load4")) + "cortex_r4_load_store_2") + +;; If a producing load is followed by an instruction consuming only +;; as a Normal Reg, there is one fewer cycle of latency. + +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu") +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 2 "cortex_r4_load_1_2" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu") +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu_shift" + "arm_no_early_alu_shift_dep") +(define_bypass 3 "cortex_r4_load_3_4" + "cortex_r4_alu_shift_reg" + "arm_no_early_alu_shift_value_dep") + +;; If a producing load is followed by an instruction consuming only +;; as a Late Reg, there are two fewer cycles of latency. Such consumer +;; instructions are moves and stores. + +(define_bypass 1 "cortex_r4_load_1_2" + "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4") +(define_bypass 2 "cortex_r4_load_3_4" + "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4") + +;; If a producer's result is required as the base or offset of a load, +;; there is an extra cycle latency. + +(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\ + cortex_r4_alu_shift_reg" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull" + "cortex_r4_load_1_2,cortex_r4_load_3_4") + +;; Store instructions. + +(define_insn_reservation "cortex_r4_store_1_2" 0 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "store1,store2")) + "cortex_r4_load_store") + +(define_insn_reservation "cortex_r4_store_3_4" 0 + (and (eq_attr "tune" "cortexr4") + (eq_attr "type" "store3,store4")) + "cortex_r4_load_store_2") + diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 0efe31f..ba45c88 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1,5 +1,5 @@ ;; ARM Thumb-2 Machine Description -;; Copyright (C) 2007 Free Software Foundation, Inc. +;; Copyright (C) 2007, 2008 Free Software Foundation, Inc. ;; Written by CodeSourcery, LLC. ;; ;; This file is part of GCC. @@ -1131,7 +1131,8 @@ (match_operand:SI 2 "s_register_operand" "r")))] "TARGET_THUMB2 && arm_arch_hwdiv" "sdiv%?\t%0, %1, %2" - [(set_attr "predicable" "yes")] + [(set_attr "predicable" "yes") + (set_attr "insn" "sdiv")] ) (define_insn "udivsi3" @@ -1140,7 +1141,8 @@ (match_operand:SI 2 "s_register_operand" "r")))] "TARGET_THUMB2 && arm_arch_hwdiv" "udiv%?\t%0, %1, %2" - [(set_attr "predicable" "yes")] + [(set_attr "predicable" "yes") + (set_attr "insn" "udiv")] ) (define_insn "*thumb2_cbz" |