aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Sandiford <rsandifo@redhat.com>2004-05-06 15:27:19 +0000
committerRichard Sandiford <rsandifo@gcc.gnu.org>2004-05-06 15:27:19 +0000
commitdc884a86d3ffa4108b66a232d18eb3fa35000863 (patch)
treedc72d15b86329f3634e41b3ca914b9982b3a4a25
parente51f7aeb7ba0496ed2fb1a158255cbc41c5a2d08 (diff)
downloadgcc-dc884a86d3ffa4108b66a232d18eb3fa35000863.zip
gcc-dc884a86d3ffa4108b66a232d18eb3fa35000863.tar.gz
gcc-dc884a86d3ffa4108b66a232d18eb3fa35000863.tar.bz2
invoke.texi: Document -mvr4130-align.
* doc/invoke.texi: Document -mvr4130-align. * config/mips/mips.h (MASK_VR4130_ALIGN, TARGET_VR4130_ALIGN) (TUNE_MIPS4120, TUNE_MIPS4130): New macros. (TUNE_MACC_CHAINS): Include TUNE_MIPS4120 and TUNE_MIPS4130. (TARGET_SWITCHES): Add -mvr4130-align and -mno-vr4130-align. * config/mips/mips.md: Include sched-int.h. (USEFUL_INSN_P, SEQ_BEGIN, SEQ_END, FOR_EACH_SUBINSN): New macros. (mips_rtx_costs): Set integer multiplication costs for TUNE_MIPS4130. (override_options): Enable -mvr4130-align at -O3 and above. (mips_sim_insn): New variable. (mips_sim): New structure. (mips_sim_reset, mips_sim_init, mips_sim_next_cycle, mips_sim_wait_reg) (mips_sim_wait_regs_2, mips_sim_wait_regs_1, mips_sim_wait_regs) (mips_sim_wait_units, mips_sim_wait_insn, mips_sim_record_set) (mips_sim_issue_insn, mips_sim_issue_nop, mips_sim_finish_insn) (vr4130_avoid_branch_rt_conflict, vr4130_align_insns): New functions. (mips_reorg): Call vr4130_align_insns. (vr4130_last_insn): New variable. (vr4130_true_reg_dependence_p_1, vr4130_true_reg_dependence_p) (vr4130_swap_insns_p, vr4130_reorder): New functions. (mips_sched_reorder, mips_variable_issue): Hook in vr4130 code. (mips_issue_rate): Return 2 for PROCESSOR_R4130. (mips_use_dfa_pipeline_interface): Return true for the same. * config/mips/4130.md: New file. * config/mips/mips.md: Include it. Add a peephole2 to convert "mult;mflo" into "mtlo;macc". (*macc, *umul_acc_di, *smul_acc_di): Use $1 rather than $0 as the target of maccs. (*msac_using_macc): New pattern. From-SVN: r81567
-rw-r--r--gcc/ChangeLog32
-rw-r--r--gcc/config/mips/4130.md136
-rw-r--r--gcc/config/mips/mips.c531
-rw-r--r--gcc/config/mips/mips.h13
-rw-r--r--gcc/config/mips/mips.md68
-rw-r--r--gcc/doc/invoke.texi15
6 files changed, 789 insertions, 6 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index ebddaf5..3ea323e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,37 @@
2004-05-06 Richard Sandiford <rsandifo@redhat.com>
+ * doc/invoke.texi: Document -mvr4130-align.
+ * config/mips/mips.h (MASK_VR4130_ALIGN, TARGET_VR4130_ALIGN)
+ (TUNE_MIPS4120, TUNE_MIPS4130): New macros.
+ (TUNE_MACC_CHAINS): Include TUNE_MIPS4120 and TUNE_MIPS4130.
+ (TARGET_SWITCHES): Add -mvr4130-align and -mno-vr4130-align.
+ * config/mips/mips.md: Include sched-int.h.
+ (USEFUL_INSN_P, SEQ_BEGIN, SEQ_END, FOR_EACH_SUBINSN): New macros.
+ (mips_rtx_costs): Set integer multiplication costs for TUNE_MIPS4130.
+ (override_options): Enable -mvr4130-align at -O3 and above.
+ (mips_sim_insn): New variable.
+ (mips_sim): New structure.
+ (mips_sim_reset, mips_sim_init, mips_sim_next_cycle, mips_sim_wait_reg)
+ (mips_sim_wait_regs_2, mips_sim_wait_regs_1, mips_sim_wait_regs)
+ (mips_sim_wait_units, mips_sim_wait_insn, mips_sim_record_set)
+ (mips_sim_issue_insn, mips_sim_issue_nop, mips_sim_finish_insn)
+ (vr4130_avoid_branch_rt_conflict, vr4130_align_insns): New functions.
+ (mips_reorg): Call vr4130_align_insns.
+ (vr4130_last_insn): New variable.
+ (vr4130_true_reg_dependence_p_1, vr4130_true_reg_dependence_p)
+ (vr4130_swap_insns_p, vr4130_reorder): New functions.
+ (mips_sched_reorder, mips_variable_issue): Hook in vr4130 code.
+ (mips_issue_rate): Return 2 for PROCESSOR_R4130.
+ (mips_use_dfa_pipeline_interface): Return true for the same.
+ * config/mips/4130.md: New file.
+ * config/mips/mips.md: Include it. Add a peephole2 to convert
+ "mult;mflo" into "mtlo;macc".
+ (*macc, *umul_acc_di, *smul_acc_di): Use $1 rather than $0 as the
+ target of maccs.
+ (*msac_using_macc): New pattern.
+
+2004-05-06 Richard Sandiford <rsandifo@redhat.com>
+
* config/mips/5500.md (ir_vr55_store): Set latency to 0.
(ir_vr55_hilo): Split into...
(ir_vr55_mfhilo, ir_vr55_mthilo): ...these new reservations.
diff --git a/gcc/config/mips/4130.md b/gcc/config/mips/4130.md
new file mode 100644
index 0000000..eddc405
--- /dev/null
+++ b/gcc/config/mips/4130.md
@@ -0,0 +1,136 @@
+;;
+;; Pipeline description for the VR4130 family.
+;;
+;; The processor issues each 8-byte aligned pair of instructions together,
+;; stalling the second instruction if it depends on the first. Thus, if we
+;; want two instructions to issue in parallel, we need to make sure that the
+;; first one is 8-byte aligned.
+;;
+;; For the purposes of this pipeline description, we treat the processor
+;; like a standard two-way superscalar architecture. If scheduling were
+;; the last pass to run, we could use the scheduler hooks to vary the
+;; issue rate depending on whether an instruction is at an aligned or
+;; unaligned address. Unfortunately, delayed branch scheduling and
+;; hazard avoidance are done after the final scheduling pass, and they
+;; can change the addresses of many instructions.
+;;
+;; We get around this in two ways:
+;;
+;; (1) By running an extra pass at the end of compilation. This pass goes
+;; through the function looking for pairs of instructions that could
+;; execute in parallel. It makes sure that the first instruction in
+;; each pair is suitably aligned, inserting nops if necessary. Doing
+;; this gives the same kind of pipeline behavior we would see on a
+;; normal superscalar target.
+;;
+;; This pass is generally a speed improvement, but the extra nops will
+;; obviously make the program bigger. It is therefore unsuitable for
+;; -Os (at the very least).
+;;
+;; (2) By modifying the scheduler hooks so that, where possible:
+;;
+;; (a) dependent instructions are separated by a non-dependent
+;; instruction;
+;;
+;; (b) instructions that use the multiplication unit are separated
+;; by non-multiplication instructions; and
+;;
+;; (c) memory access instructions are separated by non-memory
+;; instructions.
+;;
+;; The idea is to keep conflicting instructions apart wherever possible
+;; and thus make the schedule less dependent on alignment.
+
+(define_automaton "vr4130_main, vr4130_muldiv, vr4130_mulpre")
+
+(define_cpu_unit "vr4130_alu1, vr4130_alu2, vr4130_dcache" "vr4130_main")
+(define_cpu_unit "vr4130_muldiv" "vr4130_muldiv")
+
+;; This is a fake unit for pre-reload scheduling of multiplications.
+;; It enforces the true post-reload repeat rate.
+(define_cpu_unit "vr4130_mulpre" "vr4130_mulpre")
+
+;; The scheduling hooks use this attribute for (b) above.
+(define_attr "vr4130_class" "mul,mem,alu"
+ (cond [(eq_attr "type" "load,store")
+ (const_string "mem")
+
+ (eq_attr "type" "mfhilo,mthilo,imul,imadd,idiv")
+ (const_string "mul")]
+ (const_string "alu")))
+
+(define_insn_reservation "vr4130_multi" 1
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "multi,unknown"))
+ "vr4130_alu1 + vr4130_alu2 + vr4130_dcache + vr4130_muldiv")
+
+(define_insn_reservation "vr4130_int" 1
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "const,arith,shift,slt,nop"))
+ "vr4130_alu1 | vr4130_alu2")
+
+(define_insn_reservation "vr4130_load" 3
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "load"))
+ "vr4130_dcache")
+
+(define_insn_reservation "vr4130_store" 1
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "store"))
+ "vr4130_dcache")
+
+(define_insn_reservation "vr4130_mfhilo" 3
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "mfhilo"))
+ "vr4130_muldiv")
+
+(define_insn_reservation "vr4130_mthilo" 1
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "mthilo"))
+ "vr4130_muldiv")
+
+;; The product is available in LO & HI after one cycle. Moving the result
+;; into an integer register will take an additional three cycles, see mflo
+;; & mfhi above. Note that the same latencies and repeat rates apply if we
+;; use "mtlo; macc" instead of "mult; mflo".
+(define_insn_reservation "vr4130_mulsi" 4
+ (and (eq_attr "cpu" "r4130")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mode" "SI")))
+ "vr4130_muldiv + (vr4130_mulpre * 2)")
+
+;; As for vr4130_mulsi, but the product is available in LO and HI
+;; after 3 cycles.
+(define_insn_reservation "vr4130_muldi" 6
+ (and (eq_attr "cpu" "r4130")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mode" "DI")))
+ "(vr4130_muldiv * 3) + (vr4130_mulpre * 4)")
+
+;; maccs can execute in consecutive cycles without stalling, but it
+;; is 3 cycles before the integer destination can be read.
+(define_insn_reservation "vr4130_macc" 3
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "imadd"))
+ "vr4130_muldiv")
+
+(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_macc" "mips_linked_madd_p")
+(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_mfhilo")
+(define_bypass 3 "vr4130_muldi" "vr4130_mfhilo")
+
+(define_insn_reservation "vr4130_divsi" 36
+ (and (eq_attr "cpu" "r4130")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "SI")))
+ "vr4130_muldiv * 36")
+
+(define_insn_reservation "vr4130_divdi" 72
+ (and (eq_attr "cpu" "r4130")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "DI")))
+ "vr4130_muldiv * 72")
+
+(define_insn_reservation "vr4130_branch" 0
+ (and (eq_attr "cpu" "r4130")
+ (eq_attr "type" "branch,jump,call"))
+ "vr4130_alu1 | vr4130_alu2")
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index ad5a48e..598120f 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -54,6 +54,7 @@ Boston, MA 02111-1307, USA. */
#include "integrate.h"
#include "langhooks.h"
#include "cfglayout.h"
+#include "sched-int.h"
/* Enumeration for all of the relational tests, so that we can build
arrays indexed by the test type, and not worry about the order
@@ -107,6 +108,33 @@ enum internal_test {
multi-instruction addu sequence. Use 0x7fe0 to work around this. */
#define MIPS_MAX_FIRST_STACK_STEP (TARGET_MIPS16 ? 0x100 : 0x7fe0)
+/* True if INSN is a mips.md pattern or asm statement. */
+#define USEFUL_INSN_P(INSN) \
+ (INSN_P (INSN) \
+ && GET_CODE (PATTERN (INSN)) != USE \
+ && GET_CODE (PATTERN (INSN)) != CLOBBER \
+ && GET_CODE (PATTERN (INSN)) != ADDR_VEC \
+ && GET_CODE (PATTERN (INSN)) != ADDR_DIFF_VEC)
+
+/* If INSN is a delayed branch sequence, return the first instruction
+ in the sequence, otherwise return INSN itself. */
+#define SEQ_BEGIN(INSN) \
+ (INSN_P (INSN) && GET_CODE (PATTERN (INSN)) == SEQUENCE \
+ ? XVECEXP (PATTERN (INSN), 0, 0) \
+ : (INSN))
+
+/* Likewise for the last instruction in a delayed branch sequence. */
+#define SEQ_END(INSN) \
+ (INSN_P (INSN) && GET_CODE (PATTERN (INSN)) == SEQUENCE \
+ ? XVECEXP (PATTERN (INSN), 0, XVECLEN (PATTERN (INSN), 0) - 1) \
+ : (INSN))
+
+/* Execute the following loop body with SUBINSN set to each instruction
+ between SEQ_BEGIN (INSN) and SEQ_END (INSN) inclusive. */
+#define FOR_EACH_SUBINSN(SUBINSN, INSN) \
+ for ((SUBINSN) = SEQ_BEGIN (INSN); \
+ (SUBINSN) != NEXT_INSN (SEQ_END (INSN)); \
+ (SUBINSN) = NEXT_INSN (SUBINSN))
/* Classifies an address.
@@ -138,6 +166,7 @@ struct mips16_constant;
struct mips_arg_info;
struct mips_address_info;
struct mips_integer_op;
+struct mips_sim;
static enum mips_symbol_type mips_classify_symbol (rtx);
static void mips_split_const (rtx, rtx *, HOST_WIDE_INT *);
@@ -219,6 +248,21 @@ static void dump_constants (struct mips16_constant *, rtx);
static int mips16_insn_length (rtx);
static int mips16_rewrite_pool_refs (rtx *, void *);
static void mips16_lay_out_constants (void);
+static void mips_sim_reset (struct mips_sim *);
+static void mips_sim_init (struct mips_sim *, state_t);
+static void mips_sim_next_cycle (struct mips_sim *);
+static void mips_sim_wait_reg (struct mips_sim *, rtx, rtx);
+static int mips_sim_wait_regs_2 (rtx *, void *);
+static void mips_sim_wait_regs_1 (rtx *, void *);
+static void mips_sim_wait_regs (struct mips_sim *, rtx);
+static void mips_sim_wait_units (struct mips_sim *, rtx);
+static void mips_sim_wait_insn (struct mips_sim *, rtx);
+static void mips_sim_record_set (rtx, rtx, void *);
+static void mips_sim_issue_insn (struct mips_sim *, rtx);
+static void mips_sim_issue_nop (struct mips_sim *);
+static void mips_sim_finish_insn (struct mips_sim *, rtx);
+static void vr4130_avoid_branch_rt_conflict (rtx);
+static void vr4130_align_insns (void);
static void mips_avoid_hazard (rtx, rtx, int *, rtx *, rtx);
static void mips_avoid_hazards (void);
static void mips_reorg (void);
@@ -230,6 +274,10 @@ static bool mips_return_in_memory (tree, tree);
static bool mips_strict_argument_naming (CUMULATIVE_ARGS *);
static void mips_macc_chains_record (rtx);
static void mips_macc_chains_reorder (rtx *, int);
+static void vr4130_true_reg_dependence_p_1 (rtx, rtx, void *);
+static bool vr4130_true_reg_dependence_p (rtx);
+static bool vr4130_swap_insns_p (rtx, rtx);
+static void vr4130_reorder (rtx *, int);
static void mips_promote_ready (rtx *, int, int);
static int mips_sched_reorder (FILE *, int, rtx *, int *, int);
static int mips_variable_issue (FILE *, int, rtx, int);
@@ -2347,6 +2395,8 @@ mips_rtx_costs (rtx x, int code, int outer_code, int *total)
*total = COSTS_N_INSNS (12);
else if (TUNE_MIPS3900)
*total = COSTS_N_INSNS (2);
+ else if (TUNE_MIPS4130)
+ *total = COSTS_N_INSNS (mode == DImode ? 6 : 4);
else if (TUNE_MIPS5400 || TUNE_SB1)
*total = COSTS_N_INSNS (mode == DImode ? 4 : 3);
else if (TUNE_MIPS5500 || TUNE_MIPS7000)
@@ -4788,6 +4838,12 @@ override_options (void)
if (TARGET_NAME_REGS)
memcpy (mips_reg_names, mips_sw_reg_names, sizeof (mips_reg_names));
+ /* -mvr4130-align is a "speed over size" optimization: it usually produces
+ faster code, but at the expense of more nops. Enable it at -O3 and
+ above. */
+ if (optimize > 2 && (target_flags_explicit & MASK_VR4130_ALIGN) == 0)
+ target_flags |= MASK_VR4130_ALIGN;
+
/* When compiling for the mips16, we can not use floating point. We
record the original hard float value in mips16_hard_float. */
if (TARGET_MIPS16)
@@ -8367,8 +8423,373 @@ mips16_lay_out_constants (void)
}
dump_constants (pool.first, get_last_insn ());
}
+
+/* A temporary variable used by for_each_rtx callbacks, etc. */
+static rtx mips_sim_insn;
+
+/* A structure representing the state of the processor pipeline.
+ Used by the mips_sim_* family of functions. */
+struct mips_sim {
+ /* The maximum number of instructions that can be issued in a cycle.
+ (Caches mips_issue_rate.) */
+ unsigned int issue_rate;
+
+ /* The current simulation time. */
+ unsigned int time;
+
+ /* How many more instructions can be issued in the current cycle. */
+ unsigned int insns_left;
+
+ /* LAST_SET[X].INSN is the last instruction to set register X.
+ LAST_SET[X].TIME is the time at which that instruction was issued.
+ INSN is null if no instruction has yet set register X. */
+ struct {
+ rtx insn;
+ unsigned int time;
+ } last_set[FIRST_PSEUDO_REGISTER];
+
+ /* The pipeline's current DFA state. */
+ state_t dfa_state;
+};
+
+/* Reset STATE to the initial simulation state. */
+
+static void
+mips_sim_reset (struct mips_sim *state)
+{
+ state->time = 0;
+ state->insns_left = state->issue_rate;
+ memset (&state->last_set, 0, sizeof (state->last_set));
+ state_reset (state->dfa_state);
+}
+
+/* Initialize STATE before its first use. DFA_STATE points to an
+ allocated but uninitialized DFA state. */
+
+static void
+mips_sim_init (struct mips_sim *state, state_t dfa_state)
+{
+ state->issue_rate = mips_issue_rate ();
+ state->dfa_state = dfa_state;
+ mips_sim_reset (state);
+}
+
+/* Advance STATE by one clock cycle. */
+
+static void
+mips_sim_next_cycle (struct mips_sim *state)
+{
+ state->time++;
+ state->insns_left = state->issue_rate;
+ state_transition (state->dfa_state, 0);
+}
+
+/* Advance simulation state STATE until instruction INSN can read
+ register REG. */
+
+static void
+mips_sim_wait_reg (struct mips_sim *state, rtx insn, rtx reg)
+{
+ unsigned int i;
+
+ for (i = 0; i < HARD_REGNO_NREGS (REGNO (reg), GET_MODE (reg)); i++)
+ if (state->last_set[REGNO (reg) + i].insn != 0)
+ {
+ unsigned int t;
+
+ t = state->last_set[REGNO (reg) + i].time;
+ t += insn_latency (state->last_set[REGNO (reg) + i].insn, insn);
+ while (state->time < t)
+ mips_sim_next_cycle (state);
+ }
+}
+
+/* A for_each_rtx callback. If *X is a register, advance simulation state
+ DATA until mips_sim_insn can read the register's value. */
+
+static int
+mips_sim_wait_regs_2 (rtx *x, void *data)
+{
+ if (REG_P (*x))
+ mips_sim_wait_reg (data, mips_sim_insn, *x);
+ return 0;
+}
+
+/* Call mips_sim_wait_regs_2 (R, DATA) for each register R mentioned in *X. */
+
+static void
+mips_sim_wait_regs_1 (rtx *x, void *data)
+{
+ for_each_rtx (x, mips_sim_wait_regs_2, data);
+}
+
+/* Advance simulation state STATE until all of INSN's register
+ dependencies are satisfied. */
+
+static void
+mips_sim_wait_regs (struct mips_sim *state, rtx insn)
+{
+ mips_sim_insn = insn;
+ note_uses (&PATTERN (insn), mips_sim_wait_regs_1, state);
+}
+
+/* Advance simulation state STATE until the units required by
+ instruction INSN are available. */
+
+static void
+mips_sim_wait_units (struct mips_sim *state, rtx insn)
+{
+ state_t tmp_state;
+
+ tmp_state = alloca (state_size ());
+ while (state->insns_left == 0
+ || (memcpy (tmp_state, state->dfa_state, state_size ()),
+ state_transition (tmp_state, insn) >= 0))
+ mips_sim_next_cycle (state);
+}
+
+/* Advance simulation state STATE until INSN is ready to issue. */
+
+static void
+mips_sim_wait_insn (struct mips_sim *state, rtx insn)
+{
+ mips_sim_wait_regs (state, insn);
+ mips_sim_wait_units (state, insn);
+}
+
+/* mips_sim_insn has just set X. Update the LAST_SET array
+ in simulation state DATA. */
+
+static void
+mips_sim_record_set (rtx x, rtx pat ATTRIBUTE_UNUSED, void *data)
+{
+ struct mips_sim *state;
+ unsigned int i;
+
+ state = data;
+ if (REG_P (x))
+ for (i = 0; i < HARD_REGNO_NREGS (REGNO (x), GET_MODE (x)); i++)
+ {
+ state->last_set[REGNO (x) + i].insn = mips_sim_insn;
+ state->last_set[REGNO (x) + i].time = state->time;
+ }
+}
+
+/* Issue instruction INSN in scheduler state STATE. Assume that INSN
+ can issue immediately (i.e., that mips_sim_wait_insn has already
+ been called). */
+
+static void
+mips_sim_issue_insn (struct mips_sim *state, rtx insn)
+{
+ state_transition (state->dfa_state, insn);
+ state->insns_left--;
+
+ mips_sim_insn = insn;
+ note_stores (PATTERN (insn), mips_sim_record_set, state);
+}
+
+/* Simulate issuing a NOP in state STATE. */
+
+static void
+mips_sim_issue_nop (struct mips_sim *state)
+{
+ if (state->insns_left == 0)
+ mips_sim_next_cycle (state);
+ state->insns_left--;
+}
+
+/* Update simulation state STATE so that it's ready to accept the instruction
+ after INSN. INSN should be part of the main rtl chain, not a member of a
+ SEQUENCE. */
+
+static void
+mips_sim_finish_insn (struct mips_sim *state, rtx insn)
+{
+ /* If INSN is a jump with an implicit delay slot, simulate a nop. */
+ if (JUMP_P (insn))
+ mips_sim_issue_nop (state);
+
+ switch (GET_CODE (SEQ_BEGIN (insn)))
+ {
+ case CODE_LABEL:
+ case CALL_INSN:
+ /* We can't predict the processor state after a call or label. */
+ mips_sim_reset (state);
+ break;
+
+ case JUMP_INSN:
+ /* The delay slots of branch likely instructions are only executed
+ when the branch is taken. Therefore, if the caller has simulated
+ the delay slot instruction, STATE does not really reflect the state
+ of the pipeline for the instruction after the delay slot. Also,
+ branch likely instructions tend to incur a penalty when not taken,
+ so there will probably be an extra delay between the branch and
+ the instruction after the delay slot. */
+ if (INSN_ANNULLED_BRANCH_P (SEQ_BEGIN (insn)))
+ mips_sim_reset (state);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/* The VR4130 pipeline issues aligned pairs of instructions together,
+ but it stalls the second instruction if it depends on the first.
+ In order to cut down the amount of logic required, this dependence
+ check is not based on a full instruction decode. Instead, any non-SPECIAL
+ instruction is assumed to modify the register specified by bits 20-16
+ (which is usually the "rt" field).
+
+ In beq, beql, bne and bnel instructions, the rt field is actually an
+ input, so we can end up with a false dependence between the branch
+ and its delay slot. If this situation occurs in instruction INSN,
+ try to avoid it by swapping rs and rt. */
+
+static void
+vr4130_avoid_branch_rt_conflict (rtx insn)
+{
+ rtx first, second;
+
+ first = SEQ_BEGIN (insn);
+ second = SEQ_END (insn);
+ if (GET_CODE (first) == JUMP_INSN
+ && GET_CODE (second) == INSN
+ && GET_CODE (PATTERN (first)) == SET
+ && GET_CODE (SET_DEST (PATTERN (first))) == PC
+ && GET_CODE (SET_SRC (PATTERN (first))) == IF_THEN_ELSE)
+ {
+ /* Check for the right kind of condition. */
+ rtx cond = XEXP (SET_SRC (PATTERN (first)), 0);
+ if ((GET_CODE (cond) == EQ || GET_CODE (cond) == NE)
+ && REG_P (XEXP (cond, 0))
+ && REG_P (XEXP (cond, 1))
+ && reg_referenced_p (XEXP (cond, 1), PATTERN (second))
+ && !reg_referenced_p (XEXP (cond, 0), PATTERN (second)))
+ {
+ /* SECOND mentions the rt register but not the rs register. */
+ rtx tmp = XEXP (cond, 0);
+ XEXP (cond, 0) = XEXP (cond, 1);
+ XEXP (cond, 1) = tmp;
+ }
+ }
+}
+
+/* Implement -mvr4130-align. Go through each basic block and simulate the
+ processor pipeline. If we find that a pair of instructions could execute
+ in parallel, and the first of those instruction is not 8-byte aligned,
+ insert a nop to make it aligned. */
+static void
+vr4130_align_insns (void)
+{
+ struct mips_sim state;
+ rtx insn, subinsn, last, last2, next;
+ bool aligned_p;
+
+ dfa_start ();
+
+ /* LAST is the last instruction before INSN to have a nonzero length.
+ LAST2 is the last such instruction before LAST. */
+ last = 0;
+ last2 = 0;
+
+ /* ALIGNED_P is true if INSN is known to be at an aligned address. */
+ aligned_p = true;
+
+ mips_sim_init (&state, alloca (state_size ()));
+ for (insn = get_insns (); insn != 0; insn = next)
+ {
+ unsigned int length;
+
+ next = NEXT_INSN (insn);
+
+ /* See the comment above vr4130_avoid_branch_rt_conflict for details.
+ This isn't really related to the alignment pass, but we do it on
+ the fly to avoid a separate instruction walk. */
+ vr4130_avoid_branch_rt_conflict (insn);
+
+ if (USEFUL_INSN_P (insn))
+ FOR_EACH_SUBINSN (subinsn, insn)
+ {
+ mips_sim_wait_insn (&state, subinsn);
+
+ /* If we want this instruction to issue in parallel with the
+ previous one, make sure that the previous instruction is
+ aligned. There are several reasons why this isn't worthwhile
+ when the second instruction is a call:
+
+ - Calls are less likely to be performance critical,
+ - There's a good chance that the delay slot can execute
+ in parallel with the call.
+ - The return address would then be unaligned.
+
+ In general, if we're going to insert a nop between instructions
+ X and Y, it's better to insert it immediately after X. That
+ way, if the nop makes Y aligned, it will also align any labels
+ between X and Y. */
+ if (state.insns_left != state.issue_rate
+ && GET_CODE (subinsn) != CALL_INSN)
+ {
+ if (subinsn == SEQ_BEGIN (insn) && aligned_p)
+ {
+ /* SUBINSN is the first instruction in INSN and INSN is
+ aligned. We want to align the previous instruction
+ instead, so insert a nop between LAST2 and LAST.
+
+ Note that LAST could be either a single instruction
+ or a branch with a delay slot. In the latter case,
+ LAST, like INSN, is already aligned, but the delay
+ slot must have some extra delay that stops it from
+ issuing at the same time as the branch. We therefore
+ insert a nop before the branch in order to align its
+ delay slot. */
+ emit_insn_after (gen_nop (), last2);
+ aligned_p = false;
+ }
+ else if (subinsn != SEQ_BEGIN (insn) && !aligned_p)
+ {
+ /* SUBINSN is the delay slot of INSN, but INSN is
+ currently unaligned. Insert a nop between
+ LAST and INSN to align it. */
+ emit_insn_after (gen_nop (), last);
+ aligned_p = true;
+ }
+ }
+ mips_sim_issue_insn (&state, subinsn);
+ }
+ mips_sim_finish_insn (&state, insn);
+
+ /* Update LAST, LAST2 and ALIGNED_P for the next instruction. */
+ length = get_attr_length (insn);
+ if (length > 0)
+ {
+ /* If the instruction is an asm statement or multi-instruction
+ mips.md patern, the length is only an estimate. Insert an
+ 8 byte alignment after it so that the following instructions
+ can be handled correctly. */
+ if (GET_CODE (SEQ_BEGIN (insn)) == INSN
+ && (recog_memoized (insn) < 0 || length >= 8))
+ {
+ next = emit_insn_after (gen_align (GEN_INT (3)), insn);
+ next = NEXT_INSN (next);
+ mips_sim_next_cycle (&state);
+ aligned_p = true;
+ }
+ else if (length & 4)
+ aligned_p = !aligned_p;
+ last2 = last;
+ last = insn;
+ }
+ /* See whether INSN is an aligned label. */
+ if (LABEL_P (insn) && label_to_alignment (insn) >= 3)
+ aligned_p = true;
+ }
+ dfa_finish ();
+}
+
/* Subroutine of mips_reorg. If there is a hazard between INSN
and a previous instruction, avoid it by inserting nops after
instruction AFTER.
@@ -8499,6 +8920,8 @@ mips_reorg (void)
if (mips_flag_delayed_branch)
dbr_schedule (get_insns (), dump_file);
mips_avoid_hazards ();
+ if (TUNE_MIPS4130 && TARGET_VR4130_ALIGN)
+ vr4130_align_insns ();
}
}
@@ -9266,6 +9689,104 @@ mips_macc_chains_reorder (rtx *ready, int nready)
}
}
+/* The last instruction to be scheduled. */
+
+static rtx vr4130_last_insn;
+
+/* A note_stores callback used by vr4130_true_reg_dependence_p. DATA
+ points to an rtx that is initially an instruction. Nullify the rtx
+ if the instruction uses the value of register X. */
+
+static void
+vr4130_true_reg_dependence_p_1 (rtx x, rtx pat ATTRIBUTE_UNUSED, void *data)
+{
+ rtx *insn_ptr = data;
+ if (REG_P (x)
+ && *insn_ptr != 0
+ && reg_referenced_p (x, PATTERN (*insn_ptr)))
+ *insn_ptr = 0;
+}
+
+/* Return true if there is true register dependence between vr4130_last_insn
+ and INSN. */
+
+static bool
+vr4130_true_reg_dependence_p (rtx insn)
+{
+ note_stores (PATTERN (vr4130_last_insn),
+ vr4130_true_reg_dependence_p_1, &insn);
+ return insn == 0;
+}
+
+/* A TUNE_MIPS4130 helper function. Given that INSN1 is at the head of
+ the ready queue and that INSN2 is the instruction after it, return
+ true if it is worth promoting INSN2 ahead of INSN1. Look for cases
+ in which INSN1 and INSN2 can probably issue in parallel, but for
+ which (INSN2, INSN1) should be less sensitive to instruction
+ alignment than (INSN1, INSN2). See 4130.md for more details. */
+
+static bool
+vr4130_swap_insns_p (rtx insn1, rtx insn2)
+{
+ rtx dep;
+
+ /* Check for the following case:
+
+ 1) there is some other instruction X with an anti dependence on INSN1;
+ 2) X has a higher priority than INSN2; and
+ 3) X is an arithmetic instruction (and thus has no unit restrictions).
+
+ If INSN1 is the last instruction blocking X, it would better to
+ choose (INSN1, X) over (INSN2, INSN1). */
+ for (dep = INSN_DEPEND (insn1); dep != 0; dep = XEXP (dep, 1))
+ if (REG_NOTE_KIND (dep) == REG_DEP_ANTI
+ && INSN_PRIORITY (XEXP (dep, 0)) > INSN_PRIORITY (insn2)
+ && recog_memoized (XEXP (dep, 0)) >= 0
+ && get_attr_vr4130_class (XEXP (dep, 0)) == VR4130_CLASS_ALU)
+ return false;
+
+ if (vr4130_last_insn != 0
+ && recog_memoized (insn1) >= 0
+ && recog_memoized (insn2) >= 0)
+ {
+ /* See whether INSN1 and INSN2 use different execution units,
+ or if they are both ALU-type instructions. If so, they can
+ probably execute in parallel. */
+ enum attr_vr4130_class class1 = get_attr_vr4130_class (insn1);
+ enum attr_vr4130_class class2 = get_attr_vr4130_class (insn2);
+ if (class1 != class2 || class1 == VR4130_CLASS_ALU)
+ {
+ /* If only one of the instructions has a dependence on
+ vr4130_last_insn, prefer to schedule the other one first. */
+ bool dep1 = vr4130_true_reg_dependence_p (insn1);
+ bool dep2 = vr4130_true_reg_dependence_p (insn2);
+ if (dep1 != dep2)
+ return dep1;
+
+ /* Prefer to schedule INSN2 ahead of INSN1 if vr4130_last_insn
+ is not an ALU-type instruction and if INSN1 uses the same
+ execution unit. (Note that if this condition holds, we already
+ know that INSN2 uses a different execution unit.) */
+ if (class1 != VR4130_CLASS_ALU
+ && recog_memoized (vr4130_last_insn) >= 0
+ && class1 == get_attr_vr4130_class (vr4130_last_insn))
+ return true;
+ }
+ }
+ return false;
+}
+
+/* A TUNE_MIPS4130 helper function. (READY, NREADY) describes a ready
+ queue with at least two instructions. Swap the first two if
+ vr4130_swap_insns_p says that it could be worthwhile. */
+
+static void
+vr4130_reorder (rtx *ready, int nready)
+{
+ if (vr4130_swap_insns_p (ready[nready - 1], ready[nready - 2]))
+ mips_promote_ready (ready, nready - 2, nready - 1);
+}
+
/* Remove the instruction at index LOWER from ready queue READY and
reinsert it in front of the instruction at index HIGHER. LOWER must
be <= HIGHER. */
@@ -9295,6 +9816,13 @@ mips_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
if (*nreadyp > 0)
mips_macc_chains_reorder (ready, *nreadyp);
}
+ if (reload_completed && TUNE_MIPS4130 && !TARGET_VR4130_ALIGN)
+ {
+ if (cycle == 0)
+ vr4130_last_insn = 0;
+ if (*nreadyp > 1)
+ vr4130_reorder (ready, *nreadyp);
+ }
return mips_issue_rate ();
}
@@ -9315,6 +9843,7 @@ mips_variable_issue (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
more--;
if (!reload_completed && TUNE_MACC_CHAINS)
mips_macc_chains_record (insn);
+ vr4130_last_insn = insn;
break;
}
return more;
@@ -9339,6 +9868,7 @@ mips_issue_rate (void)
{
switch (mips_tune)
{
+ case PROCESSOR_R4130:
case PROCESSOR_R5400:
case PROCESSOR_R5500:
case PROCESSOR_R7000:
@@ -9368,6 +9898,7 @@ mips_use_dfa_pipeline_interface (void)
{
switch (mips_tune)
{
+ case PROCESSOR_R4130:
case PROCESSOR_R5400:
case PROCESSOR_R5500:
case PROCESSOR_R7000:
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index 37111ca..e23abf1 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -171,7 +171,7 @@ extern const struct mips_cpu_info *mips_tune_info;
#define MASK_FIX_R4400 0x01000000 /* Work around R4400 errata. */
#define MASK_FIX_SB1 0x02000000 /* Work around SB-1 errata. */
#define MASK_FIX_VR4120 0x04000000 /* Work around VR4120 errata. */
-
+#define MASK_VR4130_ALIGN 0x08000000 /* Perform VR4130 alignment opts. */
#define MASK_FP_EXCEPTIONS 0x10000000 /* FP exceptions are enabled. */
/* Debug switches, not documented */
@@ -253,6 +253,7 @@ extern const struct mips_cpu_info *mips_tune_info;
/* Work around R4400 errata. */
#define TARGET_FIX_R4400 (target_flags & MASK_FIX_R4400)
#define TARGET_FIX_VR4120 (target_flags & MASK_FIX_VR4120)
+#define TARGET_VR4130_ALIGN (target_flags & MASK_VR4130_ALIGN)
#define TARGET_FP_EXCEPTIONS (target_flags & MASK_FP_EXCEPTIONS)
@@ -332,6 +333,8 @@ extern const struct mips_cpu_info *mips_tune_info;
#define TUNE_MIPS3000 (mips_tune == PROCESSOR_R3000)
#define TUNE_MIPS3900 (mips_tune == PROCESSOR_R3900)
#define TUNE_MIPS4000 (mips_tune == PROCESSOR_R4000)
+#define TUNE_MIPS4120 (mips_tune == PROCESSOR_R4120)
+#define TUNE_MIPS4130 (mips_tune == PROCESSOR_R4130)
#define TUNE_MIPS5000 (mips_tune == PROCESSOR_R5000)
#define TUNE_MIPS5400 (mips_tune == PROCESSOR_R5400)
#define TUNE_MIPS5500 (mips_tune == PROCESSOR_R5500)
@@ -371,7 +374,9 @@ extern const struct mips_cpu_info *mips_tune_info;
Multiply-accumulate instructions are a bigger win for some targets
than others, so this macro is defined on an opt-in basis. */
-#define TUNE_MACC_CHAINS TUNE_MIPS5500
+#define TUNE_MACC_CHAINS (TUNE_MIPS5500 \
+ || TUNE_MIPS4120 \
+ || TUNE_MIPS4130)
#define TARGET_OLDABI (mips_abi == ABI_32 || mips_abi == ABI_O64)
#define TARGET_NEWABI (mips_abi == ABI_N32 || mips_abi == ABI_64)
@@ -619,6 +624,10 @@ extern const struct mips_cpu_info *mips_tune_info;
N_("Don't generate fused multiply/add instructions")}, \
{"fused-madd", -MASK_NO_FUSED_MADD, \
N_("Generate fused multiply/add instructions")}, \
+ {"vr4130-align", MASK_VR4130_ALIGN, \
+ N_("Perform VR4130-specific alignment optimizations")}, \
+ {"no-vr4130-align", -MASK_VR4130_ALIGN, \
+ N_("Don't perform VR4130-specific alignment optimizations")}, \
{"fix4300", MASK_4300_MUL_FIX, \
N_("Work around early 4300 hardware bug")}, \
{"no-fix4300", -MASK_4300_MUL_FIX, \
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index cdb5005..e968246 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -631,6 +631,7 @@
;; Include scheduling descriptions.
+(include "4130.md")
(include "5400.md")
(include "5500.md")
(include "7000.md")
@@ -1584,6 +1585,37 @@
(set_attr "mode" "SI")
(set_attr "length" "8")])
+;; On the VR4120 and VR4130, it is better to use "mtlo $0; macc" instead
+;; of "mult; mflo". They have the same latency, but the first form gives
+;; us an extra cycle to compute the operands.
+
+;; Operand 0: LO
+;; Operand 1: GPR (1st multiplication operand)
+;; Operand 2: GPR (2nd multiplication operand)
+;; Operand 3: HI
+;; Operand 4: GPR (destination)
+(define_peephole2
+ [(parallel
+ [(set (match_operand:SI 0 "register_operand" "")
+ (mult:SI (match_operand:SI 1 "register_operand" "")
+ (match_operand:SI 2 "register_operand" "")))
+ (clobber (match_operand:SI 3 "register_operand" ""))])
+ (set (match_operand:SI 4 "register_operand" "")
+ (unspec:SI [(match_dup 0) (match_dup 3)] UNSPEC_MFHILO))]
+ "ISA_HAS_MACC && !GENERATE_MULT3_SI"
+ [(set (match_dup 0)
+ (const_int 0))
+ (parallel
+ [(set (match_dup 0)
+ (plus:SI (mult:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 0)))
+ (set (match_dup 4)
+ (plus:SI (mult:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 0)))
+ (clobber (match_dup 3))])])
+
;; Multiply-accumulate patterns
;; For processors that can copy the output to a general register:
@@ -1673,7 +1705,10 @@
else if (TARGET_MIPS5500)
return "madd\t%1,%2";
else
- return "macc\t%.,%1,%2";
+ /* The VR4130 assumes that there is a two-cycle latency between a macc
+ that "writes" to $0 and an instruction that reads from it. We avoid
+ this by assigning to $1 instead. */
+ return "%[macc\t%@,%1,%2%]";
}
[(set_attr "type" "imadd")
(set_attr "mode" "SI")])
@@ -1697,6 +1732,31 @@
[(set_attr "type" "imadd")
(set_attr "mode" "SI")])
+;; An msac-like instruction implemented using negation and a macc.
+(define_insn_and_split "*msac_using_macc"
+ [(set (match_operand:SI 0 "register_operand" "=l,d")
+ (minus:SI (match_operand:SI 1 "register_operand" "0,l")
+ (mult:SI (match_operand:SI 2 "register_operand" "d,d")
+ (match_operand:SI 3 "register_operand" "d,d"))))
+ (clobber (match_scratch:SI 4 "=h,h"))
+ (clobber (match_scratch:SI 5 "=X,1"))
+ (clobber (match_scratch:SI 6 "=d,d"))]
+ "ISA_HAS_MACC && !ISA_HAS_MSAC"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 6)
+ (neg:SI (match_dup 3)))
+ (parallel
+ [(set (match_dup 0)
+ (plus:SI (mult:SI (match_dup 2)
+ (match_dup 6))
+ (match_dup 1)))
+ (clobber (match_dup 4))
+ (clobber (match_dup 5))])]
+ ""
+ [(set_attr "type" "imadd")
+ (set_attr "length" "8")])
+
;; Patterns generated by the define_peephole2 below.
(define_insn "*macc2"
@@ -2367,7 +2427,8 @@
else if (TARGET_MIPS5500)
return "maddu\t%1,%2";
else
- return "maccu\t%.,%1,%2";
+ /* See comment in *macc. */
+ return "%[maccu\t%@,%1,%2%]";
}
[(set_attr "type" "imadd")
(set_attr "mode" "SI")])
@@ -2387,7 +2448,8 @@
else if (TARGET_MIPS5500)
return "madd\t%1,%2";
else
- return "macc\t%.,%1,%2";
+ /* See comment in *macc. */
+ return "%[macc\t%@,%1,%2%]";
}
[(set_attr "type" "imadd")
(set_attr "mode" "SI")])
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7bafb8c..ea27ed0 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -483,7 +483,8 @@ in the following sections.
-mfix-vr4120 -mno-fix-vr4120 -mfix-sb1 -mno-fix-sb1 @gol
-mflush-func=@var{func} -mno-flush-func @gol
-mbranch-likely -mno-branch-likely @gol
--mfp-exceptions -mno-fp-exceptions}
+-mfp-exceptions -mno-fp-exceptions @gol
+-mvr4130-align -mno-vr4130-align}
@emph{i386 and x86-64 Options}
@gccoptlist{-mtune=@var{cpu-type} -march=@var{cpu-type} @gol
@@ -8245,6 +8246,18 @@ enabled.
For instance, on the SB-1, if FP exceptions are disabled, and we are emitting
64-bit code, then we can use both FP pipes. Otherwise, we can only use one
FP pipe.
+
+@item -mvr4130-align
+@itemx -mno-vr4130-align
+@opindex mvr4130-align
+The VR4130 pipeline is two-way superscalar, but can only issue two
+instructions together if the first one is 8-byte aligned. When this
+option is enabled, GCC will align pairs of instructions that it
+thinks should execute in parallel.
+
+This option only has an effect when optimizing for the VR4130.
+It normally makes code faster, but at the expense of making it bigger.
+It is enabled by default at optimization level @option{-O3}.
@end table
@node i386 and x86-64 Options