3 files changed, 543 insertions, 408 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 721a6ec..bb1ab08 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,56 @@
+Tue Aug 27 14:39:09 2002  J"orn Rennecke <joern.rennecke@superh.com>
+
+	* sh.md (attribute type): Add types mt_group, fload, pcfload, fpul_gp,
+	mac_gp ftrc_s and cwb.  Add / Adjust definitions in individual insn
+	accordingly.
+	(attribute insn_class): Provide default definitions based on type.
+	Remove all insn-specific settings.
+	(various function units): Remove old SH4 scheduling.
+	(branch_zero, dfp_comp, late_fp_use, any_fp_comp, any_int_load):
+	New attributes.  Set them where appropriate.
+	(cpu unit FS): Don't define / use.
+	(F3, load_store): New cpu units.
+	(F01): New reservation.
+	(all insn_reservations): Make dependent on sh4 pipeline model.
+	Fix latencies.
+	(nil, reg_mov, freg_mov, sh4_fpul_gp, sh4_call): New insn_reservations.
+	(sh4_mac_gp, fp_arith_ftrc, arith3, arith3b): Likewise.
+	(mt insn_reservation): Use type mt_group.
+	(insn_reservation load_store): Split into sh4_load, sh4_load_si,
+	sh4_fload and sh4_store.
+	(insn_reservation branch_zero and branch): Replace with sh4_branch.
+	(insn_reservation branch_far): Replace with sh4_return.
+	(insn_reservation return_from_exp): Rename to:
+	(sh4_return_from_exp).  Change to be just d_lock*5.
+	(insn_reservation lds_to_pr): Rename to:
+	(sh4_lds_to_pr).  Change to be just d_lock*2.
+	(insn_reservation ldsmem_to_pr, sts_from_pr): Change to be just
+	d_lock*2.
+	(insn_reservation prload_mem): Rename to:
+	(sh4_prstore_mem).  Change to d_lock*2,nothing,memory.
+	(insn_reservation fpscr_store): Rename to:
+	(fpscr_load).  Change to d_lock,nothing,F1*3.
+	(insn_reservation fpscr_store_mem): Rename to:
+	(fpscr_load_mem).  Change to d_lock,nothing,(F1+memory),F1*2.
+	(insn_reservation multi): Change to
+	d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2.
+	(insn_reservation fp_arith): Change to issue,F01,F2.
+	(insn_reservation fp_div: Change to issue,F01+F3,F2+F3,F3*7,F1+F3,F2.
+	(insn_reservation dp_float): Change to issue,F01,F1+F2,F2.
+	(insn_reservation fp_double_arith): Change to issue,F01,F1+F2,fpu*4,F2.
+	(insn_reservation fp_double_cmp): Change to
+	d_lock,(d_lock+F01),F1+F2,F2.
+	(insn_reservation dp_div): Change to
+	issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2.
+	* sh.c (flow_dependent_p, flow_dependent_p_1): New functions.
+	(sh_adjust_cost, SHcompact): Differentiate between different
+	kinds of dependencies.  Drop factor of ten for superscalar.
+	Use new instruction types.  Add new exception rules.
+
+	* sh.md (mulhisi3, umulhisi3: Add a REG_EQUAL note.
+
+	* sh.md (mperm_w): Add DONE.
+
 2002-08-27  David Edelsohn  <edelsohn@gnu.org>
 
 	* longlong.h: Import current PowerPC defintion from GMP-4.1.
diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 3512b97..228e0fa 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -208,6 +208,8 @@ static const char *sh_strip_name_encoding PARAMS ((const char *));
 static void sh_init_builtins PARAMS ((void));
 static void sh_media_init_builtins PARAMS ((void));
 static rtx sh_expand_builtin PARAMS ((tree, rtx, rtx, enum machine_mode, int));
+static int flow_dependent_p PARAMS ((rtx, rtx));
+static void flow_dependent_p_1 PARAMS ((rtx, rtx, void *));
 
 
 /* Initialize the GCC target structure.  */
@@ -6994,7 +6996,7 @@ sh_adjust_cost (insn, link, dep_insn, cost)
      rtx dep_insn;
      int cost;
 {
-  rtx reg;
+  rtx reg, use_pat;
 
   if (TARGET_SHMEDIA)
     {
@@ -7007,49 +7009,119 @@ sh_adjust_cost (insn, link, dep_insn, cost)
           && get_attr_is_mac_media (dep_insn))
         cost = 1;
     }
-  else if (GET_CODE(insn) == CALL_INSN)
+  else if (REG_NOTE_KIND (link) == 0)
     {
+      enum attr_type dep_type, type;
+
+      if (recog_memoized (insn) < 0
+	  || recog_memoized (dep_insn) < 0)
+	return;
+
+      dep_type = get_attr_type (dep_insn);
+      if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD)
+	cost--;
+      if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI)
+	  && (type = get_attr_type (insn)) != TYPE_CALL
+	  && type != TYPE_SFUNC)
+	cost--;
+
       /* The only input for a call that is timing-critical is the
 	 function's address.  */
-      rtx call = PATTERN (insn);
-
-      if (GET_CODE (call) == PARALLEL)
-	call = XVECEXP (call, 0 ,0);
-      if (GET_CODE (call) == SET)
-	call = SET_SRC (call);
-      if (GET_CODE (call) == CALL && GET_CODE (XEXP (call, 0)) == MEM
-	  && ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn))
-	cost = 0;
-    }
-  /* All sfunc calls are parallels with at least four components.
-     Exploit this to avoid unnecessary calls to sfunc_uses_reg.  */
-  else if (GET_CODE (PATTERN (insn)) == PARALLEL
-	   && XVECLEN (PATTERN (insn), 0) >= 4
-	   && (reg = sfunc_uses_reg (insn)))
-    {
+      if (GET_CODE(insn) == CALL_INSN)
+	{
+	  rtx call = PATTERN (insn);
+
+	  if (GET_CODE (call) == PARALLEL)
+	    call = XVECEXP (call, 0 ,0);
+	  if (GET_CODE (call) == SET)
+	    call = SET_SRC (call);
+	  if (GET_CODE (call) == CALL && GET_CODE (XEXP (call, 0)) == MEM
+	      && ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn))
+	    cost = 0;
+	}
       /* Likewise, the most timing critical input for an sfuncs call
 	 is the function address.  However, sfuncs typically start
 	 using their arguments pretty quickly.
 	 Assume a four cycle delay before they are needed.  */
-      if (! reg_set_p (reg, dep_insn))
-	cost -= TARGET_SUPERSCALAR ? 40 : 4;
-    }
-  /* Adjust load_si / pcload_si type insns latency.  Use the known
-     nominal latency and form of the insn to speed up the check.  */
-  else if (cost == 3
-	   && GET_CODE (PATTERN (dep_insn)) == SET
-	   /* Latency for dmpy type insns is also 3, so check the that
-	      it's actually a move insn.  */
-	   && general_movsrc_operand (SET_SRC (PATTERN (dep_insn)), SImode))
+      /* All sfunc calls are parallels with at least four components.
+	 Exploit this to avoid unnecessary calls to sfunc_uses_reg.  */
+      else if (GET_CODE (PATTERN (insn)) == PARALLEL
+	       && XVECLEN (PATTERN (insn), 0) >= 4
+	       && (reg = sfunc_uses_reg (insn)))
+	{
+	  if (! reg_set_p (reg, dep_insn))
+	    cost -= 4;
+	}
+      /* When the preceding instruction loads the shift amount of
+	 the following SHAD/SHLD, the latency of the load is increased
+	 by 1 cycle.  */
+      else if (TARGET_SH4
+	       && get_attr_type (insn) == TYPE_DYN_SHIFT
+	       && get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES
+	       && reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)),
+					   XEXP (SET_SRC (single_set(insn)),
+						 1)))
+	cost++;
+      /* When an LS group instruction with a latency of less than
+	 3 cycles is followed by a double-precision floating-point
+	 instruction, FIPR, or FTRV, the latency of the first
+	 instruction is increased to 3 cycles.  */
+      else if (cost < 3
+	       && get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP
+	       && get_attr_dfp_comp (insn) == DFP_COMP_YES)
+	cost = 3;
+      /* The lsw register of a double-precision computation is ready one
+	 cycle earlier.  */
+      else if (reload_completed
+	       && get_attr_dfp_comp (dep_insn) == DFP_COMP_YES
+	       && (use_pat = single_set (insn))
+	       && ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))),
+				  SET_SRC (use_pat)))
+	cost -= 1;
+
+      if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES
+	  && get_attr_late_fp_use (insn) == LATE_FP_USE_YES)
+	cost -= 1;
+    }
+  /* An anti-dependence penalty of two applies if the first insn is a double
+     precision fadd / fsub / fmul.  */
+  else if (REG_NOTE_KIND (link) == REG_DEP_ANTI
+	   && recog_memoized (dep_insn) >= 0
+	   && get_attr_type (dep_insn) == TYPE_DFP_ARITH
+	   /* A lot of alleged anti-flow dependences are fake,
+	      so check this one is real.  */
+	   && flow_dependent_p (dep_insn, insn))
     cost = 2;
-  else if (cost == 30
-	   && GET_CODE (PATTERN (dep_insn)) == SET
-	   && GET_MODE (SET_SRC (PATTERN (dep_insn))) == SImode)
-    cost = 20;
+
 
   return cost;
 }
 
+/* Check if INSN is flow-dependent on DEP_INSN.  Can also be used to check
+   if DEP_INSN is anti-flow dependent on INSN.  */
+static int
+flow_dependent_p (insn, dep_insn)
+     rtx insn, dep_insn;
+{
+  rtx tmp = PATTERN (insn);
+
+  note_stores (PATTERN (dep_insn), flow_dependent_p_1, &tmp);
+  return tmp == NULL_RTX;
+}
+
+/* A helper function for flow_dependent_p called through note_stores.  */
+static void
+flow_dependent_p_1 (x, pat, data)
+     rtx x;
+     rtx pat ATTRIBUTE_UNUSED;
+     void *data;
+{
+  rtx * pinsn = (rtx *) data;
+
+  if (*pinsn && reg_referenced_p (x, *pinsn))
+    *pinsn = NULL_RTX;
+}
+
 /* For use by ALLOCATE_INITIAL_VALUE.  Note that sh.md contains some
    'special function' patterns (type sfunc) that clobber pr, but that
    do not look like function calls to leaf_function_p.  Hence we must
@@ -7060,27 +7132,26 @@ sh_pr_n_sets ()
   return REG_N_SETS (TARGET_SHMEDIA ? PR_MEDIA_REG : PR_REG);
 }
 
-/* This Function Returns non zero if DFA based scheduler
-   interface is to be used.At present supported only for
-   SH4.  */
+/* This Function returns non zero if the DFA based scheduler interface
+   is to be used.  At present this is supported for the SH4 only.  */
 static int
 sh_use_dfa_interface()
 {
-        if (TARGET_SH4)
-                return 1;
-        else
-                return 0;
+  if (TARGET_HARD_SH4)
+    return 1;
+  else
+    return 0;
 }
 
-/* This function returns "2" that signifies dual issue 
-   for SH4 processor.To be used by DFA pipeline description.  */
+/* This function returns "2" to indicate dual issue for the SH4
+   processor.  To be used by the DFA pipeline description.  */
 static int
 sh_issue_rate()
 {
-	if(TARGET_SH4)
-		return 2;
-	else
-		return 1;
+  if (TARGET_SUPERSCALAR)
+    return 2;
+  else
+    return 1;
 }
 
 /* SHmedia requires registers for branches, so we can't generate new
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index 7016e74..88b2dc8 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -183,8 +183,10 @@
 ;; arith3b	like above, but might end with a redirected branch
 ;; load		from memory
 ;; load_si	Likewise, SImode variant for general register.
+;; fload	Likewise, but load to fp register.
 ;; store	to memory
-;; move		register to register
+;; move		general purpose register to register
+;; mt_group	other sh4 mt instructions
 ;; fmove	register to register, floating point
 ;; smpy		word precision integer multiply
 ;; dmpy		longword or doublelongword precision integer multiply
@@ -194,15 +196,20 @@
 ;; pstore	store of pr reg, which can't be put into delay slot of jsr
 ;; prget	copy pr to register, ditto
 ;; pcload	pc relative load of constant value
+;; pcfload	Likewise, but load to fp register.
 ;; pcload_si	Likewise, SImode variant for general register.
 ;; rte		return from exception
 ;; sfunc	special function call with known used registers
 ;; call		function call
 ;; fp		floating point
 ;; fdiv		floating point divide (or square root)
-;; gp_fpul	move between general purpose register and fpul
+;; gp_fpul	move from general purpose register to fpul
+;; fpul_gp	move from fpul to general purpose register
+;; mac_gp	move from mac[lh] to general purpose register
 ;; dfp_arith, dfp_cmp,dfp_conv
+;; ftrc_s	fix_truncsfsi2_i4
 ;; dfdiv	double precision floating point divide (or square root)
+;; cwb		ic_invalidate_line_i
 ;; arith_media	SHmedia arithmetic, logical, and shift instructions
 ;; cbranch_media SHmedia conditional branch instructions
 ;; cmp_media	SHmedia compare instructions
@@ -233,30 +240,32 @@
 ;; nil		no-op move, will be deleted.
 
 (define_attr "type"
- "cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,rte,sfunc,call,fp,fdiv,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
+ "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
   (const_string "other"))
 
 ;; We define a new attribute namely "insn_class".We use
-;; this for DFA based pipeline description.
-;; Although the "type" attribute covers almost all insn 
-;; classes,it is more convenient to define new attribute
-;; for certain reservations.
+;; this for the DFA based pipeline description.
 ;;
 ;; mt_group      SH4 "mt" group instructions.
 ;;
-;; ex_group      SH4 "ex" group instructions.They mostly
-;;               overlap with arithmetic instructions but
-;;               new attribute defined to distinguish from
-;;	         mt group instructions.
+;; ex_group      SH4 "ex" group instructions.
+;;
+;; ls_group      SH4 "ls" group instructions.
 ;;
-;; lds_to_fpscr  The "type" attribute couldn't sufficiently
-;;               distinguish it from others.It is part of 
-;;               new attribute.Similar case with ldsmem_to_fpscr
-;;	  	 and cwb. 
 
 (define_attr "insn_class"
-	     "mt_group,ex_group,lds_to_fpscr,ldsmem_to_fpscr,cwb,none"
-	     (const_string "none"))
+  "mt_group,ex_group,ls_group,br_group,fe_group,co_group,none"
+  (cond [(eq_attr "type" "move,mt_group") (const_string "mt_group")
+         (eq_attr "type" "arith,dyn_shift") (const_string "ex_group")
+	 (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,gp_fpul,fpul_gp") (const_string "ls_group")
+	 (eq_attr "type" "cbranch,jump") (const_string "br_group")
+	 (eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+	   (const_string "fe_group")
+	 (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb") (const_string "co_group")]
+	(const_string "none")))
+;; nil are zero instructions, and arith3 / arith3b are multiple instructions,
+;; so these do not belong in an insn group, although they are modeled
+;; with their own define_insn_reservations.
 
 ;; Indicate what precision must be selected in fpscr for this insn, if any.
 
@@ -445,178 +454,6 @@
   (and (eq_attr "pipe_model" "sh1") (eq_attr "type" "fdiv")) 13 12)
 
 
-;; SH4 scheduling
-;; The SH4 is a dual-issue implementation, thus we have to multiply all
-;; costs by at least two.
-;; There will be single increments of the modeled that don't correspond
-;; to the actual target ;; whenever two insns to be issued depend one a
-;; single resource, and the scheduler picks to be the first one.
-;; If we multiplied the costs just by two, just two of these single
-;; increments would amount to an actual cycle.  By picking a larger
-;; factor, we can ameliorate the effect; However, we then have to make sure
-;; that only two insns are modeled as issued per actual cycle.
-;; Moreover, we need a way to specify the latency of insns that don't
-;; use an actual function unit.
-;; We use an 'issue' function unit to do that, and a cost factor of 10.
-
-(define_function_unit "issue" 2 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "!nil,arith3"))
-  10 10)
-
-(define_function_unit "issue" 2 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "arith3"))
-  30 30)
-
-;; There is no point in providing exact scheduling information about branches,
-;; because they are at the starts / ends of basic blocks anyways.
-
-;; Some insns cannot be issued before/after another insn in the same cycle,
-;; irrespective of the type of the other insn.
-
-;; default is dual-issue, but can't be paired with an insn that
-;; uses multiple function units.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "!smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul,call,sfunc,arith3,arith3b"))
-  1 10
-  [(eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul")])
-
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul"))
-  10 10
-  [(const_int 1)])
-
-;; arith3 insns are always pairable at the start, but not inecessarily at
-;; the end; however, there doesn't seem to be a way to express that.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3"))
-  30 20
-  [(const_int 1)])
-
-;; arith3b insn are pairable at the end and have latency that prevents pairing
-;; with the following branch, but we don't want this latency be respected;
-;; When the following branch is immediately adjacent, we can redirect the
-;; internal branch, which is likly to be a larger win.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3b"))
-  20 20
-  [(const_int 1)])
-
-;; calls introduce a longisch delay that is likely to flush the pipelines.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "call,sfunc"))
-  160 160
-  [(eq_attr "type" "!call") (eq_attr "type" "call")])
-
-;; Load and store instructions have no alignment peculiarities for the SH4,
-;; but they use the load-store unit, which they share with the fmove type
-;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) .
-;; Loads have a latency of two.
-;; However, call insns can only paired with a preceding insn, and have
-;; a delay slot, so that we want two more insns to be scheduled between the
-;; load of the function address and the call.  This is equivalent to a
-;; latency of three.
-;; We cannot use a conflict list for this, because we need to distinguish
-;; between the actual call address and the function arguments.
-;; ADJUST_COST can only properly handle reductions of the cost, so we
-;; use a latency of three here, which gets multiplied by 10 to yield 30.
-;; We only do this for SImode loads of general registers, to make the work
-;; for ADJUST_COST easier.
-
-;; When specifying different latencies for different insns using the
-;; the same function unit, genattrtab.c assumes a 'FIFO constraint'
-;; so that the blockage is at least READY-COST (E) + 1 - READY-COST (C)
-;; for an executing insn E and a candidate insn C.
-;; Therefore, we define three different function units for load_store:
-;; load_store, load and load_si.
-
-(define_function_unit "load_si" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load_si,pcload_si")) 30 10)
-(define_function_unit "load" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load,pcload,pload")) 20 10)
-(define_function_unit "load_store" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load_si,pcload_si,load,pcload,pload,store,pstore,fmove"))
-  10 10)
-
-(define_function_unit "int"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "arith,dyn_shift")) 10 10)
-
-;; Again, we have to pretend a lower latency for the "int" unit to avoid a
-;; spurious FIFO constraint; the multiply instructions use the "int"
-;; unit actually only for two cycles.
-(define_function_unit "int"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 20 20)
-
-;; We use a fictous "mpy" unit to express the actual latency.
-(define_function_unit "mpy"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 40 20)
-
-;; Again, we have to pretend a lower latency for the "int" unit to avoid a
-;; spurious FIFO constraint.
-(define_function_unit "int"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpul")) 10 10)
-
-;; We use a fictous "gp_fpul" unit to express the actual latency.
-(define_function_unit "gp_fpul"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpul")) 20 10)
-
-;; ??? multiply uses the floating point unit, but with a two cycle delay.
-;; Thus, a simple single-precision fp operation could finish if issued in
-;; the very next cycle, but stalls when issued two or three cycles later.
-;; Similarily, a divide / sqrt can work without stalls if issued in
-;; the very next cycle, while it would have to block if issued two or
-;; three cycles later.
-;; There is no way to model this with gcc's function units.  This problem is
-;; actually mentioned in md.texi.  Tackling this problem requires first that
-;; it is possible to speak about the target in an open discussion.
-;;
-;; However, simple double-precision operations always conflict.
-
-(define_function_unit "fp"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 40 40
-  [(eq_attr "type" "dfp_cmp,dfp_conv,dfp_arith")])
-
-;; The "fp" unit is for pipeline stages F1 and F2.
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fp")) 30 10)
-
-;; Again, we have to pretend a lower latency for the "fp" unit to avoid a
-;; spurious FIFO constraint; the bulk of the fdiv type insns executes in
-;; the F3 stage.
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fdiv")) 30 10)
-
-;; The "fdiv" function unit models the aggregate effect of the F1, F2 and F3
-;; pipeline stages on the pipelining of fdiv/fsqrt insns.
-;; We also use it to give the actual latency here.
-;; fsqrt is actually one cycle faster than fdiv (and the value used here),
-;; but that will hardly matter in practice for scheduling.
-(define_function_unit "fdiv"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fdiv")) 120 100)
-
-;; There is again a late use of the "fp" unit by [d]fdiv type insns
-;; that we can't express.
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfp_cmp,dfp_conv")) 40 20)
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfp_arith")) 80 60)
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfdiv")) 230 10)
-
-(define_function_unit "fdiv"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfdiv")) 230 210)
-
 ;; SH-5 SHmedia scheduling
 ;; When executing SHmedia code, the SH-5 is a fairly straightforward
 ;; single-issue machine.  It has four pipelines, the branch unit (br),
@@ -706,6 +543,35 @@
 (define_attr "is_mac_media" ""
   (if_then_else (eq_attr "type" "mac_media") (const_int 1) (const_int 0)))
 
+(define_attr "branch_zero" "yes,no"
+  (cond [(eq_attr "type" "!cbranch") (const_string "no")
+	 (ne (symbol_ref "(next_active_insn (insn)\
+			   == (prev_active_insn\
+			       (XEXP (SET_SRC (PATTERN (insn)), 1))))\
+			  && get_attr_length (next_active_insn (insn)) == 2")
+	     (const_int 0))
+	 (const_string "yes")]
+	(const_string "no")))
+
+;; SH4 Double-precision computation with double-precision result -
+;; the two halves are ready at different times.
+(define_attr "dfp_comp" "yes,no"
+  (cond [(eq_attr "type" "dfp_arith,dfp_conv,dfdiv") (const_string "yes")]
+	(const_string "no")))
+
+;; Insns for which the latency of a preceding fp insn is decreased by one.
+(define_attr "late_fp_use" "yes,no" (const_string "no"))
+;; And feeding insns for which this relevant.
+(define_attr "any_fp_comp" "yes,no"
+  (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+	 (const_string "yes")]
+	(const_string "no")))
+
+(define_attr "any_int_load" "yes,no"
+  (cond [(eq_attr "type" "load,load_si,pcload,pcload_si")
+	 (const_string "yes")]
+	(const_string "no")))
+
 (define_delay
   (eq_attr "needs_delay_slot" "yes")
   [(eq_attr "in_delay_slot" "yes") (nil) (nil)])
@@ -755,7 +621,7 @@
 	       (const_int 0)))]
   "TARGET_SH1"
   "tst	%1,%0"
-  [(set_attr "insn_class" "mt_group")])
+  [(set_attr "type" "mt_group")])
 
 ;; ??? Perhaps should only accept reg/constant if the register is reg 0.
 ;; That would still allow reload to create cmpi instructions, but would
@@ -772,7 +638,7 @@
 	tst	%0,%0
 	cmp/eq	%1,%0
 	cmp/eq	%1,%0"
-   [(set_attr "insn_class" "mt_group,mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
 
 (define_insn "cmpgtsi_t"
   [(set (reg:SI T_REG)
@@ -782,7 +648,7 @@
   "@
 	cmp/gt	%1,%0
 	cmp/pl	%0"
-   [(set_attr "insn_class" "mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
 
 (define_insn "cmpgesi_t"
   [(set (reg:SI T_REG)
@@ -792,7 +658,7 @@
   "@
 	cmp/ge	%1,%0
 	cmp/pz	%0"
-   [(set_attr "insn_class" "mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
 
 ;; -------------------------------------------------------------------------
 ;; SImode unsigned integer comparisons
@@ -804,7 +670,7 @@
 		(match_operand:SI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "cmp/hs	%1,%0"
-   [(set_attr "insn_class" "mt_group")])
+   [(set_attr "type" "mt_group")])
 
 (define_insn "cmpgtusi_t"
   [(set (reg:SI T_REG)
@@ -812,7 +678,7 @@
 		(match_operand:SI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "cmp/hi	%1,%0"
-   [(set_attr "insn_class" "mt_group")])
+   [(set_attr "type" "mt_group")])
 
 ;; We save the compare operands in the cmpxx patterns and use them when
 ;; we generate the branch.
@@ -909,7 +775,7 @@
 	cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/ge\\t%S1,%S0\;cmp/hs\\t%R1,%R0\\n%,Ldi%=:
 	cmp/pz\\t%S0"
   [(set_attr "length" "8,2")
-   (set_attr "type" "arith3,arith")])
+   (set_attr "type" "arith3,mt_group")])
 
 ;; -------------------------------------------------------------------------
 ;; DImode unsigned integer comparisons
@@ -1176,8 +1042,7 @@
 	(ltu:SI (plus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))]
   "TARGET_SH1"
   "addc	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "addc1"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1187,8 +1052,7 @@
    (clobber (reg:SI T_REG))]
   "TARGET_SH1"
   "addc	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_expand "addsi3"
   [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -1217,8 +1081,7 @@
 		 (match_operand:SI 2 "arith_operand" "rI")))]
   "TARGET_SH1"
   "add	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 ;; -------------------------------------------------------------------------
 ;; Subtraction instructions
@@ -1287,8 +1150,7 @@
 	(gtu:SI (minus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))]
   "TARGET_SH1"
   "subc	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "subc1"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1298,8 +1160,7 @@
    (clobber (reg:SI T_REG))]
   "TARGET_SH1"
   "subc	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "*subsi3_internal"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1307,8 +1168,7 @@
 		  (match_operand:SI 2 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "sub	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "*subsi3_media"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1734,6 +1594,14 @@
      invariant code motion can move it.  */
   REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
   REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+  /* expand_binop can't find a suitable code in umul_widen_optab to
+     make a REG_EQUAL note from, so make one here.
+     See also smulsi3_highpart.
+     ??? Alternatively, we could put this at the calling site of expand_binop,
+     i.e. expand_expr.  */
+  REG_NOTES (last)
+    = gen_rtx_EXPR_LIST (REG_EQUAL, copy_rtx (SET_SRC (single_set (first))),
+			 REG_NOTES (last));
   DONE;
 }")
 
@@ -1756,6 +1624,14 @@
      invariant code motion can move it.  */
   REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
   REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+  /* expand_binop can't find a suitable code in umul_widen_optab to
+     make a REG_EQUAL note from, so make one here.
+     See also smulsi3_highpart.
+     ??? Alternatively, we could put this at the calling site of expand_binop,
+     i.e. expand_expr.  */
+  REG_NOTES (last)
+    = gen_rtx_EXPR_LIST (REG_EQUAL, copy_rtx (SET_SRC (single_set (first))),
+			 REG_NOTES (last));
   DONE;
 }")
 
@@ -2019,6 +1895,7 @@
   REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
   /* expand_binop can't find a suitable code in mul_highpart_optab to
      make a REG_EQUAL note from, so make one here.
+     See also {,u}mulhisi.
      ??? Alternatively, we could put this at the calling site of expand_binop,
      i.e. expand_mult_highpart.  */
   REG_NOTES (last)
@@ -2076,8 +1953,7 @@
 		(match_operand:SI 2 "logical_operand" "r,L")))]
   "TARGET_SH1"
   "and	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 ;; If the constant is 255, then emit a extu.b instruction instead of an
 ;; and, since that will give better code.
@@ -2133,8 +2009,7 @@
 		(match_operand:SI 2 "logical_operand" "r,L")))]
   "TARGET_SH1"
   "or	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "iordi3"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2152,8 +2027,7 @@
 		(match_operand:SI 2 "logical_operand" "L,r")))]
   "TARGET_SH1"
   "xor	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "xordi3"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2220,8 +2094,7 @@
 	(lshiftrt:SI (match_dup 1) (const_int 31)))]
   "TARGET_SH1"
   "rotl	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "rotlsi3_31"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2230,8 +2103,7 @@
    (clobber (reg:SI T_REG))]
   "TARGET_SH1"
   "rotr	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "rotlsi3_16"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2239,8 +2111,7 @@
 		   (const_int 16)))]
   "TARGET_SH1"
   "swap.w	%1,%0"
-  [(set_attr "type" "arith")
-  (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_expand "rotlsi3"
   [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -2304,8 +2175,7 @@
 		   (const_int 8)))]
   "TARGET_SH1"
   "swap.b	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_expand "rotlhi3"
   [(set (match_operand:HI 0 "arith_reg_operand" "")
@@ -2347,8 +2217,7 @@
      (clobber (match_dup 4))])]
   "operands[4] = gen_rtx_SCRATCH (SImode);"
   [(set_attr "length" "*,*,*,4")
-   (set_attr "type" "dyn_shift,arith,arith,arith")
-   (set_attr "insn_class" "ex_group,ex_group,ex_group,ex_group")])
+   (set_attr "type" "dyn_shift,arith,arith,arith")])
 
 (define_insn "ashlhi3_k"
   [(set (match_operand:HI 0 "arith_reg_operand" "=r,r")
@@ -2358,8 +2227,7 @@
   "@
 	add	%0,%0
 	shll%O2	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "ashlsi3_n"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2376,8 +2244,7 @@
 	       (eq (symbol_ref "shift_insns_rtx (insn)") (const_int 3))
 	       (const_string "6")]
 	      (const_string "8")))
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
 
 (define_split
   [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -2466,8 +2333,7 @@
    (clobber (reg:SI T_REG))]
   "TARGET_SH1 && INTVAL (operands[2]) == 1"
   "shar	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 ;; We can't do HImode right shifts correctly unless we start out with an
 ;; explicit zero / sign extension; doing that would result in worse overall
@@ -2526,8 +2392,7 @@
 	(lt:SI (match_dup 1) (const_int 0)))]
   "TARGET_SH1"
   "shll	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "ashrsi3_d"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2535,8 +2400,7 @@
 		     (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
   "TARGET_SH3"
   "shad	%2,%0"
-  [(set_attr "type" "dyn_shift")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "dyn_shift")])
 
 (define_insn "ashrsi3_n"
   [(set (reg:SI R4_REG)
@@ -2587,8 +2451,7 @@
 		     (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
   "TARGET_SH3"
   "shld	%2,%0"
-  [(set_attr "type" "dyn_shift")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "dyn_shift")])
 
 ;;  Only the single bit shift clobbers the T bit.
 
@@ -2599,8 +2462,7 @@
    (clobber (reg:SI T_REG))]
   "TARGET_SH1 && CONST_OK_FOR_M (INTVAL (operands[2]))"
   "shlr	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "lshrsi3_k"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2609,8 +2471,7 @@
   "TARGET_SH1 && CONST_OK_FOR_K (INTVAL (operands[2]))
    && ! CONST_OK_FOR_M (INTVAL (operands[2]))"
   "shlr%O2	%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "lshrsi3_n"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2689,8 +2550,7 @@
   "TARGET_SH1"
   "shll	%R0\;rotcl	%S0"
   [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
 
 (define_insn "ashldi3_media"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2730,8 +2590,7 @@
   "TARGET_SH1"
   "shlr	%S0\;rotcr	%R0"
   [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
 
 (define_insn "lshrdi3_media"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2771,8 +2630,7 @@
   "TARGET_SH1"
   "shar	%S0\;rotcr	%R0"
   [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
 
 (define_insn "ashrdi3_media"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -3007,8 +2865,7 @@
 			     (const_int 16))))]
   "TARGET_SH1"
   "xtrct	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "xtrct_right"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -3018,8 +2875,7 @@
 			   (const_int 16))))]
   "TARGET_SH1"
   "xtrct	%2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 ;; -------------------------------------------------------------------------
 ;; Unary arithmetic
@@ -3034,8 +2890,7 @@
 	       (const_int 0)))]
   "TARGET_SH1"
   "negc	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "*negdi_media"
   [(set (match_operand:DI 0 "arith_reg_operand" "=r")
@@ -3073,16 +2928,14 @@
 	(neg:SI (match_operand:SI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "neg	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "one_cmplsi2"
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
 	(not:SI (match_operand:SI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "not	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_expand "one_cmpldi2"
   [(set (match_operand:DI 0 "arith_reg_operand" "")
@@ -3157,8 +3010,7 @@
 	(zero_extend:SI (match_operand:HI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "extu.w	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "*zero_extendhisi2_media"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3196,8 +3048,7 @@
 	(zero_extend:SI (match_operand:QI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "extu.b	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 (define_insn "*zero_extendqisi2_media"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3213,8 +3064,7 @@
 	(zero_extend:HI (match_operand:QI 1 "arith_reg_operand" "r")))]
   "TARGET_SH1"
   "extu.b	%1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
 
 ;; -------------------------------------------------------------------------
 ;; Sign extension instructions
@@ -3288,8 +3138,7 @@
   "@
 	exts.w	%1,%0
    	mov.w	%1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
 
 (define_insn "*extendhisi2_media"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3325,8 +3174,7 @@
   "@
 	exts.b	%1,%0
 	mov.b	%1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
 
 (define_insn "*extendqisi2_media"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3356,8 +3204,7 @@
   "@
 	exts.b	%1,%0
 	mov.b	%1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
 
 /* It would seem useful to combine the truncXi patterns into the movXi
    patterns, but unary operators are ignored when matching constraints,
@@ -3431,6 +3278,7 @@
   "TARGET_SH3E && ! TARGET_SH5"
   "sts.l	fpul,@-r15"
   [(set_attr "type" "store")
+   (set_attr "late_fp_use" "yes")
    (set_attr "hit_stack" "yes")])
 
 ;; DFmode pushes for sh4 require a lot of what is defined for movdf_i4,
@@ -3506,8 +3354,7 @@
 	lds.l	%1,%0
 	lds.l	%1,%0
 	fake	%1,%0"
-  [(set_attr "type" "pcload_si,move,*,load_si,move,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
-   (set_attr "insn_class"  "*,*,mt_group,*,*,*,*,*,*,*,*,*,*,*,*")
+  [(set_attr "type" "pcload_si,move,mt_group,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
    (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
 
 ;; t/r must come after r/r, lest reload will try to reload stuff like
@@ -3541,7 +3388,8 @@
 	lds	%1,%0
 	sts	%1,%0
 	! move optimized away"
-  [(set_attr "type" "pcload_si,move,*,load_si,move,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,gp_fpul,nil")
+  [(set_attr "type" "pcload_si,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*")
    (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
 
 (define_insn "movsi_i_lowpart"
@@ -3666,7 +3514,7 @@
   "TARGET_HARD_SH4"
   "ocbwb\\t@%0\;extu.w\\t%0,%2\;or\\t%1,%2\;mov.l\\t%0,@%2"
   [(set_attr "length" "8")
-   (set_attr "insn_class" "cwb")])
+   (set_attr "type" "cwb")])
 
 ;; ??? could make arg 0 an offsettable memory operand to allow to save
 ;; an add in the code that calculates the address.
@@ -4313,7 +4161,8 @@
       (if_then_else
        (ne (symbol_ref "TARGET_SHCOMPACT") (const_int 0))
        (const_int 10) (const_int 8))])
-   (set_attr "type" "fmove,move,pcload,load,store,pcload,load,store,load,load")
+   (set_attr "type" "fmove,move,pcfload,fload,store,pcload,load,store,load,fload")
+   (set_attr "late_fp_use" "*,*,*,*,yes,*,*,*,*,*")
    (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
 					   (const_string "double")
 					   (const_string "none")))])
@@ -5015,7 +4864,8 @@
 	sts.l	%1,%0
 	lds.l	%1,%0
 	! move optimized away"
-  [(set_attr "type" "fmove,move,fmove,fmove,pcload,load,store,pcload,load,store,fmove,fmove,load,*,gp_fpul,gp_fpul,store,load,nil")
+  [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,store,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,store,load,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,yes,*,*,*,*,*,*,*,yes,*,yes,*,*")
    (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,4,2,2,2,2,0")
    (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
 					   (const_string "single")
@@ -7996,8 +7846,8 @@
 ;; GO_IF_LEGITIMATE_ADDRESS guards about bogus addresses before reload,
 ;; SECONDARY_INPUT_RELOAD_CLASS does this during reload, and the insn's
 ;; predicate after reload.
-;; The gp_fpul type for r/!c might look a bit odd, but it actually schedules
-;; like a gpr <-> fpul move.
+;; The mac_gp type for r/!c might look a bit odd, but it actually schedules
+;; like a mac -> gpr move.
 (define_insn "fpu_switch"
   [(set (match_operand:PSI 0 "register_operand" "=c,c,r,c,c,r,m,r")
 	(match_operand:PSI 1 "general_movsrc_operand" "c,>,m,m,r,r,r,!c"))]
@@ -8016,8 +7866,7 @@
 	mov.l	%1,%0
 	sts	fpscr,%0"
   [(set_attr "length" "0,2,2,4,2,2,2,2")
-   (set_attr "type" "dfp_conv,dfp_conv,load,dfp_conv,dfp_conv,move,store,gp_fpul")
-   (set_attr "insn_class" "ldsmem_to_fpscr,*,*,lds_to_fpscr,*,*,*,*")])
+   (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp")])
 
 (define_split
   [(set (reg:PSI FPSCR_REG)
@@ -8363,7 +8212,7 @@
    (use (match_operand:PSI 2 "fpscr_operand" "c"))]
   "TARGET_SH4"
   "ftrc	%1,%0"
-  [(set_attr "type" "fp")
+  [(set_attr "type" "ftrc_s")
    (set_attr "fp_mode" "single")])
 
 ;; ??? This pattern is used nowhere.  fix_truncsfsi2 always expands to
@@ -8785,6 +8634,7 @@
   "TARGET_SH4"
   "ftrc	%1,%0"
   [(set_attr "type" "dfp_conv")
+   (set_attr "dfp_comp" "no")
    (set_attr "fp_mode" "double")])
 
 ;; ??? This pattern is used nowhere.  fix_truncdfsi2 always expands to
@@ -9877,6 +9727,7 @@
 {
   emit_insn ((TARGET_LITTLE_ENDIAN ? gen_mperm_w_little : gen_mperm_w_big)
 	     (operands[0], operands[1], operands[2]));
+  DONE;
 }")
 
 ; This use of vec_select isn't exactly correct according to rtl.texi
@@ -10591,18 +10442,22 @@
 
 (define_cpu_unit "f1_1,f1_2" "fpu_pipe")
 
-;; The floating point units.
+;; The floating point units (except FS - F2 always precedes it.)
 
-(define_cpu_unit "F1,F2,F3,FS" "fpu_pipe")
+(define_cpu_unit "F0,F1,F2,F3" "fpu_pipe")
 
 ;; This is basically the MA unit of SH4
 ;; used in LOAD/STORE pipeline.
 
 (define_cpu_unit "memory" "inst_pipeline")
 
+;; However, there are LS group insns that don't use it, even ones that
+;; complete in 0 cycles.  So we use an extra unit for the issue of LS insns.
+(define_cpu_unit "load_store" "inst_pipeline")
+
 ;; The address calculator used for branch instructions.
-;; This will be reserved with "issue" of branch instructions
-;; and this is to make sure that  no two branch instructions 
+;; This will be reserved after "issue" of branch instructions
+;; and this is to make sure that no two branch instructions 
 ;; can be issued in parallel. 
 
 (define_cpu_unit "pcr_addrcalc" "inst_pipeline")
@@ -10613,26 +10468,57 @@
 (define_reservation  "issue"  "pipe_01|pipe_02")
 
 ;; This is to express the locking of D stage.
+;; Note that the issue of a CO group insn also effectively locks the D stage.
 
 (define_reservation  "d_lock" "pipe_01+pipe_02")
 
+;; Every FE instruction but fipr / ftrv starts with issue and this.
+(define_reservation "F01" "F0+F1")
+
 ;; This is to simplify description where F1,F2,FS
 ;; are used simultaneously.
 
-(define_reservation "fpu" "F1+F2+FS")
+(define_reservation "fpu" "F1+F2")
 
 ;; This is to highlight the fact that f1 
 ;; cannot overlap with F1.
 
 (exclusion_set  "f1_1,f1_2" "F1")
 
+(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")
+
 ;; Although reg moves have a latency of zero 
 ;; we need to highlight that they use D stage
 ;; for one cycle.
 
+;; Group:	MT
+
 (define_insn_reservation "reg_mov" 0
-               (eq_attr "type" "move,fmove")
-              "issue")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "move"))
+  "issue")
+
+;; Group:	LS
+
+(define_insn_reservation "freg_mov" 0
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fmove"))
+  "issue+load_store")
+
+;; We don't model all pipeline stages; we model the issue ('D') stage
+;; inasmuch as we allow only two instructions to issue simultanously,
+;; and CO instructions prevent any simultanous issue of another instruction.
+;; (This uses pipe_01 and pipe_02).
+;; Double issue of EX insns is prevented by using the int unit in the EX stage.
+;; Double issue of EX / BR insns is prevented by using the int unit /
+;; pcr_addrcalc unit in the EX stage.
+;; Double issue of BR / LS instructions is prevented by using the
+;; pcr_addrcalc / load_store unit in the issue cycle.
+;; Double issue of FE instructions is prevented by using F0 in the first
+;; pipeline stage after the first D stage.
+;; There is no need to describe the [ES]X / [MN]A / S stages after a D stage
+;; (except in the cases outlined above), nor to describe the FS stage after
+;; the F2 stage.
 
 ;; Other MT  group intructions(1 step operations)
 ;; Group:	MT
@@ -10640,88 +10526,170 @@
 ;; Issue Rate: 	1
 
 (define_insn_reservation "mt" 1
-                      (eq_attr "insn_class" "mt_group")
-                      "issue,nothing")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "mt_group"))
+  "issue")
 
 ;; Fixed Point Arithmetic Instructions(1 step operations)
 ;; Group:	EX
 ;; Latency: 	1
 ;; Issue Rate: 	1
 
-(define_insn_reservation "simple_arith" 1 
-            (eq_attr "insn_class" "ex_group")
-            "issue,int")
+(define_insn_reservation "sh4_simple_arith" 1 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "insn_class" "ex_group"))
+  "issue,int")
+
+;; Load and store instructions have no alignment peculiarities for the SH4,
+;; but they use the load-store unit, which they share with the fmove type
+;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) .
+;; Loads have a latency of two.
+;; However, call insns can only paired with a preceding insn, and have
+;; a delay slot, so that we want two more insns to be scheduled between the
+;; load of the function address and the call.  This is equivalent to a
+;; latency of three.
+;; ADJUST_COST can only properly handle reductions of the cost, so we
+;; use a latency of three here, which gets multiplied by 10 to yield 30.
+;; We only do this for SImode loads of general registers, to make the work
+;; for ADJUST_COST easier.
 
 ;; Load Store instructions. (MOV.[BWL]@(d,GBR)
 ;; Group:	LS
 ;; Latency: 	2
 ;; Issue Rate: 	1
 
-(define_insn_reservation "load_store" 2
-       (eq_attr "type" "load,load_si,pcload,pcload_si,store")
-       "issue,memory*2")
+(define_insn_reservation "sh4_load" 2
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "load,pcload"))
+  "issue+load_store,nothing,memory")
+
+;; calls / sfuncs need an extra instruction for their delay slot.
+;; Moreover, estimating the latency for SImode loads as 3 will also allow
+;; adjust_cost to meaningfully bump it back up to 3 if they load the shift
+;; count of a dynamic shift.
+(define_insn_reservation "sh4_load_si" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "load_si,pcload_si"))
+  "issue+load_store,nothing,memory")
+
+;; (define_bypass 2 "sh4_load_si" "!sh4_call")
+
+;; The load latency is upped to three higher if the dependent insn does
+;; double precision computation.  We want the 'default' latency to reflect
+;; that increased latency because otherwise the insn priorities won't
+;; allow proper scheduling.
+(define_insn_reservation "sh4_fload" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fload,pcfload"))
+  "issue+load_store,nothing,memory")
+
+;; (define_bypass 2 "sh4_fload" "!")
+
+(define_insn_reservation "sh4_store" 1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "store"))
+  "issue+load_store,nothing,memory")
+
+;; Load Store instructions.
+;; Group:	LS
+;; Latency: 	1
+;; Issue Rate: 	1
+
+(define_insn_reservation "sh4_gp_fpul" 1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "gp_fpul"))
+  "issue+load_store")
+
+;; Load Store instructions.
+;; Group:	LS
+;; Latency: 	3
+;; Issue Rate: 	1
+
+(define_insn_reservation "sh4_fpul_gp" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fpul_gp"))
+  "issue+load_store")
 
 ;; Branch (BF,BF/S,BT,BT/S,BRA)
 ;; Group:	BR
-;; Latency: 	2 (or 1) Actually Observed to be 5/7
+;; Latency when taken: 	2 (or 1)
 ;; Issue Rate: 	1
 ;; The latency is 1 when displacement is 0.
-;; This reservation can be further broken into 2
-;;    1. branch_zero : One with latency 1 and in the TEST 
-;;       part it also checks for 0 (ZERO) displacement 
-;;    2. branch: Latency 2.
-
-(define_insn_reservation "branch_zero"  5
-             (and (eq_attr "type" "cbranch")
-		  (eq_attr "length" "2"))
-             "(issue+pcr_addrcalc),pcr_addrcalc,nothing")
+;; We can't really do much with the latency, even if we could express it,
+;; but the pairing restrictions are useful to take into account.
+;; ??? If the branch is likely, we might want to fill the delay slot;
+;; if the branch is likely, but not very likely, should we pretend to use
+;; a resource that CO instructions use, to get a pairable delay slot insn?
 
-(define_insn_reservation "branch"  7
-             (eq_attr "type" "cbranch")
-             "(issue+pcr_addrcalc),pcr_addrcalc,nothing")
+(define_insn_reservation "sh4_branch"  1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "cbranch,jump"))
+  "issue+pcr_addrcalc")
 
 ;; Branch Far (JMP,RTS,BRAF)
 ;; Group:	CO
 ;; Latency: 	3
 ;; Issue Rate: 	2
-;;    Since issue stage (D stage) is blocked for 2nd cycle, 
-;;    cpu_unit  int  is reserved since it might be required for far
-;;    address calculation.
+;; ??? Scheduling happens before branch shortening, and hence jmp and braf
+;; can't be distinguished from bra for the "jump" pattern.
 
-(define_insn_reservation "branch_far" 12
-         (and (eq_attr "type" "jump,return")
-	      (eq_attr "length" "6"))
-         "d_lock*2,int+pcr_addrcalc,pcr_addrcalc")
+(define_insn_reservation "sh4_return" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "return,jump_ind"))
+         "d_lock*2")
 
 ;; RTE
 ;; Group:	CO
-;; atency: 	5
+;; Latency: 	5
 ;; Issue Rate: 	5
 ;; this instruction can be executed in any of the pipelines 
 ;; and blocks the pipeline for next 4 stages.
 
-(define_insn_reservation "return_from_exp" 5
-          (eq_attr "type" "rte")
-         "(issue+pcr_addrcalc),d_lock*4,int+pcr_addrcalc,nothing")
+(define_insn_reservation "sh4_return_from_exp" 5
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "rte"))
+  "d_lock*5")
 
 ;; OCBP, OCBWB
 ;; Group:	CO
-;; Latency: 	5
+;; Latency: 	1-5
 ;; Issue Rate: 	1
 
-(define_insn_reservation "ocbwb"  5
-          (eq_attr "insn_class" "cwb") 
- 	   "issue,(int+memory),memory*5")
+;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
+;; ocbwb on its own would be "d_lock,nothing,memory*5"
+(define_insn_reservation "ocbwb"  6
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "cwb"))
+  "d_lock*2,(d_lock+memory)*3,issue+load_store+memory,memory*2")
 		
 ;; LDS to PR,JSR
 ;; Group:	CO
 ;; Latency: 	3
 ;; Issue Rate: 	2
 ;; The SX stage is blocked for last 2 cycles.
+;; OTOH, the only time that has an effect for insns generated by the compiler
+;; is when lds to PR is followed by sts from PR - and that is highly unlikely -
+;; or when we are doing a function call - and we don't do inter-function
+;; scheduling.  For the function call case, it's really best that we end with
+;; something that models an rts.
 
-(define_insn_reservation "lds_to_pr" 3 
-          (eq_attr "type" "prset,call,sfunc") 
-          "(issue+pcr_addrcalc),(issue+int+pcr_addrcalc),(int+pcr_addrcalc)*2")
+(define_insn_reservation "sh4_lds_to_pr" 3 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "prset") )
+  "d_lock*2")
+
+;; calls introduce a longisch delay that is likely to flush the pipelines
+;; of the caller's instructions.  Ordinary functions tend to end with a
+;; load to restore a register (in the delay slot of rts), while sfuncs
+;; tend to end with an EX or MT insn.  But that is not actually relevant,
+;; since there are no instructions that contend for memory access early.
+;; We could, of course, provide exact scheduling information for specific
+;; sfuncs, if that should prove useful.
+
+(define_insn_reservation "sh4_call" 16 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "call,sfunc"))
+  "d_lock*16")
 
 ;; LDS.L to PR 
 ;; Group:	CO
@@ -10730,8 +10698,9 @@
 ;; The SX unit is blocked for last 2 cycles.
  
 (define_insn_reservation "ldsmem_to_pr"  3
-      (eq_attr "type" "pload") 
-     "(issue+pcr_addrcalc),(issue+int+pcr_addrcalc),(int+memory+pcr_addrcalc),(int+pcr_addrcalc)")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "pload"))
+  "d_lock*2")
 
 ;; STS from PR
 ;; Group:	CO
@@ -10740,17 +10709,19 @@
 ;; The SX unit in second and third cycles.
 
 (define_insn_reservation "sts_from_pr" 2
-        (eq_attr "type" "prget")
-       "(issue+pcr_addrcalc),(pipe_01+int+pcr_addrcalc),(int+pcr_addrcalc),nothing")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "prget"))
+  "d_lock*2")
 
 ;; STS.L from PR
 ;; Group:	CO
 ;; Latency: 	2
 ;; Issue Rate: 	2
 
-(define_insn_reservation "prload_mem" 2 
-          (eq_attr "type" "pstore")
-           "(issue+pcr_addrcalc),(pipe_01+int+pcr_addrcalc),(int+memory+pcr_addrcalc),memory")
+(define_insn_reservation "sh4_prstore_mem" 2 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "pstore"))
+  "d_lock*2,nothing,memory")
 
 ;; LDS to FPSCR
 ;; Group:	CO
@@ -10758,9 +10729,10 @@
 ;; Issue Rate: 	1
 ;; F1 is blocked for last three cycles. 
 
-(define_insn_reservation "fpscr_store" 4
-        (eq_attr "insn_class" "lds_to_fpscr")
-       "issue,int,F1*3")
+(define_insn_reservation "fpscr_load" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "gp_fpscr"))
+  "d_lock,nothing,F1*3")
 
 ;; LDS.L to FPSCR
 ;; Group:	CO
@@ -10769,9 +10741,10 @@
 ;; Issue Rate: 	1
 ;; F1 is blocked for last three cycles.
 
-(define_insn_reservation "fpscr_store_mem" 4
-        (eq_attr "insn_class"  "ldsmem_to_fpscr") 
-        "issue,(int+memory),(F1+memory),F1*2")
+(define_insn_reservation "fpscr_load_mem" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type"  "mem_fpscr"))
+  "d_lock,nothing,(F1+memory),F1*2")
 
 
 ;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W)
@@ -10780,28 +10753,49 @@
 ;; Issue Rate: 	1
 
 (define_insn_reservation "multi" 4
-	(eq_attr "type" "smpy,dmpy")
- 	"issue,(issue+int+f1_1),(int+f1_1),(f1_1|f1_2)*2,F2,FS")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "smpy,dmpy"))
+  "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2")
+
+;; Fixed STS from MACL / MACH
+;; Group:	CO
+;; Latency: 	3
+;; Issue Rate: 	1
+
+(define_insn_reservation "sh4_mac_gp" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "mac_gp"))
+  "d_lock")
 
 
 ;; Single precision floating point computation FCMP/EQ,
-;; FCP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
+;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
 ;; Group:	FE
-;; Latency: 	4
+;; Latency: 	3/4
 ;; Issue Rate: 	1
 
-(define_insn_reservation "fp_arith"  4
-              (eq_attr "type" "fp")
- 	      "issue,F1,F2,FS")
+(define_insn_reservation "fp_arith"  3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fp"))
+  "issue,F01,F2")
+
+(define_insn_reservation "fp_arith_ftrc"  3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "ftrc_s"))
+  "issue,F01,F2")
+
+(define_bypass 1 "fp_arith_ftrc" "sh4_fpul_gp")
 
 ;; Single Precision FDIV/SQRT
 ;; Group:	FE
-;; Latency: 	12/13
+;; Latency: 	12/13 (FDIV); 11/12 (FSQRT)
 ;; Issue Rate: 	1
+;; We describe fdiv here; fsqrt is actually one cycle faster.
 
-(define_insn_reservation "fp_div" 13
-               (eq_attr "type" "fdiv")
-               "issue,F1+F3,F1+F2+F3,F3*7,F1+F3,F2,FS")
+(define_insn_reservation "fp_div" 12
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fdiv"))
+  "issue,F01+F3,F2+F3,F3*7,F1+F3,F2")
 
 ;; Double Precision floating point computation
 ;; (FCNVDS, FCNVSD, FLOAT, FTRC)
@@ -10809,34 +10803,51 @@
 ;; Latency: 	(3,4)/5
 ;; Issue Rate: 	1
 
-(define_insn_reservation "dp_float" 5
-         (eq_attr "type" "dfp_conv")
-	 "issue,F1,F1+F2,F2+FS,FS")
+(define_insn_reservation "dp_float" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_conv"))
+  "issue,F01,F1+F2,F2")
 
-;; Double-precision floating-point (FADD ,FMUL,FSUB) 
+;; Double-precision floating-point (FADD,FMUL,FSUB) 
 ;; Group:	FE
 ;; Latency: 	(7,8)/9
 ;; Issue Rate: 	1
 
-(define_insn_reservation "fp_double_arith" 9
-        (eq_attr "type" "dfp_arith")
-	"issue,F1,F1+F2,fpu*4,F2+FS,FS")
+(define_insn_reservation "fp_double_arith" 8
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_arith"))
+  "issue,F01,F1+F2,fpu*4,F2")
 
 ;; Double-precision FCMP (FCMP/EQ,FCMP/GT) 
-;; Group:	FE
+;; Group:	CO
 ;; Latency: 	3/5
 ;; Issue Rate: 	2
 
-(define_insn_reservation "fp_double_cmp" 5 
-        (eq_attr "type" "dfp_cmp")
-	"issue,(issue+F1),F1+F2,F2+FS,FS")
+(define_insn_reservation "fp_double_cmp" 3 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_cmp"))
+  "d_lock,(d_lock+F01),F1+F2,F2")
 
 ;; Double precision FDIV/SQRT
 ;; Group:	FE
 ;; Latency: 	(24,25)/26
 ;; Issue Rate: 	1
 
-(define_insn_reservation "dp_div" 26
-        (eq_attr "type" "dfdiv")
- 	"issue,F1+F3,F1+F2+F3,F2+F3+FS,F3*16,F1+F3,F1+F2+F3,fpu+F3,F2+FS,FS")
+(define_insn_reservation "dp_div" 25
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfdiv"))
+  "issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2")
+
 
+;; Use the branch-not-taken case to model arith3 insns.  For the branch taken
+;; case, we'd get a d_lock instead of issue at the end.
+(define_insn_reservation "arith3" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "arith3"))
+  "issue,d_lock+pcr_addrcalc,issue")
+
+;; arith3b insns schedule the same no matter if the branch is taken or not.
+(define_insn_reservation "arith3b" 2
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "arith3"))
+  "issue,d_lock+pcr_addrcalc")