1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
;;
;; Pipeline description for the VR4130 family.
;;
;; The processor issues each 8-byte aligned pair of instructions together,
;; stalling the second instruction if it depends on the first. Thus, if we
;; want two instructions to issue in parallel, we need to make sure that the
;; first one is 8-byte aligned.
;;
;; For the purposes of this pipeline description, we treat the processor
;; like a standard two-way superscalar architecture. If scheduling were
;; the last pass to run, we could use the scheduler hooks to vary the
;; issue rate depending on whether an instruction is at an aligned or
;; unaligned address. Unfortunately, delayed branch scheduling and
;; hazard avoidance are done after the final scheduling pass, and they
;; can change the addresses of many instructions.
;;
;; We get around this in two ways:
;;
;;   (1) By running an extra pass at the end of compilation. This pass goes
;;       through the function looking for pairs of instructions that could
;;       execute in parallel. It makes sure that the first instruction in
;;       each pair is suitably aligned, inserting nops if necessary. Doing
;;       this gives the same kind of pipeline behavior we would see on a
;;       normal superscalar target.
;;
;;       This pass is generally a speed improvement, but the extra nops will
;;       obviously make the program bigger. It is therefore unsuitable for
;;       -Os (at the very least).
;;
;;   (2) By modifying the scheduler hooks so that, where possible:
;;
;;       (a) dependent instructions are separated by a non-dependent
;;           instruction;
;;
;;       (b) instructions that use the multiplication unit are separated
;;           by non-multiplication instructions; and
;;
;;       (c) memory access instructions are separated by non-memory
;;           instructions.
;;
;;       The idea is to keep conflicting instructions apart wherever possible
;;       and thus make the schedule less dependent on alignment.
;; Three separate automata are declared: one for the main issue resources,
;; one for the multiply/divide unit, and one for the fake pre-reload
;; multiplication unit.
(define_automaton "vr4130_main, vr4130_muldiv, vr4130_mulpre")
;; Units tracked by the main automaton; the names denote two ALUs and the
;; data cache.
(define_cpu_unit "vr4130_alu1, vr4130_alu2, vr4130_dcache" "vr4130_main")
;; The multiply/divide unit is tracked by its own automaton.
(define_cpu_unit "vr4130_muldiv" "vr4130_muldiv")
;; This is a fake unit for pre-reload scheduling of multiplications.
;; It enforces the true post-reload repeat rate.
(define_cpu_unit "vr4130_mulpre" "vr4130_mulpre")
;; Coarse instruction class consulted by the scheduling hooks for (b) above.
;; Loads and stores are "mem"; HI/LO moves, multiplications,
;; multiply-accumulates and divisions are "mul"; everything else is "alu".
;; NOTE(review): the existence of a "mem" class suggests the hooks use this
;; attribute for (c) as well -- confirm against the hook implementation.
(define_attr "vr4130_class" "mul,mem,alu"
  (cond [(eq_attr "type" "load,store")
         (const_string "mem")

         (eq_attr "type" "mfhilo,mthilo,imul,imul3,imadd,idiv")
         (const_string "mul")]
        (const_string "alu")))
;; Catch-all for "multi" and "unknown" instructions.  Both ALUs, the data
;; cache and the multiply/divide unit are claimed in the same cycle, so no
;; other instruction described in this file can share that cycle.
(define_insn_reservation "vr4130_multi" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "multi,unknown"))
  "vr4130_alu1 + vr4130_alu2 + vr4130_dcache + vr4130_muldiv")
;; Simple integer operations: one-cycle latency, issued on whichever of the
;; two ALUs is free.
(define_insn_reservation "vr4130_int" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "arith,const,logical,move,nop,shift,signext,slt"))
  "vr4130_alu1 | vr4130_alu2")
;; Loads hold the data cache for one cycle; the loaded value has a
;; three-cycle latency.
(define_insn_reservation "vr4130_load" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "load"))
  "vr4130_dcache")
;; Stores hold the data cache for one cycle and have a one-cycle latency.
(define_insn_reservation "vr4130_store" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "store"))
  "vr4130_dcache")
;; Moves from HI/LO hold the multiply/divide unit for one cycle; the
;; destination register has a three-cycle latency.
(define_insn_reservation "vr4130_mfhilo" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "mfhilo"))
  "vr4130_muldiv")
;; Moves to HI/LO hold the multiply/divide unit for one cycle and have a
;; one-cycle latency.
(define_insn_reservation "vr4130_mthilo" 1
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "mthilo"))
  "vr4130_muldiv")
;; SImode multiplication.  LO and HI hold the product after one cycle, and
;; copying it into an integer register costs three more cycles (see the
;; mflo/mfhi reservation above), which gives the total latency of 4.  The
;; same latencies and repeat rates apply to "mtlo; macc" as to "mult; mflo".
;;
;; The multiply/divide unit is held for one cycle, while the fake
;; vr4130_mulpre unit is held for two.
(define_insn_reservation "vr4130_mulsi" 4
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "vr4130_muldiv + (vr4130_mulpre * 2)")
;; DImode multiplication.  Same scheme as vr4130_mulsi, except that LO and
;; HI only hold the product after 3 cycles.  The multiply/divide unit is
;; held for three cycles and the fake vr4130_mulpre unit for four.
(define_insn_reservation "vr4130_muldi" 6
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "(vr4130_muldiv * 3) + (vr4130_mulpre * 4)")
;; Multiply-accumulate.  Back-to-back maccs issue in consecutive cycles
;; without stalling (the unit is held for a single cycle), but the integer
;; destination cannot be read until 3 cycles have passed.
(define_insn_reservation "vr4130_macc" 3
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "imadd"))
  "vr4130_muldiv")
;; Bypasses overriding the default latencies of the multiplication
;; reservations for particular consumers.
;;
;; A vr4130_mulsi or vr4130_macc result consumed by a vr4130_macc has a
;; latency of 1 when the guard mips_linked_madd_p accepts the pair.  The
;; guard is defined outside this file; presumably it checks that the
;; consumer accumulates onto the producer's result -- TODO confirm.
(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_macc" "mips_linked_madd_p")
;; A HI/LO move can follow a vr4130_mulsi or vr4130_macc after one cycle.
(define_bypass 1 "vr4130_mulsi,vr4130_macc" "vr4130_mfhilo")
;; A HI/LO move can follow a vr4130_muldi after 3 cycles.
(define_bypass 3 "vr4130_muldi" "vr4130_mfhilo")
;; SImode division: 36-cycle latency, with the multiply/divide unit held
;; for all 36 cycles.
(define_insn_reservation "vr4130_divsi" 36
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "vr4130_muldiv * 36")
;; DImode division: 72-cycle latency, with the multiply/divide unit held
;; for all 72 cycles.
(define_insn_reservation "vr4130_divdi" 72
  (and (eq_attr "cpu" "r4130")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "vr4130_muldiv * 72")
;; Branches, jumps and calls: zero latency, issued on whichever of the two
;; ALUs is free.
(define_insn_reservation "vr4130_branch" 0
  (and (eq_attr "cpu" "r4130")
       (eq_attr "type" "branch,jump,call"))
  "vr4130_alu1 | vr4130_alu2")
|