aboutsummaryrefslogtreecommitdiff
path: root/ld/emultempl/spu_ovl.S
blob: 3f9c83bbca4e18baae3595891fbae1f872a1a026 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/* Overlay manager for SPU.

   Copyright 2006, 2007 Free Software Foundation, Inc.

   This file is part of the GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA.  */

/* MFC DMA defn's.  */
/* MFC_GET_CMD is the value written to $MFC_Cmd for every chunk.
   MFC_MAX_DMA_SIZE caps the size of each chunk in __ovly_xfer_loop.
   MFC_TAG_UPDATE_ALL is written to $MFC_WrTagUpdate when waiting.
   MFC_TAG_ID is written to $MFC_TagId and also selects the bit used
   for the temporary tag mask.  */
#define MFC_GET_CMD		0x40
#define MFC_MAX_DMA_SIZE	0x4000
#define MFC_TAG_UPDATE_ALL	2
#define MFC_TAG_ID		0

/* Register usage.  */
/* Eight physical registers are used.  Every other name below is an
   alias for one of them, naming the value that register holds during
   one stretch of code.  Aliases of the same register share storage:
   writing one alias destroys whatever any other alias of that
   register held.  Before adding or moving an instruction, check that
   no live alias of the destination register is clobbered.  */

/* $75: on entry to the 8-byte-stub variant of __ovly_load this is
   "parm", the address of the stub's parameter word.  */
#define reserved1	$75
#define parm		$75
#define tab1		reserved1
#define tab2		reserved1
#define vma		reserved1
#define oldvma		reserved1
#define newmask		reserved1
#define map		reserved1

/* $76: short-lived offsets, flags and DMA bookkeeping.  */
#define reserved2	$76
#define off1		reserved2
#define off2		reserved2
#define present1	reserved2
#define present2	reserved2
#define sz		reserved2
#define cmp		reserved2
#define add64		reserved2
#define cgbits		reserved2
#define off3		reserved2
#define off4		reserved2
#define off5		reserved2
#define tagstat		reserved2

/* $77: table bases and DMA operands.  */
#define reserved3	$77
#define buf1		reserved3
#define buf2		reserved3
#define rv3		reserved3
#define ealo		reserved3
#define cmd		reserved3
#define off64		reserved3
#define tab3		reserved3
#define tab4		reserved3
#define tab5		reserved3

/* $78: overlay index ("ovl") and later temporaries.  */
#define reserved4	$78
#define ovl		reserved4
#define rv2		reserved4
#define rv5		reserved4
#define cgshuf		reserved4
#define newovl		reserved4

/* $79: the address that is finally branched to.  It has no other
   alias and is never overwritten once set.  */
#define reserved5	$79
#define target		reserved5

/* $72-$74 are not free scratch: the code in this file stores them at
   -16($sp), -32($sp) and -48($sp) respectively before writing any of
   their aliases, and reloads them from the same slots before
   branching to the target.  */
#define save1		$72
#define rv4		save1
#define rv7		save1
#define tagid		save1
#define maxsize		save1
#define pbyte		save1
#define pbit		save1

#define save2		$73
#define cur		save2
#define rv6		save2
#define osize		save2
#define zovl		save2
#define oldovl		save2
#define newvma		save2

#define save3		$74
#define rv1		save3
#define ea64		save3
#define buf3		save3
#define genwi		save3
#define newmap		save3
#define oldmask		save3


	.text
/* The three quadword objects below live in .text next to the code.
   They are addressed with displacements of the form
   (symbol - __ovly_return) from a register holding __ovly_return,
   so they must stay in the same section as __ovly_return.  */
	.align 	4
/* shufb control used by __ovly_load as "shufb rv4, rv1, cur, rv3".
   Bytes 0x00-0x03 copy word 0 of the first source (rv1), bytes
   0x10-0x13 copy word 0 of the second source (cur), and each 0x80
   byte yields zero.  The result is
   (__ovly_return, previous __ovly_current word 0, 0, 0).  */
	.type	__rv_pattern, @object
	.size	__rv_pattern, 16
__rv_pattern:
	.word	0x00010203, 0x10111213, 0x80808080, 0x80808080

/* shufb control used by __ovly_load_event as
   "shufb add64, cgbits, cgbits, cgshuf".  Bytes 0x04-0x07 move word 1
   of the source into word 0; every other result byte is zero.  This
   moves the carry out of the low word into the high word's position
   for the addx that follows.  */
	.type	__cg_pattern, @object
	.size	__cg_pattern, 16
__cg_pattern:
	.word	0x04050607, 0x80808080, 0x80808080, 0x80808080

/* One quadword, initially zero.  __ovly_return and __ovly_load store
   the whole "ovl" register here, so word 0 is the index of the
   current overlay; the other three words are whatever "ovl" held in
   its remaining slots.  */
	.type	__ovly_current, @object
	.size	__ovly_current, 16
__ovly_current:
	.space	16

/*
 * __ovly_return - stub for returning from overlay functions.
 *
 * On entry the four slots of $lr are:
 *   __ovly_return, prev ovl index, caller return addr, undefined.
 *
 * Load the previous overlay and jump to the caller return address.
 * Updates __ovly_current.
 *
 * If the previous overlay's table entry has its present bit set, this
 * branches straight to the caller return address.  Otherwise it
 * enters __ovly_load_event with vma = the table entry and target =
 * the caller return address.
 *
 * save1-save3 are stored below $sp but never written here.  The
 * stores exist because __ovly_load_event uses those registers as
 * scratch and reloads them from the same stack slots.
 *
 * The trailing "# pipe,latency cycle" annotations and the commented
 * out nop/lnop lines record the hand-made instruction schedule.
 */
	.align 	4
	.global	__ovly_return
	.type	__ovly_return, @function
__ovly_return:
/* tab1 is biased by -16 so that overlay index 1 addresses the first
   16-byte _ovly_table entry.  */
	ila	tab1, _ovly_table - 16				# 0,2	0
/* Shift $lr left by 4 and by 8 bytes to bring slot 1 (previous
   overlay index) and slot 2 (caller return address) into slot 0.  */
	shlqbyi	ovl, $lr, 4					# 1,4	0
#nop
	shlqbyi	target, $lr, 8					# 1,4	1
#nop; lnop
#nop; lnop
/* off1 = ovl * 16, the byte offset of the table entry.  */
	shli	off1, ovl, 4					# 0,4	4
#lnop
#nop
/* Hint the final branch as early as the target is known.  */
	hbr	ovly_ret9, target				# 1,15	5
#nop; lnop
#nop; lnop
#nop
/* vma = whole table entry (vma, size, file_offset, buf).  This
   overwrites tab1, which shares $75.  */
	lqx	vma, tab1, off1					# 1,6	8
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
/* Rotate the .buf word (bytes 12-15) into slot 0.  */
	rotqbyi	buf1, vma, 12					# 1,4	14
#nop
	stqd	save3, -48($sp)					# 1,6	15
#nop
	stqd	save2, -32($sp)					# 1,6	16
#nop
	stqd	save1, -16($sp)					# 1,6	17
/* The low bit of .buf is the "present" flag.  */
	andi	present1, buf1, 1				# 0,2	18
/* Record the overlay being returned to as current.  $lr slot 0 holds
   __ovly_return, so the displacement reaches __ovly_current.  */
	stqd	ovl, (__ovly_current - __ovly_return)($lr)	# 1,6	18
#nop; lnop
#nop
/* Not present: go and load it.  */
	brz	present1, __ovly_load_event			# 1,4	20
ovly_ret9:
#nop
	bi	target						# 1,4	21

/*
 * __ovly_load - copy an overlay partition to local store.
 *
 * When OVL_STUB_SIZE == 8: on entry $75 points to a word consisting
 * of the overlay index in the top 14 bits, and the target address in
 * the bottom 18 bits.
 *
 * Otherwise (16-byte stubs): the code below reads "ovl" ($78) and
 * "target" ($79) without writing them first, so they must already
 * hold the overlay index and the target address on entry.
 * NOTE(review): presumably the 16-byte stub sets them up; confirm
 * against the linker's stub generator.
 *
 * Sets up $lr to return via __ovly_return.
 * Updates __ovly_current.
 *
 * If the overlay's present bit is already set, this branches straight
 * to target.  Otherwise it enters __ovly_load_event with vma = the
 * overlay's _ovly_table entry.
 */
	.align  3
	.global	__ovly_load
	.type	__ovly_load, @function
__ovly_load:
#if OVL_STUB_SIZE == 8
########
#nop
/* Load the quadword containing the parameter word, then rotate by
   the low bits of its address to bring that word into slot 0.  */
	lqd	target, 0(parm)					# 1,6	-11
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
	rotqby	target, target, parm				# 1,4	-5
/* tab2 shares $75 with parm, so parm is dead from here on.  */
	ila	tab2, _ovly_table - 16				# 0,2	-4
	stqd	save3, -48($sp)					# 1,6	-4
#nop
	stqd	save2, -32($sp)					# 1,6	-3
#nop
	stqd	save1, -16($sp)					# 1,6	-2
/* Logical shift right by 18 leaves the 14-bit overlay index.  target
   keeps the whole word, overlay index included.
   NOTE(review): the later "bi target" therefore relies on the branch
   ignoring address bits above the local-store range; confirm.  */
	rotmi	ovl, target, -18				# 0,4	-1
	hbr	ovly_load9, target				# 1,15	-1
	ila	rv1, __ovly_return				# 0,2	0
#lnop
#nop; lnop
#nop
/* cur = previous __ovly_current, fetched before it is overwritten
   with the new overlay index two instructions later.  */
	lqd	cur, (__ovly_current - __ovly_return)(rv1)	# 1,6	2
	shli	off2, ovl, 4					# 0,4	3
	stqd	ovl, (__ovly_current - __ovly_return)(rv1)	# 1,6	3
/* rv2 slot 0 is all ones when $lr already points at __ovly_return.  */
	ceq	rv2, $lr, rv1					# 0,2	4
	lqd	rv3, (__rv_pattern - __ovly_return)(rv1)	# 1,6	4
#nop; lnop
#nop; lnop
#nop
	lqx	vma, tab2, off2					# 1,6	7
########
#else /* OVL_STUB_SIZE == 16 */
########
/* Same steps as the 8-byte variant, minus decoding the parameter
   word: ovl and target are taken as already set.  */
	ila	tab2, _ovly_table - 16				# 0,2	0
	stqd	save3, -48($sp)					# 1,6	0
	ila	rv1, __ovly_return				# 0,2	1
	stqd	save2, -32($sp)					# 1,6	1
	shli	off2, ovl, 4					# 0,4	2
	lqa	cur, __ovly_current				# 1,6	2
	nop
	stqa	ovl, __ovly_current				# 1,6	3
	ceq	rv2, $lr, rv1					# 0,2	4
	lqd	rv3, (__rv_pattern - __ovly_return)(rv1)	# 1,6	4
#nop
	hbr	ovly_load9, target				# 1,15	5
#nop
	lqx	vma, tab2, off2					# 1,6	6
#nop
	stqd	save1, -16($sp)					# 1,6	7
########
#endif

/* Common tail: build the $lr value that __ovly_return expects.
   Here rv1 = __ovly_return, cur = previous __ovly_current,
   rv3 = __rv_pattern, rv2 = ceq result and vma = table entry.  */
#nop; lnop
#nop; lnop
#nop
/* rv4 = (__ovly_return, previous overlay index, 0, 0).  */
	shufb	rv4, rv1, cur, rv3				# 1,4	10
#nop
/* Expand the compare result into a byte-select mask (rv5).  */
	fsmb	rv5, rv2					# 1,4	11
#nop
/* Shift $lr right by 8 bytes: the caller's return address moves from
   slot 0 to slot 2 and slots 0-1 become zero.  */
	rotqmbyi rv6, $lr, -8					# 1,4	12
#nop
/* .buf word of the table entry into slot 0 for the present test.  */
	rotqbyi	buf2, vma, 12					# 1,4	13
#nop
/* From here the saved registers are reloaded as soon as their last
   alias has been read.  */
	lqd	save3, -48($sp)					# 1,6	14
#nop; lnop
/* rv7 = (__ovly_return, previous overlay index, caller return
   address, old $lr slot 1).  */
	or	rv7, rv4, rv6					# 0,2	16
	lqd	save2, -32($sp)					# 1,6	16
	andi	present2, buf2, 1				# 0,2	17
	lnop							# 1,0	17
/* Where rv5 is set, keep the existing $lr; elsewhere take rv7.  The
   effect is that an $lr already pointing at __ovly_return is left
   alone rather than wrapped a second time.  */
	selb	$lr, rv7, $lr, rv5				# 0,2	18
	lqd	save1, -16($sp)					# 1,6	18
#nop
/* Not present: go and load it.  The stack slots still hold
   save1-save3 for __ovly_load_event to reload.  */
	brz	present2, __ovly_load_event			# 1,4	19
ovly_load9:
#nop
	bi	target						# 1,4	20

/* If we get here, we are about to load a new overlay.
 * "vma" contains the relevant entry from _ovly_table[].
 *	extern struct {
 *		u32 vma;
 *		u32 size;
 *		u32 file_offset;
 *		u32 buf;
 *	} _ovly_table[];
 *
 * Also expected on entry, as established by both callers in this file:
 *   - $lr slot 0 holds __ovly_return (used as the base for reaching
 *     __cg_pattern and __ovly_current);
 *   - __ovly_current already holds the index of the overlay to load;
 *   - target holds the address to branch to when done;
 *   - save1-save3 have been stored at -16/-32/-48($sp); they are used
 *     as scratch here and reloaded from those slots before the final
 *     branch.
 */
	.align  3
	.global	__ovly_load_event
	.type	__ovly_load_event, @function
__ovly_load_event:
#nop
/* Bring .file_offset into slot 0.  On the first pass "sz" is this
   offset; on later passes it is the size of the chunk just queued.
   Either way it is the amount to add to the effective address.  */
	rotqbyi	sz, vma, 8					# 1,4	0
#nop
/* Bring .size into slot 0: bytes still to transfer.  */
	rotqbyi	osize, vma, 4					# 1,4	1
#nop
/* 64-bit effective address base: slot 0 is written to $MFC_EAH and
   slot 1 to $MFC_EAL below.  _EAR_ is defined outside this file.  */
	lqa	ea64, _EAR_					# 1,6	2
#nop
	lqd	cgshuf, (__cg_pattern - __ovly_return)($lr)	# 1,6	3

/* We could predict the branch at the end of this loop by adding a few
   instructions, and there are plenty of free cycles to do so without
   impacting loop execution time.  However, it doesn't make a great
   deal of sense since we need to wait for the dma to complete anyway.  */
__ovly_xfer_loop:
#nop
/* Shift right by 4 bytes so the increment sits in slot 1 with zero in
   slot 0, i.e. as a 64-bit quantity lined up with ea64.  */
	rotqmbyi off64, sz, -4					# 1,4	4
#nop; lnop
#nop; lnop
#nop; lnop
/* 64-bit add from 32-bit pieces: cg gives the per-word carry-out,
   shufb moves the low word's carry into the high word's position,
   and addx adds ea64 + off64 + that carry bit.  */
	cg	cgbits, ea64, off64				# 0,2	8
#lnop
#nop; lnop
#nop
	shufb	add64, cgbits, cgbits, cgshuf			# 1,4	10
#nop; lnop
#nop; lnop
#nop; lnop
	addx	add64, ea64, off64				# 0,2	14
#lnop
	ila	maxsize, MFC_MAX_DMA_SIZE			# 0,2	15
	lnop
/* Copy the sum back to ea64 before add64's register is reused as
   cmp and sz.  */
	ori	ea64, add64, 0					# 0,2	16
	rotqbyi	ealo, add64, 4					# 1,4	16
/* sz = osize if osize <= maxsize, else maxsize.  */
	cgt	cmp, osize, maxsize				# 0,2	17
	wrch	$MFC_LSA, vma					# 1,6	17
#nop; lnop
	selb	sz, osize, maxsize, cmp				# 0,2	19
	wrch	$MFC_EAH, ea64					# 1,6	19
	ila	tagid, MFC_TAG_ID				# 0,2	20
	wrch	$MFC_EAL, ealo					# 1,6	20
	ila	cmd, MFC_GET_CMD				# 0,2	21
	wrch	$MFC_Size, sz					# 1,6	21
/* osize -= sz (bytes remaining).  */
	sf	osize, sz, osize				# 0,2	22
	wrch	$MFC_TagId, tagid				# 1,6	22
/* Advance the local-store address for the next chunk.  */
	a	vma, vma, sz					# 0,2	23
/* The command channel is written last, after all its parameters.  */
	wrch	$MFC_Cmd, cmd					# 1,6	23
#nop
	brnz	osize, __ovly_xfer_loop				# 1,4	24

/* Now update our data structures while waiting for DMA to complete.
   Low bit of .buf needs to be cleared on the _ovly_table entry
   corresponding to the evicted overlay, and set on the entry for the
   newly loaded overlay.  Note that no overlay may in fact be evicted
   as _ovly_buf_table[] starts with all zeros.  Don't zap .buf entry
   for zero index!  Also of course update the _ovly_buf_table entry.  */
#nop
	lqd	newovl, (__ovly_current - __ovly_return)($lr)	# 1,6	25
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
	shli	off3, newovl, 4					# 0,4	31
#lnop
	ila	tab3, _ovly_table - 16				# 0,2	32
#lnop
#nop
/* pbyte has only byte 15 set to 0xff; the andi with 1 below turns
   that into pbit = (0, 0, 0, 1), the low bit of the .buf word.  */
	fsmbi	pbyte, 1					# 1,4	33
#nop; lnop
#nop
/* Re-read the new overlay's table entry; the loop advanced vma.  */
	lqx	vma, tab3, off3					# 1,6	35
#nop; lnop
	andi	pbit, pbyte, 1					# 0,2	37
	lnop
#nop; lnop
#nop; lnop
#nop; lnop
/* Set the present bit in the new overlay's entry and store it back.
   buf3 takes .buf from the entry as read, before the bit is set.  */
	or	newvma, vma, pbit				# 0,2	41
	rotqbyi	buf3, vma, 12					# 1,4	41
#nop; lnop
#nop
	stqx	newvma, tab3, off3				# 1,6	43
#nop; lnop
/* off4 = .buf * 4: byte offset of this buffer's word in
   _ovly_buf_table.  */
	shli	off4, buf3, 2					# 1,4	45
#lnop
	ila	tab4, _ovly_buf_table				# 0,2	46
#lnop
#nop; lnop
#nop; lnop
#nop
/* Read-modify-write of one word inside a quadword: load the
   quadword (map), generate the word-insert control for this address
   (genwi), extract the old occupant into slot 0 (oldovl), then insert
   the new overlay index (newmap) and store the quadword back.  */
	lqx	map, tab4, off4					# 1,6	49
#nop
	cwx	genwi, tab4, off4				# 1,4	50
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
	rotqby	oldovl, map, off4				# 1,4	55
	nop
	shufb	newmap, newovl, map, genwi			# 0,4	56
/* newmask = the single tag-mask bit for MFC_TAG_ID.  ila is used
   for bit positions below 16 and ilhu (upper halfword) otherwise.  */
#if MFC_TAG_ID < 16
	ila	newmask, 1 << MFC_TAG_ID			# 0,2	57
#else
	ilhu	newmask, 1 << (MFC_TAG_ID - 16)			# 0,2	57
#endif
#lnop
#nop; lnop
#nop; lnop
	stqx	newmap, tab4, off4				# 1,6	60

/* Save app's tagmask, wait for DMA complete, restore mask.  */
	ila	tagstat, MFC_TAG_UPDATE_ALL			# 0,2	61
	rdch	oldmask, $MFC_RdTagMask				# 1,6	61
#nop
	wrch	$MFC_WrTagMask, newmask				# 1,6	62
#nop
	wrch	$MFC_WrTagUpdate, tagstat			# 1,6	63
#nop
/* Reading the tag status is the wait; the value read is unused.  */
	rdch	tagstat, $MFC_RdTagStat				# 1,6	64
#nop
	sync							# 1,4	65
/* Any hint prior to the sync is lost.  A hint here allows the branch
   to complete 15 cycles after the hint.  With no hint the branch will
   take 18 or 19 cycles.  */
	ila	tab5, _ovly_table - 16				# 0,2	66
	hbr	do_load99, target				# 1,15	66
	shli	off5, oldovl, 4					# 0,4	67
	wrch	$MFC_WrTagMask, oldmask				# 1,6	67
/* Clear the present bit on the evicted overlay's entry, unless
   oldovl is 0 (nothing was evicted).  zovl becomes all ones across
   the quadword when oldovl == 0, which zeroes pbit so that the andc
   on oldvma changes nothing.  */
	ceqi	zovl, oldovl, 0					# 0,2	68
#lnop
#nop; lnop
#nop
	fsm	zovl, zovl					# 1,4	70
#nop
	lqx	oldvma, tab5, off5				# 1,6	71
#nop
/* Reload each saved register once its last alias has been read.  */
	lqd	save3, -48($sp)					# 1,6	72
#nop; lnop
	andc	pbit, pbit, zovl				# 0,2	74
	lqd	save2, -32($sp)					# 1,6	74
#nop; lnop
#nop; lnop
	andc	oldvma, oldvma, pbit				# 0,2	77
	lqd	save1, -16($sp)					# 1,6	77
#nop; lnop
	nop
	stqx	oldvma, tab5, off5				# 1,6	79
#nop; lnop

/* Global label on a nop, reached after the overlay has been loaded
   and the tables updated.
   NOTE(review): presumably a breakpoint location for a debugger;
   nothing in this file branches to it.  */
	.global	_ovly_debug_event
	.type	_ovly_debug_event, @function
_ovly_debug_event:
	nop
/* Branch to target address. */
do_load99:
	bi	target						# 1,4	81

/* The size recorded for __ovly_load spans everything from its entry
   point to here, including __ovly_load_event and _ovly_debug_event.  */
	.size	__ovly_load, . - __ovly_load