/* Overlay manager for SPU.

   Copyright (C) 2006-2014 Free Software Foundation, Inc.

   This file is part of the GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA.  */

/* MFC DMA definitions.  These are the values this file writes to the
   MFC channels when fetching an overlay:
     MFC_GET_CMD	  command word written to $MFC_Cmd.
     MFC_MAX_DMA_SIZE	  cap on the size of a single transfer; a larger
			  overlay is fetched in several chunks.
     MFC_TAG_UPDATE_ALL	  value written to $MFC_WrTagUpdate before the
			  tag status is read back.
     MFC_TAG_ID		  tag written to $MFC_TagId for every transfer;
			  it also selects the bit (1 << MFC_TAG_ID)
			  written to $MFC_WrTagMask while waiting.  */
#define MFC_GET_CMD		0x40
#define MFC_MAX_DMA_SIZE	0x4000
#define MFC_TAG_UPDATE_ALL	2
#define MFC_TAG_ID		0

/* Register usage.

   Every symbolic name below is an alias for one of nine physical
   registers.  All aliases in a group share the same register, so at
   most one alias per group can hold a live value at any point in the
   code; the different names only record what the register means at
   that point.

   reserved1..reserved5 ($75..$79) are written by this file without
   being saved first.

   save1..save4 ($74..$71) are stored below $sp (at -16, -32, -48 and
   -64 respectively) before this file writes to them, and are reloaded
   from the same slots before control leaves.  save4 is only touched
   when OVLY_IRQ_SAVE is defined.  */

/* $75: "parm" is the entry parameter read by the 8-byte-stub variant
   of __ovly_load; afterwards the register is reused for table base
   addresses, the current _ovly_table entry ("vma"/"oldvma"), the tag
   mask and the _ovly_buf_table quadword.  */
#define reserved1	$75
#define parm		$75
#define tab1		reserved1
#define tab2		reserved1
#define vma		reserved1
#define oldvma		reserved1
#define newmask		reserved1
#define map		reserved1

/* $76: byte offsets into the tables, the "present" flag, and assorted
   temporaries of the transfer loop and the tag-status wait.  */
#define reserved2	$76
#define off1		reserved2
#define off2		reserved2
#define present1	reserved2
#define present2	reserved2
#define sz		reserved2
#define cmp		reserved2
#define add64		reserved2
#define cgbits		reserved2
#define off3		reserved2
#define off4		reserved2
#define addr4		reserved2
#define off5		reserved2
#define tagstat		reserved2

/* $77: rotated copies of a table entry, the low word of the effective
   address, the MFC command, and table base addresses.  */
#define reserved3	$77
#define size1		reserved3
#define size2		reserved3
#define rv3		reserved3
#define ealo		reserved3
#define cmd		reserved3
#define off64		reserved3
#define tab3		reserved3
#define tab4		reserved3
#define tab5		reserved3

/* $78: the overlay index being switched to, plus short-lived
   temporaries.  */
#define reserved4	$78
#define ovl		reserved4
#define rv2		reserved4
#define rv5		reserved4
#define cgshuf		reserved4
#define newovl		reserved4
#define irqtmp1		reserved4
#define irqtmp2		reserved4

/* $79: address that is finally branched to.  It has no other alias,
   so it stays live from entry until the closing "bi target".  */
#define reserved5	$79
#define target		reserved5

/* $74: spilled to -16($sp).  */
#define save1		$74
#define rv4		save1
#define rv7		save1
#define tagid		save1
#define maxsize		save1
#define pbyte		save1
#define pbit		save1

/* $73: spilled to -32($sp).  */
#define save2		$73
#define cur		save2
#define rv6		save2
#define osize		save2
#define zovl		save2
#define oldovl		save2
#define newvma		save2

/* $72: spilled to -48($sp).  */
#define save3		$72
#define rv1		save3
#define ea64		save3
#define buf3		save3
#define genwi		save3
#define newmap		save3
#define oldmask		save3

/* $71: spilled to -64($sp); used only under OVLY_IRQ_SAVE, to hold
   the value read from $SPU_RdMachStat across the load.  */
#define save4		$71
#define irq_stat	save4

	.text
/* The three quadwords below live in .text next to the code that uses
   them; they are accessed with the relative forms lqr/stqr.  Each is
   16 bytes and the group starts on a 16-byte boundary.  */
	.align 	4
/* shufb control used by __ovly_load to build the new $lr:
     word 0 (00 01 02 03)  <- word 0 of the first shufb source,
     word 1 (10 11 12 13)  <- word 0 of the second shufb source,
     words 2 and 3 (80..)  <- zero.  */
	.type	__rv_pattern, @object
	.size	__rv_pattern, 16
__rv_pattern:
	.word	0x00010203, 0x10111213, 0x80808080, 0x80808080

/* shufb control used by the transfer loop: copies word 1 of the source
   (the carry out of the low 32 bits) into word 0 and zeroes the rest,
   so that addx can add it into the high 32 bits.  */
	.type	__cg_pattern, @object
	.size	__cg_pattern, 16
__cg_pattern:
	.word	0x04050607, 0x80808080, 0x80808080, 0x80808080

/* Quadword whose first word is the index of the overlay most recently
   switched to.  Written by __ovly_return and __ovly_load, read back by
   __ovly_load and by the load path.  Initially zero.  */
	.type	__ovly_current, @object
	.size	__ovly_current, 16
__ovly_current:
	.space	16

/*
 * __ovly_return - stub for returning from overlay functions.
 *
 * On entry the four slots of $lr are:
 *   __ovly_return, prev ovl index, caller return addr, undefined.
 *
 * Load the previous overlay and jump to the caller return address.
 * Updates __ovly_current.
 *
 * If the low bit of the .size field of the previous overlay's
 * _ovly_table entry is set, the overlay is already present and the
 * stub branches straight to the caller.  Otherwise it falls into
 * do_load with "vma" holding the table entry and "target" holding
 * the caller return address.
 *
 * save1..save3 (and save4 under OVLY_IRQ_SAVE) are only stored here,
 * never modified; the stores exist so that do_load, which does use
 * those registers, can reload them before it branches to target.
 *
 * The trailing "# p,l  c" annotations and the commented-out nop/lnop
 * lines appear to record the issue pipeline, latency and cycle of the
 * hand-made schedule.  NOTE(review): confirm before relying on them.
 */
	.align 	4
	.global	__ovly_return
	.type	__ovly_return, @function
__ovly_return:
/* tab1 = &_ovly_table[-1]; overlay indices are 1-based.  The two
   quadword shifts bring slot 1 (previous overlay index) and slot 2
   (caller return address) of $lr into slot 0.  */
	ila	tab1, _ovly_table - 16				# 0,2	0
	shlqbyi	ovl, $lr, 4					# 1,4	0
#nop
	shlqbyi	target, $lr, 8					# 1,4	1
#nop; lnop
#nop; lnop
/* off1 = ovl * 16, the byte offset of the 16-byte table entry.  */
	shli	off1, ovl, 4					# 0,4	4
#lnop
#nop
/* Hint that the branch at ovly_ret9 goes to the address in target.  */
	hbr	ovly_ret9, target				# 1,15	5
#nop; lnop
#nop; lnop
#nop
/* vma = whole _ovly_table entry { vma, size, file_offset, buf }.  */
	lqx	vma, tab1, off1					# 1,6	8
#ifdef OVLY_IRQ_SAVE
	nop
	stqd	save4, -64($sp)					# 1,6	9
#else
#nop; lnop
#endif
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
/* Rotate the .size word into slot 0 so its low bit can be tested.  */
	rotqbyi	size1, vma, 4					# 1,4	14
#nop
	stqd	save3, -48($sp)					# 1,6	15
#nop
	stqd	save2, -32($sp)					# 1,6	16
#nop
	stqd	save1, -16($sp)					# 1,6	17
/* present1 = size & 1; record the overlay being returned to as the
   current one, then load it if it is not present.  */
	andi	present1, size1, 1				# 0,2	18
	stqr	ovl, __ovly_current				# 1,6	18
#nop; lnop
#nop
	brz	present1, do_load				# 1,4	20
ovly_ret9:
#nop
	bi	target						# 1,4	21

/*
 * __ovly_load - copy an overlay partition to local store.
 *
 * With OVL_STUB_SIZE == 8, on entry $75 points to a word consisting
 * of the overlay index in the top 14 bits, and the target address in
 * the bottom 18 bits.
 *
 * With any other OVL_STUB_SIZE the code below reads "ovl" ($78) and
 * "target" ($79) without setting them, so both must already hold the
 * overlay index and the target address on entry.  NOTE(review):
 * presumably the 16-byte call stub loads them; confirm against the
 * stub generator.
 *
 * Sets up $lr to return via __ovly_return.  If $lr is already set
 * to return via __ovly_return, don't change it.  In that case we
 * have a tail call from one overlay function to another.
 * Updates __ovly_current.
 *
 * save1..save3 are spilled, used as temporaries while the new $lr is
 * built, and reloaded before the final branch.  The spilled copies
 * remain on the stack for do_load, which is entered when the low bit
 * of the overlay's .size field is clear.
 */
	.align  3
	.global	__ovly_load
	.type	__ovly_load, @function
__ovly_load:
#if OVL_STUB_SIZE == 8
########
#nop
/* Fetch the quadword containing the parameter word and rotate that
   word into slot 0.  */
	lqd	target, 0(parm)					# 1,6	-11
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
	rotqby	target, target, parm				# 1,4	-5
	ila	tab2, _ovly_table - 16				# 0,2	-4
	stqd	save3, -48($sp)					# 1,6	-4
#nop
	stqd	save2, -32($sp)					# 1,6	-3
#nop
	stqd	save1, -16($sp)					# 1,6	-2
/* ovl = parameter word >> 18 (logical), i.e. its top 14 bits.  */
	rotmi	ovl, target, -18				# 0,4	-1
	hbr	ovly_load9, target				# 1,15	-1
	ila	rv1, __ovly_return				# 0,2	0
#lnop
#nop; lnop
#nop
/* Read the outgoing overlay index before overwriting it.  */
	lqr	cur, __ovly_current				# 1,6	2
	shli	off2, ovl, 4					# 0,4	3
	stqr	ovl, __ovly_current				# 1,6	3
/* rv2 slot 0 is all ones when $lr already points at __ovly_return.  */
	ceq	rv2, $lr, rv1					# 0,2	4
	lqr	rv3, __rv_pattern				# 1,6	4
#nop; lnop
#nop; lnop
#nop
	lqx	vma, tab2, off2					# 1,6	7
########
#else /* OVL_STUB_SIZE == 16 */
########
/* Same work as the branch above, minus the decoding of the parameter
   word: ovl and target are taken as already set.  */
	ila	tab2, _ovly_table - 16				# 0,2	0
	stqd	save3, -48($sp)					# 1,6	0
	ila	rv1, __ovly_return				# 0,2	1
	stqd	save2, -32($sp)					# 1,6	1
	shli	off2, ovl, 4					# 0,4	2
	lqr	cur, __ovly_current				# 1,6	2
	nop
	stqr	ovl, __ovly_current				# 1,6	3
	ceq	rv2, $lr, rv1					# 0,2	4
	lqr	rv3, __rv_pattern				# 1,6	4
#nop
	hbr	ovly_load9, target				# 1,15	5
#nop
	lqx	vma, tab2, off2					# 1,6	6
#nop
	stqd	save1, -16($sp)					# 1,6	7
########
#endif

/* Common tail.  Build the replacement link register
     rv7 = { __ovly_return, outgoing overlay index,
	     old $lr slot 0, old $lr slot 1 }
   from rv4 = { __ovly_return, cur, 0, 0 } and rv6 = $lr shifted right
   by eight bytes.  rv5 expands the compare result into a full-width
   mask, so selb keeps the old $lr when it already pointed at
   __ovly_return and installs rv7 otherwise.  */
#nop; lnop
#nop; lnop
#nop
	shufb	rv4, rv1, cur, rv3				# 1,4	10
#nop
	fsmb	rv5, rv2					# 1,4	11
#nop
	rotqmbyi rv6, $lr, -8					# 1,4	12
#nop
/* Rotate the .size word of the table entry into slot 0.  */
	rotqbyi	size2, vma, 4					# 1,4	13
#nop
/* Reload each save register as soon as its last use as a temporary
   has issued.  */
	lqd	save3, -48($sp)					# 1,6	14
#nop; lnop
	or	rv7, rv4, rv6					# 0,2	16
	lqd	save2, -32($sp)					# 1,6	16
	andi	present2, size2, 1				# 0,2	17
#ifdef OVLY_IRQ_SAVE
	stqd	save4, -64($sp)					# 1,6	17
#else
	lnop							# 1,0	17
#endif
	selb	$lr, rv7, $lr, rv5				# 0,2	18
	lqd	save1, -16($sp)					# 1,6	18
#nop
/* Overlay not present: go and fetch it; do_load ends by branching to
   target itself.  */
	brz	present2, do_load				# 1,4	19
ovly_load9:
#nop
	bi	target						# 1,4	20

/* If we get here, we are about to load a new overlay.
 * "vma" contains the relevant entry from _ovly_table[].
 *	extern struct {
 *		u32 vma;
 *		u32 size;
 *		u32 file_offset;
 *		u32 buf;
 *	} _ovly_table[];
 *
 * Also expected on entry: "target" holds the address to branch to
 * when done, __ovly_current holds the index of the overlay being
 * loaded, and the caller's save1..save3 (plus save4 under
 * OVLY_IRQ_SAVE) have been stored below $sp.  This code overwrites
 * those registers and reloads them from the stack before leaving.
 *
 * Under OVLY_IRQ_SAVE the machine status is read into irq_stat and
 * the code continues through "bid"; on the way out "binze" is taken
 * when bit 0 of the saved status is set.  NOTE(review): this looks
 * like "disable interrupts for the duration of the load, re-enable
 * them only if they were enabled" -- confirm against the SPU ISA.
 */
	.align  3
	.global	__ovly_load_event
	.type	__ovly_load_event, @function
__ovly_load_event:
do_load:
#ifdef OVLY_IRQ_SAVE
	ila	irqtmp1, do_load10				# 0,2	-5
	rotqbyi	sz, vma, 8					# 1,4	-5
#nop
	rdch	irq_stat, $SPU_RdMachStat			# 1,6	-4
#nop
	bid	irqtmp1						# 1,4	-3
do_load10:
	nop
#else
#nop
	rotqbyi	sz, vma, 8					# 1,4	0
#endif
/* At this point sz = file_offset (slot 0), which the first loop
   iteration adds to the base address.  osize = bytes left to fetch;
   its low "present" bit is known to be clear, otherwise do_load would
   not have been entered.  ea64 = 64-bit value loaded from _EAR_.  */
	rotqbyi	osize, vma, 4					# 1,4	1
#nop
	lqa	ea64, _EAR_					# 1,6	2
#nop
	lqr	cgshuf, __cg_pattern				# 1,6	3

/* We could predict the branch at the end of this loop by adding a few
   instructions, and there are plenty of free cycles to do so without
   impacting loop execution time.  However, it doesn't make a great
   deal of sense since we need to wait for the dma to complete anyway.  */
/* Each iteration:
     ea64  += sz	(64-bit add; sz is the file offset the first
			 time round and the previous chunk size after)
     sz     = osize > MFC_MAX_DMA_SIZE ? MFC_MAX_DMA_SIZE : osize
     queue an MFC_GET_CMD of sz bytes from ea64 to local store "vma"
     osize -= sz;  vma += sz
   and repeats until osize reaches zero.  */
__ovly_xfer_loop:
#nop
/* off64 = sz moved to the low word of a 64-bit quantity.  */
	rotqmbyi off64, sz, -4					# 1,4	4
#nop; lnop
#nop; lnop
#nop; lnop
/* cg yields per-word carries; the shuffle moves the carry of the low
   word to the high word's position, where addx consumes it.  */
	cg	cgbits, ea64, off64				# 0,2	8
#lnop
#nop; lnop
#nop
	shufb	add64, cgbits, cgbits, cgshuf			# 1,4	10
#nop; lnop
#nop; lnop
#nop; lnop
	addx	add64, ea64, off64				# 0,2	14
#lnop
	ila	maxsize, MFC_MAX_DMA_SIZE			# 0,2	15
	lnop
/* add64 shares a register with sz and cmp, so copy the sum to ea64
   (high word in slot 0) and ealo (low word in slot 0) first.  */
	ori	ea64, add64, 0					# 0,2	16
	rotqbyi	ealo, add64, 4					# 1,4	16
	cgt	cmp, osize, maxsize				# 0,2	17
	wrch	$MFC_LSA, vma					# 1,6	17
#nop; lnop
	selb	sz, osize, maxsize, cmp				# 0,2	19
	wrch	$MFC_EAH, ea64					# 1,6	19
	ila	tagid, MFC_TAG_ID				# 0,2	20
	wrch	$MFC_EAL, ealo					# 1,6	20
	ila	cmd, MFC_GET_CMD				# 0,2	21
	wrch	$MFC_Size, sz					# 1,6	21
	sf	osize, sz, osize				# 0,2	22
	wrch	$MFC_TagId, tagid				# 1,6	22
	a	vma, vma, sz					# 0,2	23
	wrch	$MFC_Cmd, cmd					# 1,6	23
#nop
	brnz	osize, __ovly_xfer_loop				# 1,4	24

/* Now update our data structures while waiting for DMA to complete.
   Low bit of .size needs to be cleared on the _ovly_table entry
   corresponding to the evicted overlay, and set on the entry for the
   newly loaded overlay.  Note that no overlay may in fact be evicted
   as _ovly_buf_table[] starts with all zeros.  Don't zap .size entry
   for zero index!  Also of course update the _ovly_buf_table entry.  */
#nop
/* The table entry in "vma" was consumed by the loop; fetch it again
   through the index stored in __ovly_current.  */
	lqr	newovl, __ovly_current				# 1,6	25
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
	shli	off3, newovl, 4					# 0,4	31
#lnop
	ila	tab3, _ovly_table - 16				# 0,2	32
#lnop
#nop
/* pbyte = 0xff in byte 7 only; and-ing with 1 leaves pbit =
   { 0, 1, 0, 0 }, i.e. the low bit of the .size slot.  */
	fsmbi	pbyte, 0x100					# 1,4	33
#nop; lnop
#nop
	lqx	vma, tab3, off3					# 1,6	35
#nop; lnop
	andi	pbit, pbyte, 1					# 0,2	37
	lnop
#nop; lnop
#nop; lnop
#nop; lnop
/* Mark the new overlay present, and bring its .buf word to slot 0.  */
	or	newvma, vma, pbit				# 0,2	41
	rotqbyi	buf3, vma, 12					# 1,4	41
#nop; lnop
#nop
	stqx	newvma, tab3, off3				# 1,6	43
#nop; lnop
/* _ovly_buf_table has one word per buffer and is indexed from 1:
   the entry address is (_ovly_buf_table - 4) + buf * 4.  */
	shli	off4, buf3, 2					# 1,4	45
#lnop
	ila	tab4, _ovly_buf_table - 4			# 0,2	46
#lnop
#nop; lnop
#nop; lnop
#nop
/* map = quadword containing the entry; genwi = shufb control that
   inserts a word at the entry's position within that quadword.  */
	lqx	map, tab4, off4					# 1,6	49
#nop
	cwx	genwi, tab4, off4				# 1,4	50
	a	addr4, tab4, off4				# 0,2	51
#lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
/* oldovl = overlay index previously recorded for this buffer (zero if
   none); newmap = map with that entry replaced by newovl.  */
	rotqby	oldovl, map, addr4				# 1,4	55
#nop
	shufb	newmap, newovl, map, genwi			# 0,4	56
/* newmask = 1 << MFC_TAG_ID; ilhu supplies the upper halfword when
   the tag is 16 or above.  */
#if MFC_TAG_ID < 16
	ila	newmask, 1 << MFC_TAG_ID			# 0,2	57
#else
	ilhu	newmask, 1 << (MFC_TAG_ID - 16)			# 0,2	57
#endif
#lnop
#nop; lnop
#nop; lnop
	stqd	newmap, 0(addr4)				# 1,6	60

/* Save app's tagmask, wait for DMA complete, restore mask.  */
	ila	tagstat, MFC_TAG_UPDATE_ALL			# 0,2	61
	rdch	oldmask, $MFC_RdTagMask				# 1,6	61
#nop
	wrch	$MFC_WrTagMask, newmask				# 1,6	62
#nop
	wrch	$MFC_WrTagUpdate, tagstat			# 1,6	63
#nop
	rdch	tagstat, $MFC_RdTagStat				# 1,6	64
#nop
	sync							# 1,4	65
/* Any hint prior to the sync is lost.  A hint here allows the branch
   to complete 15 cycles after the hint.  With no hint the branch will
   take 18 or 19 cycles.  */
	ila	tab5, _ovly_table - 16				# 0,2	66
	hbr	do_load99, target				# 1,15	66
	shli	off5, oldovl, 4					# 0,4	67
	wrch	$MFC_WrTagMask, oldmask				# 1,6	67
/* zovl = all ones when oldovl == 0.  In that case pbit is cleared, so
   the load/modify/store below rewrites the quadword at
   _ovly_table - 16 with its own contents.  Otherwise the present bit
   of the evicted overlay's .size is cleared.  */
	ceqi	zovl, oldovl, 0					# 0,2	68
#lnop
#nop; lnop
#nop
	fsm	zovl, zovl					# 1,4	70
#nop
	lqx	oldvma, tab5, off5				# 1,6	71
#nop
/* From here on the save registers are reloaded from the stack, each
   one after its last use as a temporary.  */
	lqd	save3, -48($sp)					# 1,6	72
#nop; lnop
	andc	pbit, pbit, zovl				# 0,2	74
	lqd	save2, -32($sp)					# 1,6	74
#ifdef OVLY_IRQ_SAVE
	ila	irqtmp2, do_load90				# 0,2	75
#lnop
	andi	irq_stat, irq_stat, 1				# 0,2	76
#lnop
#else
#nop; lnop
#nop; lnop
#endif
	andc	oldvma, oldvma, pbit				# 0,2	77
	lqd	save1, -16($sp)					# 1,6	77
	nop	       						# 0,0	78
#lnop
#nop
	stqx	oldvma, tab5, off5				# 1,6	79
#nop
#ifdef OVLY_IRQ_SAVE
/* Taken only when bit 0 of the saved machine status was set; either
   way execution continues at do_load90.  */
	binze	irq_stat, irqtmp2				# 1,4	80
do_load90:
#nop
	lqd	save4, -64($sp)					# 1,6	84
#else
#nop; lnop
#endif

/* Global label on a nop executed once per completed load, just before
   the jump to the target.  NOTE(review): presumably a hook for a
   debugger breakpoint -- confirm.  */
	.global	_ovly_debug_event
	.type	_ovly_debug_event, @function
_ovly_debug_event:
	nop
/* Branch to target address. */
do_load99:
	bi	target						# 1,4	81/85

/* The size of __ovly_load is made to span everything from its entry
   point to here, including the load path above.  */
	.size	__ovly_load, . - __ovly_load