/* Overlay manager for SPU.

   Copyright 2006, 2007 Free Software Foundation, Inc.

   This file is part of the GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA.  */

/**
 * MFC DMA definitions used by the overlay loader below.
 */
/* Command value written to the $MFC_Cmd channel for each overlay transfer. */
#define MFC_GET_CMD		0x40
/* Upper bound on the size of a single DMA request; larger overlays are
 * transferred in several requests by __ovly_xfer_loop. */
#define MFC_MAX_DMA_SIZE	0x4000
/* Value written to $MFC_WrTagUpdate before reading $MFC_RdTagStat. */
#define MFC_TAG_UPDATE_ALL	2
/* DMA tag group used for overlay transfers; also used to build the tag
 * mask that is waited on after the transfers are queued. */
#define MFC_TAG_ID		0


/**
 * Temporary register allocations.
 *
 * Several names alias the same physical register; each alias is only
 * live for a short stretch of __ovly_load, so check the surrounding code
 * before adding a new use.
 *
 * Only $6, $7 and $8 (plus $9 when OVLY_IRQ_SAVE is defined) are
 * saved/restored by this file.  $75-$78 are used as scratch without
 * being saved; presumably they are reserved for the overlay manager by
 * the toolchain -- TODO confirm.
 */
/* $75: scratch */
#define tab		$75
#define cgbits		$75
#define add64		$75
#define ealo		$75
#define newmask		$75
#define tagstat		$75
#define bchn		$75
#define rv1		$75

/* $76: scratch */
#define off		$76
#define off64		$76
#define maxsize		$76
#define oldmask		$76
#define sz		$76
#define lnkr		$76
#define rv2		$76

/* $77: scratch */
#define cur		$77
#define cmp		$77
#define buf		$77
#define genwi		$77
#define tagid		$77
#define cmd		$77
#define rv3		$77

/* $78 is also the partition-number input of __ovly_load; it is reused
 * as cgshuf only once a load is actually going to happen. */
#define cgshuf		$78

/* $6-$8: saved on entry to __ovly_load and restored before it returns. */
#define vma		$6

#define map		$7
#define osize		$7
#define cmp2		$7

#define ea64		$8
#define retval		$8

#ifdef OVLY_IRQ_SAVE
/* irqtmp shares $8 with ea64/retval; irq_stat ($9) holds the value read
 * from $SPU_RdMachStat for the whole of __ovly_load. */
#define irqtmp		$8
#define irq_stat	$9
#endif

# Stack quadword minus N
/* SQWMn is the byte offset of the n-th 16-byte quadword below an address.
 * The OVLY_PRINTFS code uses these as displacements from $sp for its
 * register save slots and printf argument blocks. */
#define	SQWM1	-16*1
#define	SQWM2	-16*2
#define	SQWM3	-16*3
#define	SQWM4	-16*4
#define	SQWM5	-16*5
#define	SQWM6	-16*6
#define	SQWM7	-16*7
#define	SQWM8	-16*8
#define	SQWM9	-16*9
#define	SQWM10	-16*10
#define	SQWM11	-16*11
#define	SQWM12	-16*12
#define	SQWM13	-16*13
#define	SQWM14	-16*14
#define	SQWM15	-16*15
#define	SQWM16	-16*16

	/* Tables defined outside this file.  __ovly_load indexes _ovly_table
	 * by partition number and _ovly_buf_table by overlay buffer number. */
	.extern		_ovly_table
	.extern		_ovly_buf_table

#ifdef OVLY_PRINTFS
/* Debug tracing support, built only when OVLY_PRINTFS is defined.
 * SPE_C99_VPRINTF is the opcode placed in the top byte of the message
 * word that follows each "stop 0x2100" in the trace code below. */
#define SPE_C99_VPRINTF 37
/* printf-style format strings, one per trace point in __ovly_load. */
__entry_event_format:
	.string		"In entry_event_hook segment=0x%08x entry-address=0x%08x\n"
__debug_event_format:
  	.string		"In debug_event_hook link-register=0x%08x %08x %08x %08x\n"
__dma_event_format:
  	.string		"In dma_event_hook vma=0x%08x ea=%08x%08x sz=%08x\n"
__ovly_buf_table_format:
  	.string		"_ovly_buf_table[%08x]=%08x\n"
#endif

	.text
	.align 		4
/* shufb control pattern used by __ovly_load while building the new $lr.
 * Selector bytes 0x00-0x0f pick from the first shufb source and
 * 0x10-0x1f from the second, so the result words are:
 *   word 0 = first source word 0,   word 1 = second source word 3,
 *   word 2 = first source word 0,   word 3 = second source word 0.  */
	.type		__rv_pattern, @object
	.size		__rv_pattern, 16
__rv_pattern:
	.word		0x00010203, 0x1c1d1e1f, 0x00010203, 0x10111213
/* shufb control pattern used in the 64-bit effective-address add of
 * __ovly_xfer_loop: it moves word 1 of the carry bits into word 0 and
 * zeroes the remaining words (selector 0x80 yields a zero byte).  */
	.type		__cg_pattern, @object
	.size		__cg_pattern, 16
__cg_pattern:
	.word		0x04050607, 0x80808080, 0x80808080, 0x80808080

/**
 * __ovly_return - stub for returning from overlay functions.
 *
 * inputs:
 *	$lr	link register, in the four-word layout built by __ovly_load:
 *		{__ovly_return, prev ovl ndx, caller return adr, callee ovl ndx}
 *
 * outputs:
 *	$78	old partition number, to be reloaded
 *	$79	return address in old partition number
 *
 * If the old partition number is zero (non-overlay caller) the stub
 * branches straight to the return address.  Otherwise execution continues
 * with the code that follows the stub, which in this file is __ovly_load,
 * with $78/$79 already set up as its inputs.
 */
	.global		__ovly_return
	.type		__ovly_return, @function

/* Pad word ahead of the stub.  __ovly_load addresses __rv_pattern and
 * __cg_pattern relative to __ovly_return with a +4 correction in its lqd
 * offsets; presumably this word is what makes those offsets whole
 * quadword multiples -- TODO confirm before moving or removing it.  */
	.word		0
__ovly_return:
	shlqbyi		$78, $lr, 4	/* preferred slot = word 1 of $lr */
	shlqbyi		$79, $lr, 8	/* preferred slot = word 2 of $lr */
	biz		$78, $79	/* no overlay to reload: return now */
	.size		__ovly_return, . - __ovly_return

/**
 * __ovly_load - copy an overlay partition to local store.
 *
 * inputs:
 *	$78	partition number to be loaded.
 *	$79	branch target in new partition.
 *	$lr	link register, containing return addr.
 *
 * outputs:
 *	$lr	new link register, returning through __ovly_return.
 *
 * Copy a new overlay partition into local store, or return
 * immediately if the partition is already resident.
 *
 * $6-$8 (and $9 with OVLY_IRQ_SAVE) are saved below $sp on entry and
 * restored before the final branch to $79.  $75-$77 are used as scratch
 * and not restored; $78 is overwritten (as cgshuf) when a load is
 * actually performed.
 */
	.global		__ovly_load
	.type		__ovly_load, @function

__ovly_load:
/* Save temporary registers to stack (below $sp; $sp itself is not moved). */
	stqd		$6, -16($sp)
	stqd		$7, -32($sp)
	stqd		$8, -48($sp)

#ifdef OVLY_IRQ_SAVE
/* Save irq state, then disable interrupts.  irq_stat keeps the machine
 * status read here until __ovly_load_restore; "bid" branches to the next
 * instruction purely for its interrupt-disable side effect. */
	stqd		$9, -64($sp)
	ila		irqtmp, __ovly_irq_save
	rdch		irq_stat, $SPU_RdMachStat
	bid		irqtmp
__ovly_irq_save:
#endif

#ifdef OVLY_PRINTFS
/* Debug trace (OVLY_PRINTFS only): print the requested partition ($78)
 * and entry address ($79), then dump every word of _ovly_buf_table.
 * Each message is issued by patching a command word that follows a
 * "stop 0x2100" instruction.  $10-$16 are saved in slots below $sp and
 * restored at the end of this section. */
//==============================================
// In entry_event_hook segment=0x%08x entry-address=0x%08x
//==============================================
# save registers
	stqd	$10, SQWM5($sp)
	stqd	$11, SQWM6($sp)
	stqd	$12, SQWM7($sp)
# Place input parameters onto the stack to form the
# local storage memory image.
/* Layout, from SQWM12 upwards: format pointer, pointer to the first
 * argument (SQWM9), $sp, then the arguments $78 and $79. */
	ila	$10, __entry_event_format
	stqd	$10, SQWM12($sp)
	ai	$10, $sp, SQWM9
	stqd	$10, SQWM11($sp)
	stqd	$sp, SQWM10($sp)
	stqd	$78, SQWM9($sp)
	stqd	$79, SQWM8($sp)
# Construct a message consisting of the 8-bit opcode
# and 24-bit local store pointer to the input
# parameters and place it following the stop and signal
	ila	$10, 0x3ffff		# address mask
	ilhu	$11, SPE_C99_VPRINTF << 8
	ai	$12, $sp, SQWM12	# parameter pointer
	selb	$11, $11, $12, $10	# combine command & address ptr
/* brsl leaves the address of next1a in $10, so that cwd can build the
 * insertion mask for the word at message1a. */
	brsl	$10, next1a
next1a:
	.type	next1a, @function
	lqr	$12, message1a
	cwd	$10, message1a-next1a($10)
	shufb	$11, $11, $12, $10	# insert msg into inst word
	stqr	$11, message1a		# store cmd/ptr into msg word
	dsync
# Notify the PPE to perform the assisted call request
# by issuing a stop and signal with a signal code
# of 0x2100 (C99 class)
	stop	0x2100
message1a:
	.word	0

# save registers
/* These slots overlap the argument block of the message just sent. */
	stqd	$13, SQWM8($sp)
	stqd	$14, SQWM9($sp)
	stqd	$15, SQWM10($sp)
	stqd	$16, SQWM11($sp)

# initialize loop
/* $13 = 1-based entry number, $14 = current entry address,
 * $15 = end address of the table. */
	il	$13, 1
	ila	$14, _ovly_buf_table
	ila	$15, _ovly_buf_table_end

loop_start1:
# Place input parameters onto the stack to form the
# local storage memory image.
	ila	$10, __ovly_buf_table_format
	stqd	$10, SQWM16($sp)
	ai	$10, $sp, SQWM13
	stqd	$10, SQWM15($sp)
	stqd	$sp, SQWM14($sp)
	stqd	$13, SQWM13($sp)
/* Load the quadword holding the entry and rotate the entry's word into
 * the preferred slot. */
	lqd	$16, 0($14)
	rotqby	$16, $16, $14
	stqd	$16, SQWM12($sp)
# Construct a message consisting of the 8-bit opcode
# and 24-bit local store pointer to the input
# parameters and place it following the stop and signal
	ila	$10, 0x3ffff		# address mask
	ilhu	$11, SPE_C99_VPRINTF << 8
	ai	$12, $sp, SQWM16	# parameter pointer
	selb	$11, $11, $12, $10	# combine command & address ptr
	brsl	$10, next1b
next1b:
	.type	next1b, @function
	lqr	$12, message1b
	cwd	$10, message1b-next1b($10)
	shufb	$11, $11, $12, $10	# insert msg into inst word
	stqr	$11, message1b		# store cmd/ptr into msg word
	dsync
# Notify the PPE to perform the assisted call request
# by issuing a stop and signal with a signal code
# of 0x2100 (C99 class)
	stop	0x2100
message1b:
	.word	0

# move to next entry
/* Continue while the end address is (unsigned) greater than $14. */
	ai	$13, $13, 1
	ai	$14, $14, 4
	clgt	$16, $15, $14
	brnz	$16, loop_start1

# restore registers
	lqd	$16, SQWM11($sp)
	lqd	$15, SQWM10($sp)
	lqd	$14, SQWM9($sp)
	lqd	$13, SQWM8($sp)
	lqd	$12, SQWM7($sp)
	lqd	$11, SQWM6($sp)
	lqd	$10, SQWM5($sp)
//==============================================
#endif

/* Set branch hint to overlay target. */
	hbr		__ovly_load_ret, $79

/* Get caller's overlay index by back chaining through stack frames.
 * Loop until end of stack (back chain all-zeros) or
 * encountered a link register we set here. */
	lqd		bchn, 0($sp)
	ila		retval, __ovly_return

__ovly_backchain_loop:
/* lnkr = quadword at offset 16 of the frame bchn points to (the link
 * register value compared against below); then step bchn to the next
 * frame.  cmp is set when word 0 of lnkr is __ovly_return, cmp2 when the
 * new back chain is zero. */
	lqd		lnkr, 16(bchn)
	lqd		bchn, 0(bchn)
	ceq		cmp, lnkr, retval
	ceqi		cmp2, bchn, 0
	or		cmp, cmp, cmp2
	brz		cmp, __ovly_backchain_loop

/* If we reached the zero back-chain, then lnkr is bogus.  Clear the
 * part of lnkr that we use later (slot 3).  The rotate moves the
 * word-0 result of the zero test into slot 3 to act as the clear mask. */
	rotqbyi		cmp2, cmp2, 4
	andc		lnkr, lnkr, cmp2

/* Set lr = {__ovly_return, prev ovl ndx, caller return adr, callee ovl ndx}.
 * rv2 supplies words 0-1 (__ovly_return, slot 3 of lnkr) and rv3 supplies
 * words 2-3 (incoming $lr word 0, $78); fsmbi 0xff builds the mask that
 * selects the low two words from rv3. */
	lqd		rv1, (__rv_pattern-__ovly_return+4)(retval)
	shufb		rv2, retval, lnkr, rv1
	shufb		rv3, $lr, $78, rv1
	fsmbi		rv1, 0xff
	selb		rv2, rv2, rv3, rv1
/* If we have a tail call from one overlay function to another overlay,
   then lr is already set up.  Don't change it.  */
	ceq		rv1, $lr, retval
	fsmb		rv1, rv1
	selb		$lr, rv2, $lr, rv1

/* Branch to $79 if non-overlay */
	brz		$78, __ovly_load_restore

/* Load values from _ovly_table[$78].
 *	extern struct {
 *		u32 vma;
 *		u32 size;
 *		u32 file_offset;
 *		u32 buf;
 *	} _ovly_table[];
 *
 * Partition numbers start at 1 (0 means non-overlay, handled above),
 * hence the "- 16" bias on the table base.  The whole 16-byte entry is
 * kept in vma; the other fields are rotated out of it as needed.
 */
	shli		off, $78, 4
	ila		tab, _ovly_table - 16
	lqx		vma, tab, off
	rotqbyi		buf, vma, 12

/* Load values from _ovly_buf_table[buf].
 *	extern struct {
 *		u32 mapped;
 *	} _ovly_buf_table[];
 *
 * buf is 1-based as well: the byte offset is (buf - 1) * 4.  tab and off
 * stay live so that the entry can be rewritten further down.
 */
	ila		tab, _ovly_buf_table
	ai		off, buf, -1
	shli		off, off, 2
	lqx		map, tab, off
	rotqby		cur, map, off

/* Branch to $79 now if overlay is already mapped.  */
	ceq		cmp, $78, cur
	brnz		cmp, __ovly_load_restore

/* Marker for profiling code.  If we get here, we are about to load
 * a new overlay.
 */
	.global		__ovly_load_event
	.type		__ovly_load_event, @function
__ovly_load_event:

/* Set _ovly_buf_table[buf].mapped = $78.
 * tab/off still address the entry; map still holds its quadword, so only
 * the one word is replaced before the quadword is stored back. */
	cwx		genwi, tab, off
	shufb		map, $78, map, genwi
	stqx		map, tab, off

/* A new partition needs to be loaded. Prepare for DMA loop.
 * _EAR_ is the 64b base EA, filled in at run time by the
 * loader, and indicating the value for SPU executable image start.
 *
 * From here on $78 holds cgshuf, not the partition number, and $7 holds
 * osize (bytes still to transfer).  sz starts out as the entry's
 * file_offset, so the first pass of the loop adds it to the base EA.
 */
	lqd		cgshuf, (__cg_pattern-__ovly_return+4)(retval)
	rotqbyi		osize, vma, 4
	rotqbyi		sz, vma, 8
	lqa		ea64, _EAR_

__ovly_xfer_loop:
/* 64b add to compute next ea64: ea64 += sz, where sz is the file offset
 * on the first pass and the size of the previous request afterwards.
 * off64 places sz in word 1 (the low half); the carry out of that word
 * is moved to word 0 through cgshuf and consumed by addx. */
	rotqmbyi	off64, sz, -4
	cg		cgbits, ea64, off64
	shufb		add64, cgbits, cgbits, cgshuf
	addx		add64, ea64, off64
	ori		ea64, add64, 0

/* Setup DMA parameters, then issue DMA request.
 * ealo = low word of the EA; sz = osize when osize > MFC_MAX_DMA_SIZE is
 * false, otherwise MFC_MAX_DMA_SIZE.  Note that ealo overwrites add64
 * (same register), which is why ea64 was copied first. */
	rotqbyi		ealo, add64, 4
	ila		maxsize, MFC_MAX_DMA_SIZE
	cgt		cmp, osize, maxsize
	selb		sz, osize, maxsize, cmp
	ila		tagid, MFC_TAG_ID
	wrch		$MFC_LSA, vma
	wrch		$MFC_EAH, ea64
	wrch		$MFC_EAL, ealo
	wrch		$MFC_Size, sz
	wrch		$MFC_TagId, tagid
	ila		cmd, MFC_GET_CMD
	wrch		$MFC_Cmd, cmd

#ifdef OVLY_PRINTFS
/* Debug trace (OVLY_PRINTFS only): print the parameters of the DMA
 * request that was just queued.  Only $10-$12 are used and they are
 * restored afterwards, so vma, ea64, ealo and sz keep their values for
 * the rest of the transfer loop. */
//==============================================
// In dma_event_hook vma=0x%08x ea=%08x%08x sz=%08x
//==============================================
# save registers
	stqd	$10, SQWM5($sp)
	stqd	$11, SQWM6($sp)
	stqd	$12, SQWM7($sp)
# Place input parameters onto the stack to form the
# local storage memory image.
/* Layout, from SQWM14 upwards: format pointer, pointer to the first
 * argument (SQWM11), $sp, then vma, ea64 (high word), ealo, sz. */
	ila	$10, __dma_event_format
	stqd	$10, SQWM14($sp)
	ai	$10, $sp, SQWM11
	stqd	$10, SQWM13($sp)
	stqd	$sp, SQWM12($sp)
	stqd	vma, SQWM11($sp)
	stqd	ea64, SQWM10($sp)
	stqd	ealo, SQWM9($sp)
	stqd	sz, SQWM8($sp)
# Construct a message consisting of the 8-bit opcode
# and 24-bit local store pointer to the input
# parameters and place it following the stop and signal
	ila	$10, 0x3ffff		# address mask
	ilhu	$11, SPE_C99_VPRINTF << 8
	ai	$12, $sp, SQWM14	# parameter pointer
	selb	$11, $11, $12, $10	# combine command & address ptr
/* brsl leaves the address of next3a in $10, so that cwd can build the
 * insertion mask for the word at message3a. */
	brsl	$10, next3a
next3a:
	.type	next3a, @function
	lqr	$12, message3a
	cwd	$10, message3a-next3a($10)
	shufb	$11, $11, $12, $10	# insert msg into inst word
	stqr	$11, message3a		# store cmd/ptr into msg word
	dsync
# Notify the PPE to perform the assisted call request
# by issuing a stop and signal with a signal code
# of 0x2100 (C99 class)
	stop	0x2100
message3a:
	.word	0

# restore registers
	lqd	$12, SQWM7($sp)
	lqd	$11, SQWM6($sp)
	lqd	$10, SQWM5($sp)
//==============================================
#endif

/* Increment vma, decrement size, branch back as needed.
 * sz is the size of the request just queued; the loop repeats until
 * osize (bytes still to transfer) reaches zero. */
	a		vma, vma, sz
	sf		osize, sz, osize
	brnz		osize, __ovly_xfer_loop

/* Save app's tagmask, wait for DMA complete, restore mask.
 *
 * The temporary mask must select tag group MFC_TAG_ID and nothing else.
 * For tags below 16 the bit is loaded with ila, which zero-extends its
 * immediate; ilh would replicate the bit into both halfwords of the
 * word and so also select tag group MFC_TAG_ID + 16, making the
 * "update all" wait below block on unrelated application DMA.
 * For tags 16-31 ilhu sets the upper halfword and clears the lower one.
 */
	rdch		oldmask, $MFC_RdTagMask
#if MFC_TAG_ID < 16
	ila		newmask, 1 << MFC_TAG_ID
#else
	ilhu		newmask, 1 << (MFC_TAG_ID - 16)
#endif
	wrch		$MFC_WrTagMask, newmask
	ila		tagstat, MFC_TAG_UPDATE_ALL
	wrch		$MFC_WrTagUpdate, tagstat
/* Blocks until the tag-status update requested above is available. */
	rdch		tagstat, $MFC_RdTagStat
/* The transferred data is code that is about to be executed. */
	sync
	wrch		$MFC_WrTagMask, oldmask

#ifdef OVLY_PRINTFS
/* Debug trace (OVLY_PRINTFS only): after the overlay has been loaded,
 * print all four words of the new $lr and then dump every word of
 * _ovly_buf_table.  $10-$16 are saved in slots below $sp and restored at
 * the end of this section. */
//==============================================
// In debug_event_hook link-register=0x%08x %08x %08x %08x
//==============================================
# save registers
	stqd	$10, SQWM5($sp)
	stqd	$11, SQWM6($sp)
	stqd	$12, SQWM7($sp)
# Place input parameters onto the stack to form the
# local storage memory image.
/* Layout, from SQWM14 upwards: format pointer, pointer to the first
 * argument (SQWM11), $sp, then $lr rotated so that each of its four
 * words in turn sits in the preferred slot. */
	ila	$10, __debug_event_format
	stqd	$10, SQWM14($sp)
	ai	$10, $sp, SQWM11
	stqd	$10, SQWM13($sp)
	stqd	$sp, SQWM12($sp)
	stqd	$lr, SQWM11($sp)
	rotqbyi $10, $lr, 4
	stqd	$10, SQWM10($sp)
	rotqbyi $10, $10, 4
	stqd	$10, SQWM9($sp)
	rotqbyi $10, $10, 4
	stqd	$10, SQWM8($sp)
# Construct a message consisting of the 8-bit opcode
# and 24-bit local store pointer to the input
# parameters and place it following the stop and signal
	ila	$10, 0x3ffff		# address mask
	ilhu	$11, SPE_C99_VPRINTF << 8
	ai	$12, $sp, SQWM14	# parameter pointer
	selb	$11, $11, $12, $10	# combine command & address ptr
/* brsl leaves the address of next2a in $10, so that cwd can build the
 * insertion mask for the word at message2a. */
	brsl	$10, next2a
next2a:
	.type	next2a, @function
	lqr	$12, message2a
	cwd	$10, message2a-next2a($10)
	shufb	$11, $11, $12, $10	# insert msg into inst word
	stqr	$11, message2a		# store cmd/ptr into msg word
	dsync
# Notify the PPE to perform the assisted call request
# by issuing a stop and signal with a signal code
# of 0x2100 (C99 class)
	stop	0x2100
message2a:
	.word	0

# save registers
/* These slots overlap the argument block of the message just sent. */
	stqd	$13, SQWM8($sp)
	stqd	$14, SQWM9($sp)
	stqd	$15, SQWM10($sp)
	stqd	$16, SQWM11($sp)

# initialize loop
/* $13 = 1-based entry number, $14 = current entry address,
 * $15 = end address of the table. */
	il	$13, 1
	ila	$14, _ovly_buf_table
	ila	$15, _ovly_buf_table_end

loop_start2:
# Place input parameters onto the stack to form the
# local storage memory image.
	ila	$10, __ovly_buf_table_format
	stqd	$10, SQWM16($sp)
	ai	$10, $sp, SQWM13
	stqd	$10, SQWM15($sp)
	stqd	$sp, SQWM14($sp)
	stqd	$13, SQWM13($sp)
/* Load the quadword holding the entry and rotate the entry's word into
 * the preferred slot. */
	lqd	$16, 0($14)
	rotqby	$16, $16, $14
	stqd	$16, SQWM12($sp)
# Construct a message consisting of the 8-bit opcode
# and 24-bit local store pointer to the input
# parameters and place it following the stop and signal
	ila	$10, 0x3ffff		# address mask
	ilhu	$11, SPE_C99_VPRINTF << 8
	ai	$12, $sp, SQWM16	# parameter pointer
	selb	$11, $11, $12, $10	# combine command & address ptr
	brsl	$10, next2b
next2b:
	.type	next2b, @function
	lqr	$12, message2b
	cwd	$10, message2b-next2b($10)
	shufb	$11, $11, $12, $10	# insert msg into inst word
	stqr	$11, message2b		# store cmd/ptr into msg word
	dsync
# Notify the PPE to perform the assisted call request
# by issuing a stop and signal with a signal code
# of 0x2100 (C99 class)
	stop	0x2100
message2b:
	.word	0

# move to next entry
/* Continue while the end address is (unsigned) greater than $14. */
	ai	$13, $13, 1
	ai	$14, $14, 4
	clgt	$16, $15, $14
	brnz	$16, loop_start2

# restore registers
	lqd	$16, SQWM11($sp)
	lqd	$15, SQWM10($sp)
	lqd	$14, SQWM9($sp)
	lqd	$13, SQWM8($sp)
	lqd	$12, SQWM7($sp)
	lqd	$11, SQWM6($sp)
	lqd	$10, SQWM5($sp)
//==============================================
#endif

	.global		_ovly_debug_event
	.type		_ovly_debug_event, @function
_ovly_debug_event:
/* GDB inserts debugger trap here.  This point is only reached on the
 * path that actually loaded an overlay; the early exits for non-overlay
 * targets and already-mapped overlays jump past it. */
	nop

__ovly_load_restore:
#ifdef OVLY_IRQ_SAVE
/* Conditionally re-enable interrupts: bit 0 of the machine status saved
 * on entry decides.  "binze" branches to the next instruction, enabling
 * interrupts, only when that bit is set; otherwise it falls through to
 * the same place with interrupts left disabled. */
	andi		irq_stat, irq_stat, 1
	ila		irqtmp, __ovly_irq_restore
	binze		irq_stat, irqtmp
__ovly_irq_restore:
	lqd		$9, -64($sp)
#endif

/* Restore saved registers (same slots as the stores at entry). */
	lqd		$8, -48($sp)
	lqd		$7, -32($sp)
	lqd		$6, -16($sp)

__ovly_load_ret:
/* Branch to target address.  This is the instruction named by the hbr
 * hint issued near the top of __ovly_load. */
	bi		$79

	.size		__ovly_load, . - __ovly_load