1 files changed, 265 insertions, 0 deletions
diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S
new file mode 100644
index 0000000..a254202
--- /dev/null
+++ b/sysdeps/ia64/memcpy.S
@@ -0,0 +1,265 @@
+/* Optimized version of the standard memcpy() function.
+   This file is part of the GNU C Library.
+   Copyright (C) 1991,92,93,97,98,99 Free Software Foundation, Inc.
+   Contributed by Dan Pop <Dan.Pop@cern.ch>.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/* Return: dest
+
+   Inputs:
+        in0:    dest
+        in1:    src
+        in2:    byte count
+
+   An assembly implementation of the algorithm used by the generic C
+   version from glibc.  The case when all three arguments are multiples
+   of 8 is treated separately, for extra performance.
+
+   In this form, it assumes little endian mode.  For big endian mode,
+   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
+   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
+   shrp instruction.  */
+
+#include <sysdep.h>
+#undef ret
+
+#define OP_T_THRES 	16	/* unaligned path: len <= this is copied byte by byte */
+#define OPSIZ 		8	/* word size in bytes; only referenced in comments below */
+
+#define saved_pfs	r14
+#define sf		r15
+#define rescnt		r16
+#define saved_pr	r17
+#define saved_lc	r18
+#define dest		r19
+#define src		r20
+#define len		r21
+#define asrc		r22
+#define tmp2		r23
+#define tmp3		r24
+#define	tmp4		r25
+#define ptable		r26
+#define ploop56		r27
+#define	loopaddr	r28
+#define	sh1		r29
+#define loopcnt		r30
+#define	value		r31
+/* Store pointers of the aligned 64-byte loop; they reuse r22-r29 (asrc..sh1).  */
+#define dl0		r22
+#define dh0		r23
+#define dl1		r24
+#define dh1		r25
+#define dl2		r26
+#define dh2		r27
+#define dl3		r28
+#define dh3		r29 
+
+#define LOOP(shift) /* copy loop for a source misaligned by `shift' bits */ \
+		.align	32 ; 						\
+.loop##shift##:								\
+(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1: next aligned source word */ \
+(p[MEMLAT+1])	st8	[dest] = value, 8 ;	/* store the word merged one stage earlier */ \
+(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;	/* merge two adjacent source words */ \
+		nop.b	0 ;						\
+		nop.b	0 ;						\
+		br.ctop.sptk .loop##shift ;	/* rotate registers; loop while LC/EC allow */ \
+		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
+
+ENTRY(memcpy)
+	alloc 	saved_pfs = ar.pfs, 3, 40-3, 0, 40	// 3 inputs, 37 locals, 0 outputs, 40 rotating
+#include "softpipe.h"
+	.rotr	r[MEMLAT + 2], q[MEMLAT + 1], s0[2], s1[2], s2[2], s3[2]
+	.rotf	tl0[5], th0[5], tl1[5], th1[5], tl2[5], th2[5], tl3[5], th3[5]
+	.rotp	p[MEMLAT + 2]		// MEMLAT is not defined in this file; presumably softpipe.h
+	mov	ret0 = in0		// return value = dest
+	mov	saved_pr = pr		// save the predicate registers
+	brp.loop.many.tk.tk.imp	.l0, .done - 16	// branch prediction hint for the .l0 loop
+        mov 	saved_lc = ar.lc	// save the loop counter
+	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
+	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
+	mov 	dest = in0		// dest
+	mov 	src = in1		// src
+	mov	len = in2		// len
+	sub	tmp2 = r0, in0		// tmp2 = -dest
+	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
+(p6)	br.cond.spnt .restore_and_exit;;// 	return dest;
+	and	tmp4 = 7, tmp3 		// tmp4 = (dest | src | len) & 7
+	tbit.nz	p8, p0 = src, 3 ;;	// p8 = bit 3 of src set (src not 16-byte aligned)
+	cmp.ne	p6, p0 = tmp4, r0	// if (((dest | src | len) & 7) != 0)
+(p6)	br.cond.sptk .next		//	goto next;
+
+// The optimal case, when dest, src and len are all multiples of 8
+
+(p8)	ld8	value = [src], 8	// copy one word first so src becomes 16-byte aligned
+(p8)	adds	len = -8, len ;;	// adjust len accordingly
+	shr.u	loopcnt = len, 6 	// loopcnt = len / 64
+	shr.u	rescnt = len, 3		// rescnt = len / 8
+	mov	pr.rot = 1 << 16 	// set rotating predicates
+	mov	ar.ec = 4 + 1 ;;	// set the epilog counter
+	cmp.eq	p6, p0 = loopcnt, r0 
+	and	rescnt = 7, rescnt	// rescnt = residual word count
+	adds	loopcnt = -1, loopcnt	// --loopcnt
+(p8)	st8	[dest] = value, 8	// copy one word if aligning 
+(p6)	br.cond.spnt .epilog;;		// there are < 8 words to copy
+	add	sf = 64 * 4, src	// prefetch pointer, 256 bytes ahead of src
+	mov	ar.lc = loopcnt 	// set the loop counter
+	mov	s0[1] = src		// four load pointers, 16 bytes apart
+	add	s1[1] = 16*1, src
+	add     s2[1] = 16*2, src
+	add	s3[1] = 16*3, src
+	;;
+	mov     dl0 = dest		// eight store pointers, 8 bytes apart
+	add	dh0 = 8 * 1, dest
+	add	dl1 = 8 * 2, dest
+	add     dh1 = 8 * 3, dest
+	add	dl2 = 8 * 4, dest
+	add	dh2 = 8 * 5, dest
+	add	dl3 = 8 * 6, dest
+	add	dh3 = 8 * 7, dest
+	;;	
+.l0:					// pipelined loop: 64 bytes per iteration
+(p[0]) 	lfetch.nta [sf], 64		// prefetch the source ahead of the loads
+
+(p[0])  ldfp8   tl0[0], th0[0] = [s0[1]]	// load 64 bytes as four 16-byte pairs
+(p[0])  ldfp8   tl1[0], th1[0] = [s1[1]]
+(p[0])  ldfp8   tl2[0], th2[0] = [s2[1]]
+(p[0])  ldfp8   tl3[0], th3[0] = [s3[1]]
+
+(p[0])  add     s0[0] = 64, s0[1]	// advance the four load pointers
+(p[0])  add     s1[0] = 64, s1[1]
+(p[0])  add     s2[0] = 64, s2[1]
+(p[0])  add     s3[0] = 64, s3[1]
+(p[1])	mov	src = s0[1]		// for the epilog code
+
+(p[4])  stf8    [dl0] = tl0[4], 64	// store the data loaded four stages earlier
+(p[4])  stf8    [dh0] = th0[4], 64
+(p[4])  stf8    [dl1] = tl1[4], 64
+(p[4])  stf8    [dh1] = th1[4], 64
+(p[4])  stf8    [dl2] = tl2[4], 64
+(p[4])  stf8    [dh2] = th2[4], 64
+(p[4])  stf8    [dl3] = tl3[4], 64
+(p[4])  stf8    [dh3] = th3[4], 64
+
+	br.ctop.sptk.many .l0
+.done:
+	mov	dest = dl0		// dest = first word not yet written by the loop
+.epilog:				// copy the 0..7 residual words
+	cmp.eq	p6, p0 = rescnt, r0	// are there any words left to copy?
+	tbit.nz	p10, p0 = rescnt, 0	// p10: copy one word
+(p6)	br.cond.spnt .restore_and_exit ;;
+(p10)	ld8	r[0] = [src], 8
+	tbit.nz	p11, p0 = rescnt, 1 ;;	// p11: copy two words
+(p11)	ld8	r[1] = [src], 8
+(p10)	st8	[dest] = r[0], 8 ;;
+(p11)	ld8	r[2] = [src], 8 
+(p11)	st8	[dest] = r[1], 8
+	tbit.nz	p12, p0 = rescnt, 2 ;;	// p12: copy four words
+(p12)	ld8	r[3] = [src], 8
+(p11)	st8	[dest] = r[2], 8 ;;
+(p12)	ld8	r[4] = [src], 8
+(p12)	st8	[dest] = r[3], 8 ;;
+(p12)	ld8	r[5] = [src], 8
+(p12) 	st8	[dest] = r[4], 8 
+	mov	ar.lc = saved_lc ;;	// restore the loop counter
+(p12) 	ld8	r[6] = [src], 8
+(p12)	st8	[dest] = r[5], 8 
+	mov	ar.pfs = saved_pfs;;	// restore the PFS
+(p12)	st8	[dest] = r[6]
+	mov	pr = saved_pr, -1 	// restore the predicate registers
+	br.ret.sptk.many b0
+.next:					// dest, src or len is not a multiple of 8
+	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES?
+	and	loopcnt = 7, tmp2 		// loopcnt = -dest % 8
+(p6)	br.cond.spnt	.cpyfew			// copy byte by byte
+	;;
+	cmp.eq	p6, p0 = loopcnt, r0	// is dest already 8-byte aligned?
+(p6)	br.cond.sptk	.dest_aligned
+	sub	len = len, loopcnt	// len -= -dest % 8
+	adds	loopcnt = -1, loopcnt	// --loopcnt
+	;;
+	mov	ar.lc = loopcnt
+.l1:					// copy -dest % 8 bytes
+	ld1	value = [src], 1	// value = *src++
+	;;
+	st1	[dest] = value, 1	// *dest++ = value  
+	br.cloop.dptk .l1	
+.dest_aligned:
+	and	sh1 = 7, src 		// sh1 = src % 8
+	and	tmp2 = -8, len   	// tmp2 = len & -OPSIZ
+	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
+	shr.u	loopcnt = len, 3	// loopcnt = len / 8
+	and	len = 7, len;;		// len = len % 8
+	adds	loopcnt = -1, loopcnt	// --loopcnt
+	addl	tmp4 = @ltoff(.table), gp 	// tmp4 = linkage-table slot for .table
+	addl	tmp3 = @ltoff(.loop56), gp	// tmp3 = linkage-table slot for .loop56
+	mov     ar.ec = MEMLAT + 1	// set EC
+	mov     pr.rot = 1 << 16;;	// set rotating predicates
+	mov	ar.lc = loopcnt		// set LC
+	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
+(p6)    br.cond.sptk .src_aligned
+	add	src = src, tmp2		// src += len & -OPSIZ
+	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
+	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
+	ld8	ptable = [tmp4];;	// ptable = &table
+	add	tmp3 = ptable, sh1;;	// tmp3 = &table + sh1
+	mov	ar.ec = MEMLAT + 1 + 1 // one more pass needed
+	ld8	tmp4 = [tmp3];;		// tmp4 = loop offset
+	sub	loopaddr = ploop56,tmp4	// loopaddr = &loop56 - loop offset
+	ld8	r[1] = [asrc], 8;;	// w0
+	mov	b6 = loopaddr;;
+	br	b6			// jump to the appropriate loop
+
+	LOOP(8)
+	LOOP(16)
+	LOOP(24)
+	LOOP(32)
+	LOOP(40)
+	LOOP(48)
+	LOOP(56)
+	
+.src_aligned:				// src and dest both 8-byte aligned: plain word copy
+.l3:
+(p[0])		ld8	r[0] = [src], 8
+(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
+		br.ctop.dptk .l3
+.cpyfew:				// copy the remaining len bytes one at a time
+	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
+	adds	len = -1, len		// --len;
+(p6)	br.cond.spnt	.restore_and_exit ;;
+	mov	ar.lc = len
+.l4:
+	ld1	value = [src], 1
+	;;
+	st1	[dest] = value, 1
+	br.cloop.dptk	.l4 ;;
+.restore_and_exit:
+	mov 	ar.pfs = saved_pfs	// restore the PFS
+	mov     pr = saved_pr, -1    	// restore the predicate registers
+	mov 	ar.lc = saved_lc	// restore the loop counter
+	br.ret.sptk.many b0
+	.align 8
+.table:					// indexed by src % 8; offsets back from .loop56
+	data8	0			// dummy entry
+	data8 	.loop56 - .loop8	// src % 8 == 1
+	data8 	.loop56 - .loop16	// src % 8 == 2
+	data8 	.loop56 - .loop24	// src % 8 == 3
+	data8	.loop56 - .loop32	// src % 8 == 4
+	data8	.loop56 - .loop40	// src % 8 == 5
+	data8	.loop56 - .loop48	// src % 8 == 6
+	data8	.loop56 - .loop56	// src % 8 == 7
+
+END(memcpy)