aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/ia64/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/memcpy.S')
-rw-r--r--sysdeps/ia64/memcpy.S436
1 files changed, 0 insertions, 436 deletions
diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S
deleted file mode 100644
index a2aeea0..0000000
--- a/sysdeps/ia64/memcpy.S
+++ /dev/null
@@ -1,436 +0,0 @@
-/* Optimized version of the standard memcpy() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
- Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
- Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-/* Return: dest
-
- Inputs:
- in0: dest
- in1: src
- in2: byte count
-
- An assembly implementation of the algorithm used by the generic C
- version from glibc. The case when source and dest are aligned is
- treated separately, for extra performance.
-
- In this form, memcpy assumes little endian mode. For big endian mode,
- sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
- and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
- shrp instruction. */
-
-#define USE_LFETCH // emit the lfetch prefetch instructions below
-#define USE_FLP // aligned copy moves data through FP registers (ldf8/stf8)
-#include <sysdep.h>
-#undef ret
-
-#define LFETCH_DIST 500 // prefetch pointers start this many bytes past src/dest
-
-#define ALIGN_UNROLL_no 4 // no. of 8-byte words copied per .l1 iteration
-#define ALIGN_UNROLL_sh 2 // log2 (ALIGN_UNROLL_no), used as a shift amount
-
-#define MEMLAT 8 // pipeline stages between a load (p[0]) and its store (p[MEMLAT])
-#define Nrot ((4*(MEMLAT+2) + 7) & ~7) // rotating regs: 4*(MEMLAT+2) rounded up to a multiple of 8
-
-#define OP_T_THRES 16 // len <= this is copied byte by byte
-#define OPSIZ 8 // not used by any instruction in this file
-
-#define loopcnt r14 // loop trip count, copied into ar.lc
-#define elemcnt r15 // number of 8-byte words left
-#define saved_pr r16 // predicates on entry, restored on exit
-#define saved_lc r17 // ar.lc on entry, restored on exit
-#define adest r18 // second store pointer (dest + 16) in .l1
-#define dest r19 // running destination pointer
-#define asrc r20 // second/aligned source pointer
-#define src r21 // running source pointer
-#define len r22 // bytes still to copy
-#define tmp2 r23
-#define tmp3 r24
-#define tmp4 r25
-#define ptable r26 // address of .table
-#define ploop56 r27 // address of .loop56
-#define loopaddr r28 // entry address of the selected LOOP(shift)
-#define sh1 r29 // src % 8, later 8 * (src % 8)
-#define ptr1 r30 // prefetch pointer based on src
-#define ptr2 r31 // prefetch pointer based on dest
-
-#define movi0 mov
-
-#define p_scr p6 // scratch predicate
-#define p_xtr p7 // bit 3 of src is set (src not 16-byte aligned)
-#define p_nxtr p8 // complement of p_xtr
-#define p_few p9 // len <= OP_T_THRES
-
-#if defined(USE_FLP)
-#define load ldf8
-#define store stf8
-#define tempreg f6
-#define the_r fr
-#define the_s fs
-#define the_t ft
-#define the_q fq
-#define the_w fw
-#define the_x fx
-#define the_y fy
-#define the_z fz
-#elif defined(USE_INT)
-#define load ld8
-#define store st8
-#define tempreg tmp2
-#define the_r r
-#define the_s s
-#define the_t t
-#define the_q q
-#define the_w w
-#define the_x x
-#define the_y y
-#define the_z z
-#endif
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-/* Manually force proper loop-alignment. Note: be sure to
- double-check the code-layout after making any changes to
- this routine! */
-# define ALIGN(n) { nop 0 }
-#else
-# define ALIGN(n) .align n
-#endif
-
-#if defined(USE_LFETCH)
-#define LOOP(shift) /* copy loop for misaligned src, shift = 8 * (src % 8) bits, 16 bytes per pass */ \
- ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* next aligned source word */ \
-(p[0]) lfetch.nt1 [ptr1], 16 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one pass earlier */ \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* dest word from two adjacent source words */ \
- nop.b 0 ;; \
- } { .mmb \
-(p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* next aligned source word */ \
-(p[0]) lfetch.nt1 [ptr2], 16 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the word merged one pass earlier */ \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* dest word from two adjacent source words */ \
- br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#else
-#define LOOP(shift) /* same loop as above without the lfetch prefetches */ \
- ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* next aligned source word */ \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one pass earlier */ \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* dest word from two adjacent source words */ \
- nop.b 0 ;; \
- } { .mmb \
-(p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* next aligned source word */ \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the word merged one pass earlier */ \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* dest word from two adjacent source words */ \
- br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#endif
-
-
-ENTRY(memcpy)
-{ .mmi
- .prologue
- alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot - 3 locals, no outputs, Nrot rotating
- .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
- .rotp p[MEMLAT+2]
- .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
- mov ret0 = in0 // return value = dest
- .save pr, saved_pr
- movi0 saved_pr = pr // save the predicate registers
-} { .mmi
- and tmp4 = 7, in0 // tmp4 = dest % 8, for the alignment check
- mov dest = in0 // dest
- mov src = in1 // src
-;; }
-{ .mii
- cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
- .save ar.lc, saved_lc
- movi0 saved_lc = ar.lc // save the loop counter
- .body
- cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
-} { .mbb
- mov len = in2 // len
-(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
-(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
- lfetch.nt1 [dest] // prefetch the start of dest
- lfetch.nt1 [src] // prefetch the start of src
-#endif
- shr.u elemcnt = len, 3 // elemcnt = len / 8
-} { .mib
- cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
- sub loopcnt = 7, tmp4 // loopcnt = 7 - dest % 8 = bytes to alignment, minus 1
-(p_scr) br.cond.dptk.many .dest_aligned // yes: skip the byte loop
-;; }
-{ .mmi
- ld1 tmp2 = [src], 1 // preload the first byte
- sub len = len, loopcnt, 1 // len -= loopcnt + 1, the bytes .l0 copies
- movi0 ar.lc = loopcnt // .l0 runs loopcnt + 1 times
-} { .mib
- cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
-;; }
-
-.l0: // ---------------------------- // L0: Align dest on 8-byte boundary
-{ .mmi
- st1 [dest] = tmp2, 1 // store the byte loaded previously
-(p_scr) ld1 tmp2 = [src], 1 // load the next byte if one is still needed
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l0 // repeat until ar.lc reaches 0
-;; }
-
-.dest_aligned:
-{ .mmi
- and tmp4 = 7, src // ready for alignment check
- shr.u elemcnt = len, 3 // elemcnt = len / 8
-;; }
-{ .mib
- cmp.ne p_scr, p0 = tmp4, r0 // is source misaligned?
- tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
-} { .mib // is not 16B aligned
- add ptr2 = LFETCH_DIST, dest // prefetch address
- add ptr1 = LFETCH_DIST, src // prefetch address
-(p_scr) br.cond.dptk.many .src_not_aligned
-;; }
-
-// The optimal case, when both dest and src are 8-byte aligned
-
-.both_aligned:
-{ .mmi
- .pred.rel "mutex",p_xtr,p_nxtr
-(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
-(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
- movi0 pr.rot = 1 << 16 // set rotating predicates
-} { .mib
-(p_scr) br.cond.dpnt.many .copy_full_words // too few words for the unrolled loop
-;; }
-
-{ .mmi
-(p_xtr) load tempreg = [src], 8 // load the "extra" word so src becomes 16B aligned
-(p_xtr) add elemcnt = -1, elemcnt // one word fewer for the loop
- movi0 ar.ec = MEMLAT + 1 // set the epilog counter
-;; }
-{ .mmi
-(p_xtr) add len = -8, len // account for the "extra" word
- add asrc = 16, src // one bank apart (for USE_INT)
- shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
-;;}
-{ .mmi
- add loopcnt = -1, loopcnt // ar.lc holds iterations - 1
-(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
- nop.i 0
-;; }
-{ .mib
- add adest = 16, dest // adest = dest + 16
- movi0 ar.lc = loopcnt // set the loop counter
-;; }
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
- { nop 0 }
-#else
- .align 32
-#endif
-#if defined(USE_FLP)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0]) lfetch.nt1 [ptr2],32
-#endif
-(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 // words 0 and 1 of the group
-(p[0]) add len = -32, len // 4 words per iteration
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 // word 0
-(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8 // word 2
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0]) lfetch.nt1 [ptr1],32
-#endif
-(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 // words 2 and 3 of the group
-} {.mmb
-(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24 // word 1, then step to the next group
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 // word 3, then step to the next group
- br.ctop.dptk.many .l1
-;; }
-#elif defined(USE_INT)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-(p[0]) load the_r[0] = [src], 8 // word 0 of the group
-(p[0]) load the_q[0] = [asrc], 8 // word 2 of the group
-(p[0]) add len = -32, len // 4 words per iteration
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 // word 0
-(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8 // word 2
-;; }
-{ .mmi
-(p[0]) load the_s[0] = [src], 24 // word 1, then step to the next group
-(p[0]) load the_t[0] = [asrc], 24 // word 3, then step to the next group
-} {.mmb
-(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24 // word 1, then step to the next group
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 // word 3, then step to the next group
-#if defined(USE_LFETCH)
-;; }
-{ .mmb
-(p[0]) lfetch.nt1 [ptr2],32
-(p[0]) lfetch.nt1 [ptr1],32
-#endif
- br.ctop.dptk.many .l1
-;; }
-#endif
-
-.copy_full_words:
-{ .mib
- cmp.gt p_scr, p0 = 8, len // fewer than 8 bytes left?
- shr.u elemcnt = len, 3 // elemcnt = len / 8
-(p_scr) br.cond.dpnt.many .copy_bytes
-;; }
-{ .mii
- load tempreg = [src], 8 // preload the first word
- add loopcnt = -1, elemcnt // .l2 runs elemcnt times
-;; }
-{ .mii
- cmp.ne p_scr, p0 = 0, loopcnt // is there more than one word?
- mov ar.lc = loopcnt // set the loop counter
-;; }
-
-.l2: // ------------------------------- // L2: Max 4 words copied separately
-{ .mmi
- store [dest] = tempreg, 8
-(p_scr) load tempreg = [src], 8 // load the next word if one is still needed
- add len = -8, len
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l2
-;; }
-
-.copy_bytes:
-{ .mib
- cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
- add loopcnt = -1, len // loopcnt = len - 1
-(p_scr) br.cond.spnt .restore_and_exit
-;; }
-{ .mii
- ld1 tmp2 = [src], 1 // preload the first byte
- movi0 ar.lc = loopcnt // .l3 runs len times
- cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
-;; }
-
-.l3: // ------------------------------- // L3: Final byte move
-{ .mmi
- st1 [dest] = tmp2, 1
-(p_scr) ld1 tmp2 = [src], 1 // load the next byte if one is still needed
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l3
-;; }
-
-.restore_and_exit:
-{ .mmi
- movi0 pr = saved_pr, -1 // restore the predicate registers
-;; }
-{ .mib
- movi0 ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-;; }
-
-
-.src_not_aligned:
-{ .mmi
- cmp.gt p_scr, p0 = 16, len // fewer than 16 bytes left?
- and sh1 = 7, src // sh1 = src % 8
- shr.u loopcnt = len, 4 // element-cnt = len / 16
-} { .mib
- add tmp4 = @ltoff(.table), gp // tmp4 = linkage-table slot for .table
- add tmp3 = @ltoff(.loop56), gp // tmp3 = linkage-table slot for .loop56
-(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
-;; }
-{ .mmi
- and asrc = -8, src // asrc = src & -8 -- align src down for the loop
- add loopcnt = -1, loopcnt // loopcnt--
- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
-} { .mmi
- ld8 ptable = [tmp4] // ptable = &table
- ld8 ploop56 = [tmp3] // ploop56 = &loop56
- and tmp2 = -16, len // tmp2 = len & -16
-;; }
-{ .mmi
- add tmp3 = ptable, sh1 // tmp3 = &table + sh1
- add src = src, tmp2 // src += len & (-16)
- movi0 ar.lc = loopcnt // set LC
-;; }
-{ .mmi
- ld8 tmp4 = [tmp3] // tmp4 = loop offset
- sub len = len, tmp2 // len -= len & (-16)
- movi0 ar.ec = MEMLAT + 2 // one more pass needed
-;; }
-{ .mmi
- ld8 s[1] = [asrc], 8 // preload
- sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
- movi0 pr.rot = 1 << 16 // set rotating predicates
-;; }
-{ .mib
- nop.m 0
- movi0 b6 = loopaddr
- br b6 // jump to the appropriate loop
-;; }
-
- LOOP(8)
- LOOP(16)
- LOOP(24)
- LOOP(32)
- LOOP(40)
- LOOP(48)
- LOOP(56)
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
-
- .rodata
- .align 8
-.table:
- data8 0 // dummy entry (src % 8 == 0 never reaches the table lookup)
- data8 .loop56 - .loop8 // src % 8 == 1
- data8 .loop56 - .loop16 // src % 8 == 2
- data8 .loop56 - .loop24 // src % 8 == 3
- data8 .loop56 - .loop32 // src % 8 == 4
- data8 .loop56 - .loop40 // src % 8 == 5
- data8 .loop56 - .loop48 // src % 8 == 6
- data8 .loop56 - .loop56 // src % 8 == 7