diff options
Diffstat (limited to 'sysdeps/ia64/memcpy.S')
-rw-r--r-- | sysdeps/ia64/memcpy.S | 436 |
1 files changed, 0 insertions, 436 deletions
diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S
deleted file mode 100644
index a2aeea0..0000000
--- a/sysdeps/ia64/memcpy.S
+++ /dev/null
@@ -1,436 +0,0 @@
-/* Optimized version of the standard memcpy() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
-   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
-   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    src
-        in2:    byte count
-
-   An assembly implementation of the algorithm used by the generic C
-   version from glibc.  The case when source and dest are aligned is
-   treated separately, for extra performance.
-
-   In this form, memcpy assumes little endian mode.  For big endian mode,
-   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
-   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
-   shrp instruction.  */
-
-#define USE_LFETCH
-#define USE_FLP
-#include <sysdep.h>
-#undef ret
-
-#define LFETCH_DIST     500
-
-#define ALIGN_UNROLL_no   4 // no. of elements copied per .l1 iteration
-#define ALIGN_UNROLL_sh   2 // log2 (ALIGN_UNROLL_no), used as shift amount
-
-#define MEMLAT  8
-#define Nrot    ((4*(MEMLAT+2) + 7) & ~7)
-
-#define OP_T_THRES      16
-#define OPSIZ           8
-
-#define loopcnt         r14
-#define elemcnt         r15
-#define saved_pr        r16
-#define saved_lc        r17
-#define adest           r18
-#define dest            r19
-#define asrc            r20
-#define src             r21
-#define len             r22
-#define tmp2            r23
-#define tmp3            r24
-#define tmp4            r25
-#define ptable          r26
-#define ploop56         r27
-#define loopaddr        r28
-#define sh1             r29
-#define ptr1            r30
-#define ptr2            r31
-
-#define movi0           mov
-
-#define p_scr           p6
-#define p_xtr           p7
-#define p_nxtr          p8
-#define p_few           p9
-
-#if defined(USE_FLP)
-#define load            ldf8
-#define store           stf8
-#define tempreg         f6
-#define the_r           fr
-#define the_s           fs
-#define the_t           ft
-#define the_q           fq
-#define the_w           fw
-#define the_x           fx
-#define the_y           fy
-#define the_z           fz
-#elif defined(USE_INT)
-#define load            ld8
-#define store           st8
-#define tempreg         tmp2
-#define the_r           r
-#define the_s           s
-#define the_t           t
-#define the_q           q
-#define the_w           w
-#define the_x           x
-#define the_y           y
-#define the_z           z
-#endif
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-/* Manually force proper loop-alignment.  Note: be sure to
-   double-check the code-layout after making any changes to
-   this routine! */
-# define ALIGN(n)       { nop 0 }
-#else
-# define ALIGN(n)       .align n
-#endif
-
-#if defined(USE_LFETCH)
-#define LOOP(shift) \
-        ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0])  ld8.nt1 r[0] = [asrc], 8 ; \
-(p[0])  lfetch.nt1 [ptr1], 16 ; \
-        nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
-        nop.b 0 ;; \
- } { .mmb \
-(p[0])  ld8.nt1 s[0] = [asrc], 8 ; \
-(p[0])  lfetch.nt1 [ptr2], 16 ; \
-        nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
-        br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
-        br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#else
-#define LOOP(shift) \
-        ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0])  ld8.nt1 r[0] = [asrc], 8 ; \
-        nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
-        nop.b 0 ;; \
- } { .mmb \
-(p[0])  ld8.nt1 s[0] = [asrc], 8 ; \
-        nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
-        br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
-        br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#endif
-
-
-ENTRY(memcpy)
-{ .mmi
-        .prologue
-        alloc   r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
-        .rotr   r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
-        .rotp   p[MEMLAT+2]
-        .rotf   fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
-        mov     ret0 = in0              // return value = dest
-        .save   pr, saved_pr
-        movi0   saved_pr = pr           // save the predicate registers
-} { .mmi
-        and     tmp4 = 7, in0           // tmp4 = dest % 8 (destination alignment check)
-        mov     dest = in0              // dest
-        mov     src = in1               // src
-;; }
-{ .mii
-        cmp.eq  p_scr, p0 = in2, r0     // if (len == 0)
-        .save   ar.lc, saved_lc
-        movi0   saved_lc = ar.lc        // save the loop counter
-        .body
-        cmp.ge  p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
-} { .mbb
-        mov     len = in2               // len
-(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
-(p_few) br.cond.dpnt.many .copy_bytes   // Branch no. 2: copy byte by byte
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-        lfetch.nt1 [dest]               // prefetch the start of dest
-        lfetch.nt1 [src]                // prefetch the start of src
-#endif
-        shr.u   elemcnt = len, 3        // elemcnt = len / 8
-} { .mib
-        cmp.eq  p_scr, p0 = tmp4, r0    // is destination aligned?
-        sub     loopcnt = 7, tmp4       // loopcnt = 7 - (dest % 8)
-(p_scr) br.cond.dptk.many .dest_aligned
-;; }
-{ .mmi
-        ld1     tmp2 = [src], 1         // preload the first byte
-        sub     len = len, loopcnt, 1   // reduce len: len -= loopcnt + 1
-        movi0   ar.lc = loopcnt         // set the loop counter
-} { .mib
-        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid loading beyond end-point
-;; }
-
-.l0:    // ---------------------------- // L0: Align dest on 8-byte boundary
-{ .mmi
-        st1     [dest] = tmp2, 1        // store the byte loaded previously
-(p_scr) ld1     tmp2 = [src], 1         // load the next byte only if one remains
-} { .mib
-        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
-        add     loopcnt = -1, loopcnt
-        br.cloop.dptk.few .l0           // loop on ar.lc
-;; }
-
-.dest_aligned:
-{ .mmi
-        and     tmp4 = 7, src           // ready for alignment check
-        shr.u   elemcnt = len, 3        // elemcnt = len / 8
-;; }
-{ .mib
-        cmp.ne  p_scr, p0 = tmp4, r0    // is source also aligned
-        tbit.nz p_xtr, p_nxtr = src, 3  // prepare a separate move if src
-} { .mib                                // is not 16B aligned
-        add     ptr2 = LFETCH_DIST, dest // prefetch address
-        add     ptr1 = LFETCH_DIST, src
-(p_scr) br.cond.dptk.many .src_not_aligned
-;; }
-
-// The optimal case, when both dest and src are aligned
-
-.both_aligned:
-{ .mmi
-        .pred.rel "mutex",p_xtr,p_nxtr
-(p_xtr) cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
-(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
-        movi0   pr.rot = 1 << 16        // set rotating predicates
-} { .mib
-(p_scr) br.cond.dpnt.many .copy_full_words
-;; }
-
-{ .mmi
-(p_xtr) load    tempreg = [src], 8
-(p_xtr) add     elemcnt = -1, elemcnt
-        movi0   ar.ec = MEMLAT + 1      // set the epilog counter
-;; }
-{ .mmi
-(p_xtr) add     len = -8, len           // account for the "extra" word
-        add     asrc = 16, src          // one bank apart (for USE_INT)
-        shr.u   loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
-;;}
-{ .mmi
-        add     loopcnt = -1, loopcnt
-(p_xtr) store   [dest] = tempreg, 8     // copy the "extra" word
-        nop.i   0
-;; }
-{ .mib
-        add     adest = 16, dest
-        movi0   ar.lc = loopcnt         // set the loop counter
-;; }
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-        { nop 0 }
-#else
-        .align  32
-#endif
-#if defined(USE_FLP)
-.l1:    // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0])  lfetch.nt1 [ptr2],32
-#endif
-(p[0])  ldfp8   the_r[0],the_q[0] = [src], 16
-(p[0])  add     len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-(p[0])  lfetch.nt1 [ptr1],32
-#endif
-(p[0])  ldfp8   the_s[0], the_t[0] = [src], 16
-} {.mmb
-(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-        br.ctop.dptk.many .l1
-;; }
-#elif defined(USE_INT)
-.l1:    // ------------------------------- // L1: Everything a multiple of 8
-{ .mmi
-(p[0])  load    the_r[0] = [src], 8
-(p[0])  load    the_q[0] = [asrc], 8
-(p[0])  add     len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
-;; }
-{ .mmi
-(p[0])  load    the_s[0] = [src], 24
-(p[0])  load    the_t[0] = [asrc], 24
-} {.mmb
-(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-#if defined(USE_LFETCH)
-;; }
-{ .mmb
-(p[0])  lfetch.nt1 [ptr2],32
-(p[0])  lfetch.nt1 [ptr1],32
-#endif
-        br.ctop.dptk.many .l1
-;; }
-#endif
-
-.copy_full_words:
-{ .mib
-        cmp.gt  p_scr, p0 = 8, len      // fewer than 8 bytes left?
-        shr.u   elemcnt = len, 3        // elemcnt = len / 8
-(p_scr) br.cond.dpnt.many .copy_bytes
-;; }
-{ .mii
-        load    tempreg = [src], 8
-        add     loopcnt = -1, elemcnt   // loopcnt = elemcnt - 1
-;; }
-{ .mii
-        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid load beyond end-point
-        mov     ar.lc = loopcnt         // set the loop counter
-;; }
-
-.l2:    // ------------------------------- // L2: Max 4 words copied separately
-{ .mmi
-        store   [dest] = tempreg, 8
-(p_scr) load    tempreg = [src], 8      // load the next word only if one remains
-        add     len = -8, len
-} { .mib
-        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
-        add     loopcnt = -1, loopcnt
-        br.cloop.dptk.few .l2
-;; }
-
-.copy_bytes:
-{ .mib
-        cmp.eq  p_scr, p0 = len, r0     // is len == 0 ?
-        add     loopcnt = -1, len       // loopcnt = len - 1
-(p_scr) br.cond.spnt .restore_and_exit
-;; }
-{ .mii
-        ld1     tmp2 = [src], 1
-        movi0   ar.lc = loopcnt
-        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid load beyond end-point
-;; }
-
-.l3:    // ------------------------------- // L3: Final byte move
-{ .mmi
-        st1     [dest] = tmp2, 1
-(p_scr) ld1     tmp2 = [src], 1
-} { .mib
-        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
-        add     loopcnt = -1, loopcnt
-        br.cloop.dptk.few .l3
-;; }
-
-.restore_and_exit:
-{ .mmi
-        movi0   pr = saved_pr, -1       // restore the predicate registers
-;; }
-{ .mib
-        movi0   ar.lc = saved_lc        // restore the loop counter
-        br.ret.sptk.many b0
-;; }
-
-
-.src_not_aligned:
-{ .mmi
-        cmp.gt  p_scr, p0 = 16, len
-        and     sh1 = 7, src            // sh1 = src % 8
-        shr.u   loopcnt = len, 4        // loopcnt = len / 16
-} { .mib
-        add     tmp4 = @ltoff(.table), gp
-        add     tmp3 = @ltoff(.loop56), gp
-(p_scr) br.cond.dpnt.many .copy_bytes   // do byte by byte if too few
-;; }
-{ .mmi
-        and     asrc = -8, src          // asrc = src & (-8) -- align src for loop
-        add     loopcnt = -1, loopcnt   // loopcnt--
-        shl     sh1 = sh1, 3            // sh1 = 8 * (src % 8)
-} { .mmi
-        ld8     ptable = [tmp4]         // ptable = &table
-        ld8     ploop56 = [tmp3]        // ploop56 = &loop56
-        and     tmp2 = -16, len         // tmp2 = len & (-16)
-;; }
-{ .mmi
-        add     tmp3 = ptable, sh1      // tmp3 = &table + sh1
-        add     src = src, tmp2         // src += len & (-16)
-        movi0   ar.lc = loopcnt         // set LC
-;; }
-{ .mmi
-        ld8     tmp4 = [tmp3]           // tmp4 = loop offset
-        sub     len = len, tmp2         // len -= len & (-16)
-        movi0   ar.ec = MEMLAT + 2      // one more pass needed
-;; }
-{ .mmi
-        ld8     s[1] = [asrc], 8        // preload
-        sub     loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
-        movi0   pr.rot = 1 << 16        // set rotating predicates
-;; }
-{ .mib
-        nop.m   0
-        movi0   b6 = loopaddr
-        br      b6                      // jump to the appropriate loop
-;; }
-
-        LOOP(8)
-        LOOP(16)
-        LOOP(24)
-        LOOP(32)
-        LOOP(40)
-        LOOP(48)
-        LOOP(56)
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
-
-        .rodata
-        .align 8
-.table:
-        data8   0                       // dummy entry
-        data8   .loop56 - .loop8        // offset for src % 8 == 1 (shift by 8 bits)
-        data8   .loop56 - .loop16       // offset for src % 8 == 2 (shift by 16 bits)
-        data8   .loop56 - .loop24       // offset for src % 8 == 3 (shift by 24 bits)
-        data8   .loop56 - .loop32       // offset for src % 8 == 4 (shift by 32 bits)
-        data8   .loop56 - .loop40       // offset for src % 8 == 5 (shift by 40 bits)
-        data8   .loop56 - .loop48       // offset for src % 8 == 6 (shift by 48 bits)
-        data8   .loop56 - .loop56       // offset for src % 8 == 7 (shift by 56 bits)
|