diff options
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy_oryon1.S | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
index e86d8b0..cc267db 100644
--- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -152,6 +152,46 @@ L(copy96):
 
 	.p2align 6
 L(copy_long):
+	/* On oryon1 cores, large copies are helped by the non-temporal pair
+	   instructions ldnp/stnp.  The loop here mirrors the one below it,
+	   but uses ldnp/stnp.  For copies smaller than 32768 bytes, ldnp/stnp
+	   do not help and cause a slowdown, so the ldnp/stnp loop is used
+	   only for the largest sizes.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_nontemp)	/* Unsigned: count < 32768 takes the regular path.  */
+	and	tmp1, dstin, 15			/* tmp1 = low four bits of dstin.  */
+	bic	dst, dstin, 15			/* dst = dstin rounded down to a 16-byte boundary.  */
+	ldnp	D_l, D_h, [src]			/* First 16 bytes, read before src is adjusted.  */
+	sub	src, src, tmp1			/* Back src up by the same distance as dst.  */
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldnp	A_l, A_h, [src, 16]
+	stnp	D_l, D_h, [dstin]		/* First 16 bytes go to the original dstin, not the rounded dst.  */
+	ldnp	B_l, B_h, [src, 32]
+	ldnp	C_l, C_h, [src, 48]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+
+L(nontemp_loop64):	/* Each stnp writes the pair loaded on the previous pass (or in the prologue above).  */
+	tbz	src, #6, 1f	/* NOTE(review): 1f is the very next instruction, so both outcomes fall through - confirm this is intentional.  */
+1:
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [src, 16]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [src, 32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [src, 48]
+	stnp	D_l, D_h, [dst, 64]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hi	L(nontemp_loop64)	/* Unsigned: repeat while count was above 64 before the subtraction.  */
+	b	L(last64)		/* L(last64) is defined outside this hunk; presumably it finishes the tail - verify.  */
+
+L(copy_long_without_nontemp):
+
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src] |