Diffstat (limited to 'newlib/libc/machine/riscv')
-rw-r--r--  newlib/libc/machine/riscv/Makefile.inc    |   4
-rw-r--r--  newlib/libc/machine/riscv/memchr.c        | 152
-rw-r--r--  newlib/libc/machine/riscv/memcpy-asm.S    |  12
-rw-r--r--  newlib/libc/machine/riscv/memcpy.c        | 163
-rw-r--r--  newlib/libc/machine/riscv/memmove-asm.S   |  40
-rw-r--r--  newlib/libc/machine/riscv/memmove-stub.c  |  14
-rw-r--r--  newlib/libc/machine/riscv/memmove.S       |  40
-rw-r--r--  newlib/libc/machine/riscv/memmove.c       | 259
-rw-r--r--  newlib/libc/machine/riscv/memrchr.c       | 172
-rw-r--r--  newlib/libc/machine/riscv/memset.S        | 349
-rw-r--r--  newlib/libc/machine/riscv/rv_string.h     |  51
-rw-r--r--  newlib/libc/machine/riscv/setjmp.S        |  78
-rw-r--r--  newlib/libc/machine/riscv/strcmp.S        | 198
-rw-r--r--  newlib/libc/machine/riscv/strlen.c        |  19
-rw-r--r--  newlib/libc/machine/riscv/xlenint.h       |   7
15 files changed, 1240 insertions, 318 deletions
diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc index 4d6c046..3cc6e19 100644 --- a/newlib/libc/machine/riscv/Makefile.inc +++ b/newlib/libc/machine/riscv/Makefile.inc @@ -1,3 +1,3 @@ libc_a_SOURCES += \ - %D%/memmove.S %D%/memmove-stub.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \ - %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c + %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \ + %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c diff --git a/newlib/libc/machine/riscv/memchr.c b/newlib/libc/machine/riscv/memchr.c new file mode 100644 index 0000000..62a7d19 --- /dev/null +++ b/newlib/libc/machine/riscv/memchr.c @@ -0,0 +1,152 @@ +/* +FUNCTION + <<memchr>>---find character in memory + +INDEX + memchr + +SYNOPSIS + #include <string.h> + void *memchr(const void *<[src]>, int <[c]>, size_t <[length]>); + +DESCRIPTION + This function searches memory starting at <<*<[src]>>> for the + character <[c]>. The search only ends with the first + occurrence of <[c]>, or after <[length]> characters; in + particular, <<NUL>> does not terminate the search. + +RETURNS + If the character <[c]> is found within <[length]> characters + of <<*<[src]>>>, a pointer to the character is returned. If + <[c]> is not found, then <<NULL>> is returned. + +PORTABILITY +<<memchr>> is ANSI C. + +<<memchr>> requires no supporting OS subroutines. + +QUICKREF + memchr ansi pure +*/ + +#include <sys/asm.h> +#include <stddef.h> +#include "rv_string.h" + +// Move size +#if __riscv_zilsd +#define MV_SZ 8 +#else +#define MV_SZ SZREG +#endif + + +void * +memchr (const void *src_void, + int c, + size_t length) +{ + const unsigned char *src = (const unsigned char *) src_void; + unsigned char d = c; + +#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) + size_t align = (uintptr_t) src & (MV_SZ - 1); + + if (align) + { + align = MV_SZ - align; + + if (length < align) align = length; + + switch (align) + { +#if MV_SZ == 8 + case 7: + if (*src++ == d) return (void *) (src - 1); + case 6: + if (*src++ == d) return (void *) (src - 1); + case 5: + if (*src++ == d) return (void *) (src - 1); + case 4: + if (*src++ == d) return (void *) (src - 1); +#endif /* MV_SZ == 8 */ + case 3: + if (*src++ == d) return (void *) (src - 1); + case 2: + if (*src++ == d) return (void *) (src - 1); + case 1: + if (*src++ == d) return (void *) (src - 1); + } + + length -= align; + } + + const unsigned char *end_addr = src + (length & ~(MV_SZ - 1)); + + if (src < end_addr) + { + uintxlen_t mask = __libc_splat_byte(d); + + do + { + uintlslen_t val = *(uintlslen_t*) src; + uintxlen_t word1 = val ^ mask; + + if (__libc_detect_null(word1)) + { +#if __riscv_zbb + word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1); + word1 = __LIBC_RISCV_ZBB_CNT_Z(word1); + + return (void *) (src + (word1 >> 3)); +#else /* not __riscv_zbb */ + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); +#if __riscv_xlen == 64 + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); +#endif /* __riscv_xlen == 64 */ + return (void *) src; +#endif /* __riscv_zbb */ + } +#if __riscv_zilsd + uintxlen_t word2 = (val >> 32); + word2 ^= mask; + + if (__libc_detect_null(word2)) + { + src += MV_SZ / 2; +#if 
__riscv_zbb + word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2); + word2 = __LIBC_RISCV_ZBB_CNT_Z(word2); + + return (void *) (src + (word2 >> 3)); +#else /* not __riscv_zbb */ + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); + if (*src++ == d) return (void *) (src - 1); + return (void *) src; +#endif /* __riscv_zbb */ + } +#endif /* __riscv_zilsd */ + + src += MV_SZ; + } while (src < end_addr); + + length &= MV_SZ - 1; + } + +#endif /* not PREFER_SIZE_OVER_SPEED */ + + while (length--) + { + if (*src == d) + return (void *) src; + src++; + } + + return NULL; +} diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S index 5571e47..2771285 100644 --- a/newlib/libc/machine/riscv/memcpy-asm.S +++ b/newlib/libc/machine/riscv/memcpy-asm.S @@ -14,15 +14,15 @@ .global memcpy .type memcpy, @function memcpy: - mv t1, a0 + mv a3, a0 beqz a2, 2f 1: - lb t2, 0(a1) - sb t2, 0(t1) - add a2, a2, -1 - add t1, t1, 1 - add a1, a1, 1 + lbu a4, 0(a1) + sb a4, 0(a3) + addi a2, a2, -1 + addi a3, a3, 1 + addi a1, a1, 1 bnez a2, 1b 2: diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c index e1a34a8..a27e0ec 100644 --- a/newlib/libc/machine/riscv/memcpy.c +++ b/newlib/libc/machine/riscv/memcpy.c @@ -1,4 +1,5 @@ /* Copyright (c) 2017 SiFive Inc. All rights reserved. + Copyright (c) 2025 Mahmoud Abumandour <ma.mandourr@gmail.com> This copyrighted material is made available to anyone wishing to use, modify, copy, or redistribute it subject to the terms and conditions @@ -10,83 +11,137 @@ */ #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) -//memcpy defined in memcpy-asm.S +// memcpy defined in memcpy-asm.S #else -#include <string.h> -#include <stdint.h> #include "../../string/local.h" +#include "xlenint.h" +#include <string.h> +#include <sys/asm.h> #define unlikely(X) __builtin_expect (!!(X), 0) -void * -__inhibit_loop_to_libcall -memcpy(void *__restrict aa, const void *__restrict bb, size_t n) +static inline void +__libc_memcpy_bytewise (unsigned char *dst, const unsigned char *src, + const size_t sz) { - #define BODY(a, b, t) { \ - t tt = *b; \ - a++, b++; \ - *(a - 1) = tt; \ - } + const unsigned char *end = dst + sz; + while (dst < end) + *dst++ = *src++; +} - char *a = (char *)aa; - const char *b = (const char *)bb; - char *end = a + n; - uintptr_t msk = sizeof (long) - 1; -#if __riscv_misaligned_slow || __riscv_misaligned_fast - if (n < sizeof (long)) -#else - if (unlikely ((((uintptr_t)a & msk) != ((uintptr_t)b & msk)) - || n < sizeof (long))) +#ifndef __riscv_misaligned_fast +static uintxlen_t +__libc_load_xlen (const void *src) +{ + const unsigned char *p = (const unsigned char *)src; + uintxlen_t ret = 0; + unsigned char b0 = *p++; + unsigned char b1 = *p++; + unsigned char b2 = *p++; + unsigned char b3 = *p++; + ret = (uintxlen_t)b0 | ((uintxlen_t)b1 << 8) | ((uintxlen_t)b2 << 16) + | ((uintxlen_t)b3 << 24); +#if __riscv_xlen == 64 + unsigned char b4 = *p++; + unsigned char b5 = *p++; + unsigned char b6 = *p++; + unsigned char b7 = *p++; + ret |= ((uintxlen_t)b4 << 32) | ((uintxlen_t)b5 << 40) + | ((uintxlen_t)b6 << 48) | ((uintxlen_t)b7 << 56); +#endif + return ret; +} #endif + +void * +__inhibit_loop_to_libcall +memcpy (void *__restrict aa, const void *__restrict bb, size_t n) +{ + unsigned char *a = (unsigned char *)aa; + const unsigned char *b = (const unsigned char *)bb; + unsigned char *end = a + n; + uintptr_t msk = SZREG - 1; + if (n < SZREG) { -small: if (__builtin_expect (a < 
end, 1)) - while (a < end) - BODY (a, b, char); + __libc_memcpy_bytewise (a, b, n); return aa; } +/* + * If misaligned access is slow or prohibited, and the alignments of the source + * and destination are different, we align the destination to do XLEN stores. + * This uses only one aligned store for every four (or eight for XLEN == 64) + * bytes of data. + */ +#ifndef __riscv_misaligned_fast + if (unlikely ((((uintptr_t)a & msk) != ((uintptr_t)b & msk)))) + { + size_t dst_pad = (uintptr_t)a & msk; + dst_pad = (SZREG - dst_pad) & msk; + __libc_memcpy_bytewise (a, b, dst_pad); + a += dst_pad; + b += dst_pad; + + uintxlen_t *la = (uintxlen_t *)a; + const unsigned char *cb = (const unsigned char *)b; + uintxlen_t *lend = (uintxlen_t *)((uintptr_t)end & ~msk); + + while (la < lend) + { + *la++ = __libc_load_xlen (cb); + cb += SZREG; + } + a = (unsigned char *)la; + b = (const unsigned char *)cb; + if (unlikely (a < end)) + __libc_memcpy_bytewise (a, b, end - a); + return aa; + } +#endif + if (unlikely (((uintptr_t)a & msk) != 0)) - while ((uintptr_t)a & msk) - BODY (a, b, char); + { + size_t pad = SZREG - ((uintptr_t)a & msk); + __libc_memcpy_bytewise (a, b, pad); + a += pad; + b += pad; + } - long *la = (long *)a; - const long *lb = (const long *)b; - long *lend = (long *)((uintptr_t)end & ~msk); + uintxlen_t *la = (uintxlen_t *)a; + const uintxlen_t *lb = (const uintxlen_t *)b; + uintxlen_t *lend = (uintxlen_t *)((uintptr_t)end & ~msk); if (unlikely (lend - la > 8)) { while (lend - la > 8) - { - long b0 = *lb++; - long b1 = *lb++; - long b2 = *lb++; - long b3 = *lb++; - long b4 = *lb++; - long b5 = *lb++; - long b6 = *lb++; - long b7 = *lb++; - long b8 = *lb++; - *la++ = b0; - *la++ = b1; - *la++ = b2; - *la++ = b3; - *la++ = b4; - *la++ = b5; - *la++ = b6; - *la++ = b7; - *la++ = b8; - } + { + uintxlen_t b0 = *lb++; + uintxlen_t b1 = *lb++; + uintxlen_t b2 = *lb++; + uintxlen_t b3 = *lb++; + uintxlen_t b4 = *lb++; + uintxlen_t b5 = *lb++; + uintxlen_t b6 = *lb++; + uintxlen_t b7 = *lb++; + uintxlen_t b8 = *lb++; + *la++ = b0; + *la++ = b1; + *la++ = b2; + *la++ = b3; + *la++ = b4; + *la++ = b5; + *la++ = b6; + *la++ = b7; + *la++ = b8; + } } - while (la < lend) - BODY (la, lb, long); - - a = (char *)la; - b = (const char *)lb; + a = (unsigned char *)la; + b = (const unsigned char *)lb; if (unlikely (a < end)) - goto small; + __libc_memcpy_bytewise (a, b, end - a); return aa; } #endif diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S new file mode 100644 index 0000000..061472c --- /dev/null +++ b/newlib/libc/machine/riscv/memmove-asm.S @@ -0,0 +1,40 @@ +/* Copyright (c) 2019 SiFive Inc. All rights reserved. + + This copyrighted material is made available to anyone wishing to use, + modify, copy, or redistribute it subject to the terms and conditions + of the FreeBSD License. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + including the implied warranties of MERCHANTABILITY or FITNESS FOR + A PARTICULAR PURPOSE. A copy of this license is available at + http://www.opensource.org/licenses. 
+*/ + +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +.text +.global memmove +.type memmove, @function +memmove: + beqz a2, .Ldone /* in case there are 0 bytes to be copied, return immediately */ + + mv a4, a0 /* copy the destination address over to a4, since memmove should return that address in a0 at the end */ + li a3, 1 + bgtu a1, a0, .Lcopy /* in case of source address > destination address, copy from start to end of the specified memory area */ + + li a3, -1 /* otherwhise, start copying from the end of the specified memory area in order to prevent data loss in case of overlapping memory areas.*/ + add a4, a4, a2 /* add the number of bytes to be copied to both addresses. this gives us the address one byte past the end of the memory area we want to copy, */ + add a1, a1, a2 /* therefore we need to subtract 1 from both addresses in the next step before starting the copying process. */ + +.Lincrement: + add a4, a4, a3 /* in case of source address < destination address, increment both addresses by -1 before copying any data to obtain the correct start addresses */ + add a1, a1, a3 +.Lcopy: + lbu a5, 0(a1) + addi a2, a2, -1 /* copy bytes as long as a2 (= the number of bytes to be copied) > 0. the increment is done here to relax the RAW dependency between load and store */ + sb a5, 0(a4) + bnez a2, .Lincrement + +.Ldone: + ret + + .size memmove, .-memmove +#endif diff --git a/newlib/libc/machine/riscv/memmove-stub.c b/newlib/libc/machine/riscv/memmove-stub.c deleted file mode 100644 index d882e46..0000000 --- a/newlib/libc/machine/riscv/memmove-stub.c +++ /dev/null @@ -1,14 +0,0 @@ -/* Copyright (c) 2019 SiFive Inc. All rights reserved. - - This copyrighted material is made available to anyone wishing to use, - modify, copy, or redistribute it subject to the terms and conditions - of the FreeBSD License. This program is distributed in the hope that - it will be useful, but WITHOUT ANY WARRANTY expressed or implied, - including the implied warranties of MERCHANTABILITY or FITNESS FOR - A PARTICULAR PURPOSE. A copy of this license is available at - http://www.opensource.org/licenses. -*/ - -#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) -#include "../../string/memmove.c" -#endif diff --git a/newlib/libc/machine/riscv/memmove.S b/newlib/libc/machine/riscv/memmove.S deleted file mode 100644 index 66d9cd4..0000000 --- a/newlib/libc/machine/riscv/memmove.S +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2019 SiFive Inc. All rights reserved. - - This copyrighted material is made available to anyone wishing to use, - modify, copy, or redistribute it subject to the terms and conditions - of the FreeBSD License. This program is distributed in the hope that - it will be useful, but WITHOUT ANY WARRANTY expressed or implied, - including the implied warranties of MERCHANTABILITY or FITNESS FOR - A PARTICULAR PURPOSE. A copy of this license is available at - http://www.opensource.org/licenses. 
-*/ - -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) -.text -.global memmove -.type memmove, @function -memmove: - beqz a2, 2f - - mv t1, a0 - li a3, 1 - bgtu a1, a0, 1f - - li a3, -1 - addi a4, a2 , -1 - add t1, t1, a4 - add a1, a1, a4 - -1: - lb t2, 0(a1) - sb t2, 0(t1) - add a2, a2, -1 - add t1, t1, a3 - add a1, a1, a3 - bnez a2, 1b - -2: - ret - - .size memmove, .-memmove -#endif diff --git a/newlib/libc/machine/riscv/memmove.c b/newlib/libc/machine/riscv/memmove.c new file mode 100644 index 0000000..209a75c --- /dev/null +++ b/newlib/libc/machine/riscv/memmove.c @@ -0,0 +1,259 @@ +/* Copyright (c) 2019 SiFive Inc. All rights reserved. + Copyright (c) 2025 Marlene Fally <marlene.fally@gmail.com> + + This copyrighted material is made available to anyone wishing to use, + modify, copy, or redistribute it subject to the terms and conditions + of the FreeBSD License. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + including the implied warranties of MERCHANTABILITY or FITNESS FOR + A PARTICULAR PURPOSE. A copy of this license is available at + http://www.opensource.org/licenses. +*/ + +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +/* memmove defined in memmove-asm.S */ +#else + +#include "../../string/local.h" +#include "sys/asm.h" +#include "xlenint.h" +#include <limits.h> +#include <stddef.h> +#include <string.h> + +static inline uint8_t +__libc_fast_xlen_aligned (void *dst, const void *src) +{ +#if defined(__riscv_misaligned_fast) + return 1; +#else + return !(((uintxlen_t)src & (SZREG - 1)) | ((uintxlen_t)dst & (SZREG - 1))); +#endif +} + +#if !defined(__riscv_misaligned_fast) +static inline void +__libc_memmove_misaligned_copy (unsigned char *dst, + const uintxlen_t *aligned_src) +{ + uintxlen_t src_xlen = *aligned_src; + + *dst++ = (unsigned char)(src_xlen); + *dst++ = (unsigned char)(src_xlen >> 8); + *dst++ = (unsigned char)(src_xlen >> 16); + *dst++ = (unsigned char)(src_xlen >> 24); +#if __riscv_xlen == 64 + *dst++ = (unsigned char)(src_xlen >> 32); + *dst++ = (unsigned char)(src_xlen >> 40); + *dst++ = (unsigned char)(src_xlen >> 48); + *dst++ = (unsigned char)(src_xlen >> 56); +#endif +} +#endif + +static inline void +__libc_aligned_copy_unrolled (uintxlen_t *aligned_dst, + const uintxlen_t *aligned_src) +{ + uintxlen_t dst0 = *aligned_src++; + uintxlen_t dst1 = *aligned_src++; + uintxlen_t dst2 = *aligned_src++; + uintxlen_t dst3 = *aligned_src++; + uintxlen_t dst4 = *aligned_src++; + uintxlen_t dst5 = *aligned_src++; + uintxlen_t dst6 = *aligned_src++; + uintxlen_t dst7 = *aligned_src++; + uintxlen_t dst8 = *aligned_src; + + *aligned_dst++ = dst0; + *aligned_dst++ = dst1; + *aligned_dst++ = dst2; + *aligned_dst++ = dst3; + *aligned_dst++ = dst4; + *aligned_dst++ = dst5; + *aligned_dst++ = dst6; + *aligned_dst++ = dst7; + *aligned_dst = dst8; +} + +static inline void +__libc_memmove_bytewise_forward_copy (unsigned char *dst, + const unsigned char *src, size_t length) +{ + while (length--) + { + *dst++ = *src++; + } +} + +void *__inhibit_loop_to_libcall +memmove (void *dst_void, const void *src_void, size_t length) +{ + unsigned char *dst = dst_void; + const unsigned char *src = src_void; + uintxlen_t *aligned_dst; + const uintxlen_t *aligned_src; + + if (src <= dst) + { + if (dst < src + length) /* Memory areas overlap destructively, have to + copy backwards. 
*/ + { + src += length; + dst += length; + + if (length >= SZREG) + { + if (__libc_fast_xlen_aligned (dst, src)) + { + aligned_dst = (uintxlen_t *)dst; + aligned_src = (uintxlen_t *)src; + + /* If possible, unroll the word-copy loop by a factor 9 to + match memcpy. This speeds up the copying process for + longer lengths while barely degrading performance for + lengths < SZREG*9. Since we are copying backwards, + decrement the addresses before copying. + */ + while (length >= SZREG * 9) + { + aligned_dst -= 9; + aligned_src -= 9; + __libc_aligned_copy_unrolled (aligned_dst, aligned_src); + length -= (SZREG * 9); + } + + while (length >= SZREG) + { + *--aligned_dst = *--aligned_src; + length -= SZREG; + } + + /* Pick up any residual with a byte copier. */ + dst = (unsigned char *)aligned_dst; + src = (unsigned char *)aligned_src; + } +#if !defined(__riscv_misaligned_fast) + else if (length > (SZREG * 2)) + { + /* At least one address is not xlen-aligned. If + misaligned accesses are slow or prohibited, + align the src so we can load SZREG bytes at a time. + This reduces the amount of memory accesses made + and therefore improves performance. + */ + while ((uintxlen_t)src & (SZREG - 1)) + { + *--dst = *--src; + length--; + } + + aligned_src = (uintxlen_t *)src; + + /* Decrement the addresses before copying since + we are copying backwards. */ + do + { + aligned_src--; + dst -= SZREG; + __libc_memmove_misaligned_copy (dst, aligned_src); + length -= SZREG; + } + while (length >= SZREG); + + /* Pick up any residual with a byte copier. */ + src = (unsigned char *)aligned_src; + } +#endif + } + while (length--) + { + *--dst = *--src; + } + + return dst_void; + } + } + else if (src < dst + length) /* Memory areas overlap non-destructively. */ + { + if (length >= SZREG) + { + if (__libc_fast_xlen_aligned (dst, src)) + { + aligned_dst = (uintxlen_t *)dst; + aligned_src = (uintxlen_t *)src; + + /* If possible, unroll the word-copy loop by a factor 9 to + match memcpy. This speeds up the copying process for longer + lengths while barely degrading performance for lengths < + SZREG*9. + */ + while (length >= SZREG * 9) + { + __libc_aligned_copy_unrolled (aligned_dst, aligned_src); + aligned_dst += 9; + aligned_src += 9; + length -= (SZREG * 9); + } + + while (length >= SZREG) + { + *aligned_dst++ = *aligned_src++; + length -= SZREG; + } + + /* Pick up any residual with a byte copier. */ + dst = (unsigned char *)aligned_dst; + src = (unsigned char *)aligned_src; + } +#if !defined(__riscv_misaligned_fast) + else if (length > (SZREG * 2)) + { + /* At least one address is not xlen-aligned. If + misaligned accesses are slow or prohibited, + align the src so we can load SZREG bytes at a time. + This reduces the amount of memory accesses made + and therefore improves performance. + */ + while ((uintxlen_t)src & (SZREG - 1)) + { + *dst++ = *src++; + length--; + } + + aligned_src = (uintxlen_t *)src; + + do + { + __libc_memmove_misaligned_copy (dst, aligned_src); + aligned_src++; + dst += SZREG; + length -= SZREG; + } + while (length >= SZREG); + + /* Pick up any residual with a byte copier. */ + src = (unsigned char *)aligned_src; + } +#endif + } + + __libc_memmove_bytewise_forward_copy (dst, src, length); + return dst_void; + } + + /* Memory areas do not overlap, redirect to memcpy. + Copy byte-by-byte for lengths <= SZREG to reduce + overhead on very short copies. 
+ */ + if (length > SZREG) + { + return memcpy (dst_void, src_void, length); + } + else + { + __libc_memmove_bytewise_forward_copy (dst, src, length); + return dst_void; + } +} +#endif diff --git a/newlib/libc/machine/riscv/memrchr.c b/newlib/libc/machine/riscv/memrchr.c new file mode 100644 index 0000000..47e1023 --- /dev/null +++ b/newlib/libc/machine/riscv/memrchr.c @@ -0,0 +1,172 @@ +/* +FUNCTION + <<memrchr>>---reverse search for character in memory + +INDEX + memrchr + +SYNOPSIS + #include <string.h> + void *memrchr(const void *<[src]>, int <[c]>, size_t <[length]>); + +DESCRIPTION + This function searches memory starting at <[length]> bytes + beyond <<*<[src]>>> backwards for the character <[c]>. + The search only ends with the first occurrence of <[c]>; in + particular, <<NUL>> does not terminate the search. + +RETURNS + If the character <[c]> is found within <[length]> characters + of <<*<[src]>>>, a pointer to the character is returned. If + <[c]> is not found, then <<NULL>> is returned. + +PORTABILITY +<<memrchr>> is a GNU extension. + +<<memrchr>> requires no supporting OS subroutines. + +QUICKREF + memrchr +*/ + +#include <sys/asm.h> +#include <stddef.h> +#include "rv_string.h" + +// Move size +#if __riscv_zilsd +#define MV_SZ 8 + +// Offset is only 4 bytes for Zilsd/Zclsd since each register is 32 bits +#define OFFSET 4 +#else +#define MV_SZ SZREG +#define OFFSET SZREG +#endif + + +void * +memrchr (const void *src_void, + int c, + size_t length) +{ + const unsigned char *src = (const unsigned char *) src_void; + unsigned char d = c; + + if (length) src += length - 1; + +#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) + + /* + We add one to the address because even if an address is already aligned, + when loading words the bytes preceding this address are read, so check + the single byte. 
+ + If the address has all the least significant bits set equaling MV_SZ - 1, + and has a length of at least MV_SZ, we can read a word starting from + src & ~(MV_SZ - 1) because no alignment is actually required + */ + size_t align = (uintptr_t) (src + 1) & (MV_SZ - 1); + + if (align) + { + if (length < align) align = length; + + switch (align) + { +#if MV_SZ == 8 + case 7: + if (*src-- == d) return (void *) (src + 1); + case 6: + if (*src-- == d) return (void *) (src + 1); + case 5: + if (*src-- == d) return (void *) (src + 1); + case 4: + if (*src-- == d) return (void *) (src + 1); +#endif /* MV_SZ == 8 */ + case 3: + if (*src-- == d) return (void *) (src + 1); + case 2: + if (*src-- == d) return (void *) (src + 1); + case 1: + if (*src-- == d) return (void *) (src + 1); + } + + length -= align; + } + + const unsigned char *end_addr = src - (length & ~(MV_SZ - 1)); + + if (src > end_addr) + { + src -= MV_SZ - 1; + + uintxlen_t mask = __libc_splat_byte(d); + + do + { + uintlslen_t val = *(uintlslen_t*) src; + +#if __riscv_zilsd + uintxlen_t word2 = val >> 32; + word2 ^= mask; + + if (__libc_detect_null(word2)) + { +#if __riscv_zbb + src += OFFSET; + word2 = ~__LIBC_RISCV_ZBB_ORC_B(word2); + word2 = __LIBC_RISCV_ZBB_CNT_Z_REV(word2); + + return (void *) (src + OFFSET - 1 - (word2 >> 3)); +#else /* not __riscv_zbb */ + src += MV_SZ - 1; + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); + return (void *) src; +#endif /* __riscv_zbb */ + } +#endif /* __riscv_zilsd */ + uintxlen_t word1 = val ^ mask; + + if (__libc_detect_null(word1)) + { +#if __riscv_zbb + word1 = ~__LIBC_RISCV_ZBB_ORC_B(word1); + word1 = __LIBC_RISCV_ZBB_CNT_Z_REV(word1); + + return (void *) (src + OFFSET - 1 - (word1 >> 3)); +#else /* not __riscv_zbb */ + src += OFFSET - 1; + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); +#if __riscv_xlen == 64 + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); + if (*src-- == d) return (void *) (src + 1); +#endif /* __riscv_xlen == 64 */ + return (void *) src; +#endif /* __riscv_zbb */ + } + + src -= MV_SZ; + } while (src > end_addr); + + length &= MV_SZ - 1; + src = end_addr; + } + +#endif /* not PREFER_SIZE_OVER_SPEED */ + + while (length--) + { + if (*src == d) + return (void *) src; + src--; + } + + return NULL; +} diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S index a717ae7..533f667 100644 --- a/newlib/libc/machine/riscv/memset.S +++ b/newlib/libc/machine/riscv/memset.S @@ -9,105 +9,296 @@ http://www.opensource.org/licenses. 
*/ +#include <sys/asm.h> + + +#define BYTE_TBL_SZ 31 +#define WORD_TBL_SZ 32 + +#if __riscv_zilsd +/* Move size */ +#define MV_SZ 8 + +/* Store instruction */ +#define RG_ST sd + +/* Zilsd and Zclsd require an even numbered register */ +#define REG_SPLAT a4 +#else +#define MV_SZ SZREG +#define RG_ST REG_S +#define REG_SPLAT a1 +#endif + +/* + Use an extended register for Zilsd and Zclsd if available + since a5 is used for the odd numbered register, in order + to eliminate an li instruction +*/ +#if __riscv_zilsd && !__riscv_abi_rve +#define REG_TABLE a6 +#else +#define REG_TABLE a5 +#endif + + .text .global memset -.type memset, @function +.type memset, @function + +/* void *memset(void *s, int c, size_t n); */ + + memset: #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) - mv t1, a0 - beqz a2, 2f + mv a3, a0 + beqz a2, .Ldone -1: - sb a1, 0(t1) - add a2, a2, -1 - add t1, t1, 1 - bnez a2, 1b +.Lset: + sb a1, 0(a3) + addi a2, a2, -1 + addi a3, a3, 1 + bnez a2, .Lset -2: +.Ldone: ret #else - li t1, 15 - move a4, a0 - bleu a2, t1, .Ltiny - and a5, a4, 15 - bnez a5, .Lmisaligned + li REG_TABLE, BYTE_TBL_SZ + mv a3, a0 + + /* If there aren't many bytes, copy them individually to reduce overhead */ + bleu a2, REG_TABLE, .Lcopy_bytes + + and a4, a3, MV_SZ - 1 + beqz a4, .Lword_check + + /* + Jump into the byte table depending on the number of bytes that need to be + written + */ +1: + auipc t0, %pcrel_hi(.Ltable_misaligned) + + /* + Instructions in the tables are forced to be four bytes, so scale count + by 4 + */ +#if __riscv_zba + sh2add t0, a4, t0 +#else + sll t1, a4, 2 + add t0, t0, t1 +#endif -.Laligned: - bnez a1, .Lwordify + /* Save the return address because we aren't exiting the function yet */ + mv t1, ra + jalr t0, %pcrel_lo(1b) -.Lwordified: - and a3, a2, ~15 - and a2, a2, 15 - add a3, a3, a4 + /* Update pointer and count by what was written */ + mv ra, t1 + add a4, a4, -MV_SZ + add a2, a2, a4 + sub a3, a3, a4 + /* Access is now aligned. Check we can copy words. */ + bleu a2, REG_TABLE, .Lcopy_bytes + +.Lword_check: + /* Don't need to splat special case of zero */ + bnez a1, .Lsplat_byte +#if __riscv_zilsd + mv REG_SPLAT, a1 +#endif + j .Lcopy_words_init + +/* + Align labels to four bytes after unconditional jumps to avoid any + penalties when jumping to 32-bit instructions that aren't 4-byte + aligned +*/ +.p2align 2 +.Lsplat_byte: +#if __riscv_zbkb + packh REG_SPLAT, a1, a1 #if __riscv_xlen == 64 -1:sd a1, 0(a4) - sd a1, 8(a4) + packw REG_SPLAT, REG_SPLAT, REG_SPLAT +#endif + pack REG_SPLAT, REG_SPLAT, REG_SPLAT #else -1:sw a1, 0(a4) - sw a1, 4(a4) - sw a1, 8(a4) - sw a1, 12(a4) + and a1, a1, 0xFF + sll t0, a1, 8 + or a1, a1, t0 + sll t0, a1, 16 + or REG_SPLAT, a1, t0 +#if __riscv_xlen == 64 + sll t0, REG_SPLAT, 32 + or REG_SPLAT, REG_SPLAT, t0 +#endif #endif - add a4, a4, 16 - bltu a4, a3, 1b - bnez a2, .Ltiny - ret +.Lcopy_words_init: +#if __riscv_zilsd + /* Odd register of even-odd pair */ + mv a5, REG_SPLAT +#endif + + /* Calculate end address */ + and t0, a2, ~(MV_SZ - 1) + add t1, a3, t0 + + /* + The idea behind the table of word copies is that first we calculate any + remainder of bytes that need to be copied by the table that aren't an + entire table length. That's copied first. After that, runs of the entire + table are performed. 
+ */ + and t0, t0, (WORD_TBL_SZ - 1) * MV_SZ + + /* Skip if there's no remainder */ + beqz t0, .Ltable_bigly + neg t0, t0 + add t0, t0, WORD_TBL_SZ * MV_SZ + + /* Adjust start address with offset */ + sub a3, a3, t0 + +1: + auipc t2, %pcrel_hi(.Ltable_bigly) + +#if MV_SZ == 8 + /* + If eight bytes are being copied with each store, we need to divide + the table offset in half + */ + srl t0, t0, 1 +#endif + + add t2, t2, t0 + jr t2, %pcrel_lo(1b) -.Ltiny: - sub a3, t1, a2 - sll a3, a3, 2 -1:auipc t0, %pcrel_hi(.Ltable) - add a3, a3, t0 +.p2align 2 +.Ltable_bigly: +/* + Force the instructions to be four bytes to avoid an extra instruction + that would be needed to halve the offset for sw +*/ .option push .option norvc -.Ltable_misaligned: - jr a3, %pcrel_lo(1b) -.Ltable: - sb a1,14(a4) - sb a1,13(a4) - sb a1,12(a4) - sb a1,11(a4) - sb a1,10(a4) - sb a1, 9(a4) - sb a1, 8(a4) - sb a1, 7(a4) - sb a1, 6(a4) - sb a1, 5(a4) - sb a1, 4(a4) - sb a1, 3(a4) - sb a1, 2(a4) - sb a1, 1(a4) - sb a1, 0(a4) + RG_ST REG_SPLAT, MV_SZ*0(a3) + RG_ST REG_SPLAT, MV_SZ*1(a3) + RG_ST REG_SPLAT, MV_SZ*2(a3) + RG_ST REG_SPLAT, MV_SZ*3(a3) + RG_ST REG_SPLAT, MV_SZ*4(a3) + RG_ST REG_SPLAT, MV_SZ*5(a3) + RG_ST REG_SPLAT, MV_SZ*6(a3) + RG_ST REG_SPLAT, MV_SZ*7(a3) + RG_ST REG_SPLAT, MV_SZ*8(a3) + RG_ST REG_SPLAT, MV_SZ*9(a3) + RG_ST REG_SPLAT, MV_SZ*10(a3) + RG_ST REG_SPLAT, MV_SZ*11(a3) + RG_ST REG_SPLAT, MV_SZ*12(a3) + RG_ST REG_SPLAT, MV_SZ*13(a3) + RG_ST REG_SPLAT, MV_SZ*14(a3) + RG_ST REG_SPLAT, MV_SZ*15(a3) + RG_ST REG_SPLAT, MV_SZ*16(a3) + RG_ST REG_SPLAT, MV_SZ*17(a3) + RG_ST REG_SPLAT, MV_SZ*18(a3) + RG_ST REG_SPLAT, MV_SZ*19(a3) + RG_ST REG_SPLAT, MV_SZ*20(a3) + RG_ST REG_SPLAT, MV_SZ*21(a3) + RG_ST REG_SPLAT, MV_SZ*22(a3) + RG_ST REG_SPLAT, MV_SZ*23(a3) + RG_ST REG_SPLAT, MV_SZ*24(a3) + RG_ST REG_SPLAT, MV_SZ*25(a3) + RG_ST REG_SPLAT, MV_SZ*26(a3) + RG_ST REG_SPLAT, MV_SZ*27(a3) + RG_ST REG_SPLAT, MV_SZ*28(a3) + RG_ST REG_SPLAT, MV_SZ*29(a3) + RG_ST REG_SPLAT, MV_SZ*30(a3) + RG_ST REG_SPLAT, MV_SZ*31(a3) .option pop - ret -.Lwordify: - and a1, a1, 0xFF - sll a3, a1, 8 - or a1, a1, a3 - sll a3, a1, 16 - or a1, a1, a3 -#if __riscv_xlen == 64 - sll a3, a1, 32 - or a1, a1, a3 + /* Update the pointer and copy data if needed */ + add a3, a3, MV_SZ * WORD_TBL_SZ + bltu a3, t1, .Ltable_bigly + + /* Copy any remaining bytes */ + and a2, a2, MV_SZ - 1 + beqz a2, .Lexit + +#if __riscv_zilsd && __riscv_abi_rve + /* Restore table size if necessary */ + li REG_TABLE, BYTE_TBL_SZ #endif - j .Lwordified - -.Lmisaligned: - sll a3, a5, 2 -1:auipc t0, %pcrel_hi(.Ltable_misaligned) - add a3, a3, t0 - mv t0, ra - jalr a3, %pcrel_lo(1b) - mv ra, t0 - - add a5, a5, -16 - sub a4, a4, a5 - add a2, a2, a5 - bleu a2, t1, .Ltiny - j .Laligned + +.Lcopy_bytes: + auipc t0, %pcrel_hi(.Ltable_tiny) + + sub a2, REG_TABLE, a2 + + /* + Instructions in the tables are forced to be four bytes, so scale count + by 4 + */ +#if __riscv_zba + sh2add t0, a2, t0 +#else + sll a2, a2, 2 + add t0, t0, a2 +#endif + + /* Don't save the return address because we're exiting after the jump */ + jr t0, %pcrel_lo(.Lcopy_bytes) + +.p2align 2 +.Ltable_tiny: +/* + norvc is needed because the immediate is only two bits in size for c.sb, + and without it the table would have a mix of 2- and 4-byte instructions + when Zcb is available +*/ +.option push +.option norvc + sb a1, 30(a3) + sb a1, 29(a3) + sb a1, 28(a3) + sb a1, 27(a3) + sb a1, 26(a3) + sb a1, 25(a3) + sb a1, 24(a3) + sb a1, 23(a3) + sb a1, 22(a3) + sb a1, 21(a3) + sb a1, 20(a3) + sb a1, 19(a3) + sb a1, 
18(a3) + sb a1, 17(a3) + sb a1, 16(a3) + sb a1, 15(a3) + sb a1, 14(a3) + sb a1, 13(a3) + sb a1, 12(a3) + sb a1, 11(a3) + sb a1, 10(a3) + sb a1, 9(a3) + sb a1, 8(a3) +#if MV_SZ == 8 +.Ltable_misaligned: +#endif + sb a1, 7(a3) + sb a1, 6(a3) + sb a1, 5(a3) + sb a1, 4(a3) +#if MV_SZ == 4 +.Ltable_misaligned: +#endif + sb a1, 3(a3) + sb a1, 2(a3) + sb a1, 1(a3) + sb a1, 0(a3) +.option pop +.Lexit: + ret #endif - .size memset, .-memset +.size memset, .-memset diff --git a/newlib/libc/machine/riscv/rv_string.h b/newlib/libc/machine/riscv/rv_string.h index 362f66a..dc2a26d 100644 --- a/newlib/libc/machine/riscv/rv_string.h +++ b/newlib/libc/machine/riscv/rv_string.h @@ -20,20 +20,24 @@ // Determine which intrinsics to use based on XLEN and endianness #if __riscv_xlen == 64 - #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_64(x) + #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_64(x) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_64(x) + #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_64(x) + #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_clz_64(x) #else - #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_64(x) + #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_64(x) + #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_ctz_64(x) #endif #else - #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_32(x) + #define __LIBC_RISCV_ZBB_ORC_B(x) __riscv_orc_b_32(x) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_32(x) + #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_ctz_32(x) + #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_clz_32(x) #else - #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_32(x) + #define __LIBC_RISCV_ZBB_CNT_Z(x) __riscv_clz_32(x) + #define __LIBC_RISCV_ZBB_CNT_Z_REV(x) __riscv_ctz_32(x) #endif #endif #endif @@ -82,8 +86,8 @@ static __inline char *__libc_strcpy(char *dst, const char *src, bool ret_start) if (!(*dst++ = src[0])) return dst0; if (!(*dst++ = src[1])) return dst0; if (!(*dst++ = src[2])) return dst0; - if (!(*dst++ = src[3])) return dst0; #if __riscv_xlen == 64 + if (!(*dst++ = src[3])) return dst0; if (!(*dst++ = src[4])) return dst0; if (!(*dst++ = src[5])) return dst0; if (!(*dst++ = src[6])) return dst0; @@ -94,13 +98,13 @@ static __inline char *__libc_strcpy(char *dst, const char *src, bool ret_start) if (!(*dst++ = src[0])) return dst - 1; if (!(*dst++ = src[1])) return dst - 1; if (!(*dst++ = src[2])) return dst - 1; - if (!(*dst++ = src[3])) return dst - 1; #if __riscv_xlen == 64 + if (!(*dst++ = src[3])) return dst - 1; if (!(*dst++ = src[4])) return dst - 1; if (!(*dst++ = src[5])) return dst - 1; if (!(*dst++ = src[6])) return dst - 1; - dst0 = dst; #endif + dst0 = dst; } *dst = 0; @@ -121,4 +125,33 @@ static __inline char *__libc_strcpy(char *dst, const char *src, bool ret_start) } +static __inline uintxlen_t __libc_splat_byte(unsigned char c) +{ + uintxlen_t val; + +#if __riscv_zbkb + asm volatile ("packh %0, %1, %1" + : "=r" (val) + : "r" (c) + ); +#if __riscv_xlen == 64 + asm volatile ("packw %0, %0, %0" + : "+r" (val) + ); +#endif /* __riscv_xlen == 64 */ + asm volatile ("pack %0, %0, %0" + : "+r" (val) + ); +#else /* not __riscv_zbkb */ + val = (c << 8) | c; + val = (val << 16) | val; +#if __riscv_xlen == 64 + val = (val << 32) | val; +#endif /* __riscv_xlen == 64 */ +#endif /* __riscv_zbkb */ + + return val; +} + + #endif /* _RV_STRING_H */ diff --git a/newlib/libc/machine/riscv/setjmp.S b/newlib/libc/machine/riscv/setjmp.S index eef242e..f2b5053 100644 --- a/newlib/libc/machine/riscv/setjmp.S 
+++ b/newlib/libc/machine/riscv/setjmp.S @@ -16,21 +16,33 @@ .type setjmp, @function setjmp: REG_S ra, 0*SZREG(a0) - REG_S s0, 1*SZREG(a0) - REG_S s1, 2*SZREG(a0) + #if __riscv_xlen == 32 && (__riscv_zilsd) && (__riscv_misaligned_fast) + sd s0, 1*SZREG(a0) + #else + REG_S s0, 1*SZREG(a0) + REG_S s1, 2*SZREG(a0) + #endif -#ifndef __riscv_32e - REG_S s2, 3*SZREG(a0) - REG_S s3, 4*SZREG(a0) - REG_S s4, 5*SZREG(a0) - REG_S s5, 6*SZREG(a0) - REG_S s6, 7*SZREG(a0) - REG_S s7, 8*SZREG(a0) - REG_S s8, 9*SZREG(a0) - REG_S s9, 10*SZREG(a0) - REG_S s10,11*SZREG(a0) - REG_S s11,12*SZREG(a0) - REG_S sp, 13*SZREG(a0) +#ifndef __riscv_abi_rve + #if __riscv_xlen == 32 && (__riscv_zilsd) && (__riscv_misaligned_fast) + sd s2, 3*SZREG(a0) + sd s4, 5*SZREG(a0) + sd s6, 7*SZREG(a0) + sd s8, 9*SZREG(a0) + sd s10,11*SZREG(a0) + #else + REG_S s2, 3*SZREG(a0) + REG_S s3, 4*SZREG(a0) + REG_S s4, 5*SZREG(a0) + REG_S s5, 6*SZREG(a0) + REG_S s6, 7*SZREG(a0) + REG_S s7, 8*SZREG(a0) + REG_S s8, 9*SZREG(a0) + REG_S s9, 10*SZREG(a0) + REG_S s10,11*SZREG(a0) + REG_S s11,12*SZREG(a0) + #endif + REG_S sp, 13*SZREG(a0) #else REG_S sp, 3*SZREG(a0) #endif @@ -59,19 +71,31 @@ setjmp: .type longjmp, @function longjmp: REG_L ra, 0*SZREG(a0) - REG_L s0, 1*SZREG(a0) - REG_L s1, 2*SZREG(a0) -#ifndef __riscv_32e - REG_L s2, 3*SZREG(a0) - REG_L s3, 4*SZREG(a0) - REG_L s4, 5*SZREG(a0) - REG_L s5, 6*SZREG(a0) - REG_L s6, 7*SZREG(a0) - REG_L s7, 8*SZREG(a0) - REG_L s8, 9*SZREG(a0) - REG_L s9, 10*SZREG(a0) - REG_L s10,11*SZREG(a0) - REG_L s11,12*SZREG(a0) + #if __riscv_xlen == 32 && (__riscv_zilsd) && (__riscv_misaligned_fast) + ld s0, 1*SZREG(a0) + #else + REG_L s0, 1*SZREG(a0) + REG_L s1, 2*SZREG(a0) + #endif +#ifndef __riscv_abi_rve + #if __riscv_xlen == 32 && (__riscv_zilsd) && (__riscv_misaligned_fast) + ld s2, 3*SZREG(a0) + ld s4, 5*SZREG(a0) + ld s6, 7*SZREG(a0) + ld s8, 9*SZREG(a0) + ld s10,11*SZREG(a0) + #else + REG_L s2, 3*SZREG(a0) + REG_L s3, 4*SZREG(a0) + REG_L s4, 5*SZREG(a0) + REG_L s5, 6*SZREG(a0) + REG_L s6, 7*SZREG(a0) + REG_L s7, 8*SZREG(a0) + REG_L s8, 9*SZREG(a0) + REG_L s9, 10*SZREG(a0) + REG_L s10,11*SZREG(a0) + REG_L s11,12*SZREG(a0) + #endif REG_L sp, 13*SZREG(a0) #else REG_L sp, 3*SZREG(a0) diff --git a/newlib/libc/machine/riscv/strcmp.S b/newlib/libc/machine/riscv/strcmp.S index cc29b7b..0b1dfc4 100644 --- a/newlib/libc/machine/riscv/strcmp.S +++ b/newlib/libc/machine/riscv/strcmp.S @@ -16,15 +16,15 @@ .type strcmp, @function strcmp: #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) -1: +.Lcompare: lbu a2, 0(a0) lbu a3, 0(a1) - add a0, a0, 1 - add a1, a1, 1 - bne a2, a3, 2f - bnez a2, 1b + addi a0, a0, 1 + addi a1, a1, 1 + bne a2, a3, .Lreturn_diff + bnez a2, .Lcompare -2: +.Lreturn_diff: sub a0, a2, a3 ret @@ -48,12 +48,16 @@ strcmp: REG_L a2, \i*SZREG(a0) REG_L a3, \i*SZREG(a1) - and t0, a2, a5 - or t1, a2, a5 - add t0, t0, a5 - or t0, t0, t1 + #if __riscv_zbb + orc.b a4, a2 + #else + and a4, a2, a5 + or t1, a2, a5 + add a4, a4, a5 + or a4, a4, t1 + #endif - bne t0, t2, .Lnull\i + bne a4, t2, .Lnull\i .if \i+1-\n bne a2, a3, .Lmismatch .else @@ -95,73 +99,109 @@ strcmp: .Lmismatch: # words don't match, but a2 has no null byte. + #if __riscv_zbb + xor a4, a2, a3 # find differing bits + + # Check system endianness + # If little-endian, use Count Trailing Zeros (ctz) + # If big-endian, use Count Leading Zeros (clz) + # This helps identify the position of the first differing byte between a2 and a3. + + # For example, in little-endian, least significant byte comes first. 
+ # So trailing zeros help find which byte position differs. + + # In big-endian, most significant byte comes first, so leading zeros are used. + # The position will then be used to extract the differing byte. + + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + ctz a5, a4 + #else + clz a5, a4 + #endif + + andi a5, a5, -8 # find position of bit offset to the start of the byte where the first difference occurs + + + # Shift a2 and a3 right by a5 bits to bring the target byte to the LSB, and isolate the byte of interest + srl a2, a2, a5 + and a2, a2, 0xff + + srl a3, a3, a5 + and a3, a3, 0xff + + + sub a0, a2, a3 # Calculate and return the difference in the isolated bytes + ret + + #else + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #if __riscv_xlen == 64 + sll a4, a2, 48 + sll a5, a3, 48 + bne a4, a5, .Lmismatch_upper + sll a4, a2, 32 + sll a5, a3, 32 + bne a4, a5, .Lmismatch_upper + #endif + sll a4, a2, 16 + sll a5, a3, 16 + bne a4, a5, .Lmismatch_upper + + srl a4, a2, 8*SZREG-16 + srl a5, a3, 8*SZREG-16 + sub a0, a4, a5 + and a1, a0, 0xff + bnez a1, .Lfinal_upper_diff + ret -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - -#if __riscv_xlen == 64 - sll a4, a2, 48 - sll a5, a3, 48 - bne a4, a5, .Lmismatch_upper - sll a4, a2, 32 - sll a5, a3, 32 - bne a4, a5, .Lmismatch_upper -#endif - sll a4, a2, 16 - sll a5, a3, 16 - bne a4, a5, .Lmismatch_upper - - srl a4, a2, 8*SZREG-16 - srl a5, a3, 8*SZREG-16 - sub a0, a4, a5 - and a1, a0, 0xff - bnez a1, 1f - ret - -.Lmismatch_upper: - srl a4, a4, 8*SZREG-16 - srl a5, a5, 8*SZREG-16 - sub a0, a4, a5 - and a1, a0, 0xff - bnez a1, 1f - ret - -1:and a4, a4, 0xff - and a5, a5, 0xff - sub a0, a4, a5 - ret - -#else - -#if __riscv_xlen == 64 - srl a4, a2, 48 - srl a5, a3, 48 - bne a4, a5, .Lmismatch_lower - srl a4, a2, 32 - srl a5, a3, 32 - bne a4, a5, .Lmismatch_lower -#endif - srl a4, a2, 16 - srl a5, a3, 16 - bne a4, a5, .Lmismatch_lower - - srl a4, a2, 8 - srl a5, a3, 8 - bne a4, a5, 1f - and a4, a2, 0xff - and a5, a3, 0xff -1:sub a0, a4, a5 - ret - -.Lmismatch_lower: - srl a2, a4, 8 - srl a3, a5, 8 - bne a2, a3, 1f - and a2, a4, 0xff - and a3, a5, 0xff -1:sub a0, a2, a3 - ret - -#endif + .Lmismatch_upper: + srl a4, a4, 8*SZREG-16 + srl a5, a5, 8*SZREG-16 + sub a0, a4, a5 + and a1, a0, 0xff + bnez a1, .Lfinal_upper_diff + ret + + .Lfinal_upper_diff: + and a4, a4, 0xff + and a5, a5, 0xff + sub a0, a4, a5 + ret + #else + #if __riscv_xlen == 64 + srl a4, a2, 48 + srl a5, a3, 48 + bne a4, a5, .Lmismatch_lower + srl a4, a2, 32 + srl a5, a3, 32 + bne a4, a5, .Lmismatch_lower + #endif + srl a4, a2, 16 + srl a5, a3, 16 + bne a4, a5, .Lmismatch_lower + + srl a4, a2, 8 + srl a5, a3, 8 + bne a4, a5, .Lbyte_diff + and a4, a2, 0xff + and a5, a3, 0xff + + .Lbyte_diff: + sub a0, a4, a5 + ret + + .Lmismatch_lower: + srl a2, a4, 8 + srl a3, a5, 8 + bne a2, a3, .Lfinal_lower_diff + and a2, a4, 0xff + and a3, a5, 0xff + + .Lfinal_lower_diff: + sub a0, a2, a3 + ret + #endif + #endif .Lmisaligned: # misaligned @@ -169,10 +209,10 @@ strcmp: lbu a3, 0(a1) add a0, a0, 1 add a1, a1, 1 - bne a2, a3, 1f + bne a2, a3, .Lmisaligned_diff bnez a2, .Lmisaligned -1: +.Lmisaligned_diff: sub a0, a2, a3 ret diff --git a/newlib/libc/machine/riscv/strlen.c b/newlib/libc/machine/riscv/strlen.c index 9bfd2a1..8ab5ce5 100644 --- a/newlib/libc/machine/riscv/strlen.c +++ b/newlib/libc/machine/riscv/strlen.c @@ -9,6 +9,7 @@ http://www.opensource.org/licenses. 
*/ +#include <sys/types.h> #include <string.h> #include <stdint.h> #include "rv_string.h" @@ -38,7 +39,9 @@ size_t strlen(const char *str) asm volatile ("" : "+r"(ps)); /* prevent "optimization" */ str = (const char *)ps; - size_t ret = str - start, sp = sizeof (*ps); + + size_t ret = str - start; + ssize_t sp = sizeof (*ps); #if __riscv_zbb psval = ~__LIBC_RISCV_ZBB_ORC_B(psval); @@ -47,16 +50,16 @@ size_t strlen(const char *str) return ret + (psval >> 3) - sp; #else char c0 = str[0 - sp], c1 = str[1 - sp], c2 = str[2 - sp], c3 = str[3 - sp]; - if (c0 == 0) return ret + 0 - sp; - if (c1 == 0) return ret + 1 - sp; - if (c2 == 0) return ret + 2 - sp; - if (c3 == 0) return ret + 3 - sp; + if (c0 == 0) return ret + 0 - sp; + if (c1 == 0) return ret + 1 - sp; + if (c2 == 0) return ret + 2 - sp; + if (__riscv_xlen == 32 || c3 == 0) return ret + 3 - sp; #if __riscv_xlen == 64 c0 = str[4 - sp], c1 = str[5 - sp], c2 = str[6 - sp]; - if (c0 == 0) return ret + 4 - sp; - if (c1 == 0) return ret + 5 - sp; - if (c2 == 0) return ret + 6 - sp; + if (c0 == 0) return ret + 4 - sp; + if (c1 == 0) return ret + 5 - sp; + if (c2 == 0) return ret + 6 - sp; #endif return ret + 7 - sp; diff --git a/newlib/libc/machine/riscv/xlenint.h b/newlib/libc/machine/riscv/xlenint.h index 86363a8..2d444ff 100644 --- a/newlib/libc/machine/riscv/xlenint.h +++ b/newlib/libc/machine/riscv/xlenint.h @@ -11,4 +11,11 @@ typedef uint32_t uintxlen_t; # error __riscv_xlen must equal 32 or 64 #endif +/* Load/Store length */ +#if __riscv_zilsd +typedef uint64_t uintlslen_t; +#else +typedef uintxlen_t uintlslen_t; +#endif + #endif /* _XLENINT_H */ |
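
Notes on the word-at-a-time idiom used by the new memchr.c and memrchr.c above: both routines splat the target byte across an XLEN-wide register, XOR it with each loaded word, and then test the result for a zero byte. __libc_splat_byte() is added to rv_string.h in this patch; __libc_detect_null() is not part of this diff, so the definition below is the conventional bit trick and only an assumption about newlib's version. A minimal C sketch, with RV64 widths hard-coded for brevity:

#include <stddef.h>
#include <stdint.h>

/* Replicate byte c into every byte lane of a 64-bit word; this mirrors the
   shift-and-or fallback of __libc_splat_byte() in rv_string.h (the Zbkb
   path uses packh/packw/pack instead). */
static uint64_t splat_byte (unsigned char c)
{
  uint64_t v = ((uint64_t) c << 8) | c;
  v = (v << 16) | v;
  v = (v << 32) | v;
  return v;
}

/* Assumed definition of __libc_detect_null(): the classic "does this word
   contain a zero byte?" test.  Its real body is not shown in this diff. */
static int detect_null (uint64_t w)
{
  return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}

/* Core of the word loop in the new memchr: XORing with the splatted byte
   turns "find c" into "find a zero byte"; once a hit is known to be in this
   word, a short byte scan (or orc.b + ctz with Zbb) pinpoints it. */
static const unsigned char *
find_in_word (const unsigned char *p /* assumed 8-byte aligned */,
              unsigned char c)
{
  uint64_t w = *(const uint64_t *) p ^ splat_byte (c);
  if (!detect_null (w))
    return NULL;
  for (size_t i = 0; i < 8; i++)
    if (p[i] == c)
      return p + i;
  return NULL;
}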
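
When source and destination are differently aligned and __riscv_misaligned_fast is not defined, the rewritten memcpy.c first pads the destination up to an XLEN boundary and then assembles each source word from byte loads (__libc_load_xlen) so that every store is an aligned XLEN store. A standalone C sketch of that strategy, with RV64 widths hard-coded and illustrative helper names rather than newlib's:

#include <stddef.h>
#include <stdint.h>

/* Build one 64-bit little-endian word from byte loads, as the new
   __libc_load_xlen() does for a misaligned source. */
static uint64_t load_word_bytewise (const unsigned char *p)
{
  uint64_t w = 0;
  for (unsigned i = 0; i < sizeof w; i++)
    w |= (uint64_t) p[i] << (8 * i);
  return w;
}

/* Align the destination, then issue one aligned 64-bit store per eight
   source bytes; any residue is copied byte by byte. */
static void copy_misaligned (unsigned char *dst, const unsigned char *src,
                             size_t n)
{
  size_t pad = (8 - ((uintptr_t) dst & 7)) & 7;
  if (pad > n)
    pad = n;
  n -= pad;
  while (pad--)                 /* bring dst up to a word boundary */
    *dst++ = *src++;

  while (n >= 8)                /* aligned stores, byte-assembled loads */
    {
      *(uint64_t *) dst = load_word_bytewise (src);
      dst += 8;
      src += 8;
      n -= 8;
    }

  while (n--)                   /* residual tail */
    *dst++ = *src++;
}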
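
The rewritten memmove.c distinguishes three cases: destructive overlap (src <= dst < src + length) is copied backwards, non-destructive overlap is copied forwards, and disjoint buffers longer than SZREG bytes are handed to memcpy. A small usage example showing why the two overlap directions need different copy orders (it exercises the host C library's memmove, not the newlib build):

#include <stdio.h>
#include <string.h>

int main (void)
{
  /* src < dst with overlap: the tail must be written first (backwards),
     otherwise source bytes would be clobbered before being read. */
  char fwd[16] = "abcdefghijklmno";
  memmove (fwd + 2, fwd, 8);
  printf ("%s\n", fwd);    /* prints "ababcdefghklmno" */

  /* dst < src with overlap: a plain forward copy is safe. */
  char bwd[16] = "abcdefghijklmno";
  memmove (bwd, bwd + 2, 8);
  printf ("%s\n", bwd);    /* prints "cdefghijijklmno" */

  return 0;
}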
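
The new memset.S finishes (and starts) with computed jumps into tables of sb instructions, so a residual count of up to 30 bytes is stored without a loop or per-iteration branch. A scaled-down C analogue of .Ltable_tiny, using a fall-through switch in place of the jump table; the real table covers 30 trailing bytes, this sketch only 7:

#include <stddef.h>

/* Store the last n bytes (n assumed to be at most 7 here) by branching
   once on the count and falling through a run of byte stores. */
static void store_tail (unsigned char *p, unsigned char c, size_t n)
{
  switch (n)
    {
    case 7: p[6] = c; /* fall through */
    case 6: p[5] = c; /* fall through */
    case 5: p[4] = c; /* fall through */
    case 4: p[3] = c; /* fall through */
    case 3: p[2] = c; /* fall through */
    case 2: p[1] = c; /* fall through */
    case 1: p[0] = c; /* fall through */
    case 0: break;
    }
}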
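
The Zbb path added to strcmp.S locates the first differing byte of two word-sized chunks by XORing them, counting trailing zeros (leading zeros on big-endian), rounding down to a byte boundary, and subtracting the isolated bytes. The same computation in C for a little-endian RV64 target, with __builtin_ctzll standing in for the ctz instruction:

#include <stdint.h>

/* Return the signed difference of the first differing byte of a and b.
   Only valid when a != b (the assembly reaches this path only after a
   word mismatch), since __builtin_ctzll(0) is undefined. */
static int first_byte_diff (uint64_t a, uint64_t b)
{
  uint64_t x = a ^ b;                          /* differing bits */
  unsigned shift = __builtin_ctzll (x) & ~7u;  /* start of first differing byte */
  return (int) ((a >> shift) & 0xff) - (int) ((b >> shift) & 0xff);
}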