diff options
Diffstat (limited to 'newlib/libc/machine/riscv/memset.S')
-rw-r--r-- | newlib/libc/machine/riscv/memset.S | 349 |
1 files changed, 270 insertions, 79 deletions
/*
   Tail of the header comment that precedes this excerpt (its opening lies
   outside the visible text): http://www.opensource.org/licenses.
*/

#include <sys/asm.h>

/* Number of entries in the byte-store table (.Ltable_tiny) */
#define BYTE_TBL_SZ 31

/* Number of entries in the word-store table (.Ltable_bigly) */
#define WORD_TBL_SZ 32

#if __riscv_zilsd
/* Move size */
#define MV_SZ 8

/* Store instruction */
#define RG_ST sd

/* Zilsd and Zclsd require an even numbered register */
#define REG_SPLAT a4
#else
#define MV_SZ SZREG
#define RG_ST REG_S
#define REG_SPLAT a1
#endif

/*
   Use an extended register for Zilsd and Zclsd if available
   since a5 is used for the odd numbered register, in order
   to eliminate an li instruction
*/
#if __riscv_zilsd && !__riscv_abi_rve
#define REG_TABLE a6
#else
#define REG_TABLE a5
#endif


.text
.global memset
.type memset, @function

/*
   void *memset(void *s, int c, size_t n);

   In:       a0 = s, a1 = c (only the low byte is ever stored), a2 = n
   Out:      a0 = s (a0 is never written; a3 is the working pointer)
   Scratch:  a1-a4, REG_TABLE (a5 or a6), a5 as the odd half of the Zilsd
             pair, t0-t2.  ra is parked in t1 around the one internal call
             into the byte table and restored afterwards.

   Fast path outline:
     1. n <= BYTE_TBL_SZ: jump into .Ltable_tiny so that exactly n byte
        stores execute, then return.
     2. Otherwise store single bytes up to the next MV_SZ boundary, splat c
        across a register (pair), and run .Ltable_bigly: first a partial
        pass for the word count modulo WORD_TBL_SZ, then whole passes.
     3. Any tail shorter than MV_SZ goes through .Ltable_tiny again.

   Every table entry is forced to a 4-byte encoding (.option norvc) so that
   a jump target can be computed as "table + 4 * entries_to_skip".

   Computed jumps use t2, never t0: the ISA's return-address-stack hints
   treat x1 (ra) and x5 (t0) as link registers, so "jr t0" would be hinted
   as a return and "jalr ra <- t0" as a pop-then-push, unbalancing the
   predictor on cores that honour the hints.
*/

memset:
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Minimal-size variant: one byte per iteration; a0 stays intact */
  mv a3, a0
  beqz a2, .Ldone

.Lset:
  sb a1, 0(a3)
  addi a2, a2, -1
  addi a3, a3, 1
  bnez a2, .Lset

.Ldone:
  ret

#else
  li REG_TABLE, BYTE_TBL_SZ
  mv a3, a0

  /* If there aren't many bytes, store them individually to reduce overhead */
  bleu a2, REG_TABLE, .Lcopy_bytes

  /* a4 = misalignment of the destination within an MV_SZ unit */
  and a4, a3, MV_SZ - 1
  beqz a4, .Lword_check

  /*
     Jump into the byte table depending on the number of bytes that need to
     be written.  Entering .Ltable_misaligned at index a4 executes the
     stores for offsets MV_SZ-1-a4 .. 0, i.e. MV_SZ - a4 bytes.
  */
1:
  auipc t2, %pcrel_hi(.Ltable_misaligned)

  /*
     Instructions in the tables are forced to be four bytes, so scale count
     by 4
  */
#if __riscv_zba
  sh2add t2, a4, t2
#else
  sll t1, a4, 2
  add t2, t2, t1
#endif

  /* Save the return address because we aren't exiting the function yet */
  mv t1, ra
  jalr t2, %pcrel_lo(1b)

  /*
     The table's trailing ret brought us back here.  Restore ra, then update
     pointer and count by what was written: a4 becomes -(bytes written).
  */
  mv ra, t1
  add a4, a4, -MV_SZ
  add a2, a2, a4
  sub a3, a3, a4

  /* Access is now aligned. Check we can still store whole words. */
  bleu a2, REG_TABLE, .Lcopy_bytes

.Lword_check:
  /* Don't need to splat special case of zero */
  bnez a1, .Lsplat_byte
#if __riscv_zilsd
  mv REG_SPLAT, a1
#endif
  j .Lcopy_words_init

/*
   Align labels to four bytes after unconditional jumps to avoid any
   penalties when jumping to 32-bit instructions that aren't 4-byte
   aligned
*/
.p2align 2
.Lsplat_byte:
  /* Replicate the low byte of a1 into every byte of REG_SPLAT */
#if __riscv_zbkb
  packh REG_SPLAT, a1, a1
#if __riscv_xlen == 64
  packw REG_SPLAT, REG_SPLAT, REG_SPLAT
#endif
  pack REG_SPLAT, REG_SPLAT, REG_SPLAT
#else
  and a1, a1, 0xFF
  sll t0, a1, 8
  or a1, a1, t0
  sll t0, a1, 16
  or REG_SPLAT, a1, t0
#if __riscv_xlen == 64
  sll t0, REG_SPLAT, 32
  or REG_SPLAT, REG_SPLAT, t0
#endif
#endif

.Lcopy_words_init:
#if __riscv_zilsd
  /* Odd register of even-odd pair */
  mv a5, REG_SPLAT
#endif

  /* Calculate end address: t1 = a3 + (n rounded down to MV_SZ) */
  and t0, a2, ~(MV_SZ - 1)
  add t1, a3, t0

  /*
     The idea behind the table of word stores is that first we calculate any
     remainder of bytes that need to be written by the table that aren't an
     entire table length. That's written first. After that, runs of the
     entire table are performed.
  */
  and t0, t0, (WORD_TBL_SZ - 1) * MV_SZ

  /* Skip if there's no remainder */
  beqz t0, .Ltable_bigly

  /* t0 = bytes covered by the table entries that must be skipped */
  neg t0, t0
  add t0, t0, WORD_TBL_SZ * MV_SZ

  /*
     Adjust start address with offset so that the first executed entry
     lands on the original a3
  */
  sub a3, a3, t0

1:
  auipc t2, %pcrel_hi(.Ltable_bigly)

#if MV_SZ == 8
  /*
     If eight bytes are being written with each store, we need to divide
     the table offset in half
  */
  srl t0, t0, 1
#endif

  add t2, t2, t0
  jr t2, %pcrel_lo(1b)

.p2align 2
.Ltable_bigly:
/*
   Force the instructions to be four bytes to avoid an extra instruction
   that would be needed to halve the offset for sw
*/
.option push
.option norvc
  RG_ST REG_SPLAT, MV_SZ*0(a3)
  RG_ST REG_SPLAT, MV_SZ*1(a3)
  RG_ST REG_SPLAT, MV_SZ*2(a3)
  RG_ST REG_SPLAT, MV_SZ*3(a3)
  RG_ST REG_SPLAT, MV_SZ*4(a3)
  RG_ST REG_SPLAT, MV_SZ*5(a3)
  RG_ST REG_SPLAT, MV_SZ*6(a3)
  RG_ST REG_SPLAT, MV_SZ*7(a3)
  RG_ST REG_SPLAT, MV_SZ*8(a3)
  RG_ST REG_SPLAT, MV_SZ*9(a3)
  RG_ST REG_SPLAT, MV_SZ*10(a3)
  RG_ST REG_SPLAT, MV_SZ*11(a3)
  RG_ST REG_SPLAT, MV_SZ*12(a3)
  RG_ST REG_SPLAT, MV_SZ*13(a3)
  RG_ST REG_SPLAT, MV_SZ*14(a3)
  RG_ST REG_SPLAT, MV_SZ*15(a3)
  RG_ST REG_SPLAT, MV_SZ*16(a3)
  RG_ST REG_SPLAT, MV_SZ*17(a3)
  RG_ST REG_SPLAT, MV_SZ*18(a3)
  RG_ST REG_SPLAT, MV_SZ*19(a3)
  RG_ST REG_SPLAT, MV_SZ*20(a3)
  RG_ST REG_SPLAT, MV_SZ*21(a3)
  RG_ST REG_SPLAT, MV_SZ*22(a3)
  RG_ST REG_SPLAT, MV_SZ*23(a3)
  RG_ST REG_SPLAT, MV_SZ*24(a3)
  RG_ST REG_SPLAT, MV_SZ*25(a3)
  RG_ST REG_SPLAT, MV_SZ*26(a3)
  RG_ST REG_SPLAT, MV_SZ*27(a3)
  RG_ST REG_SPLAT, MV_SZ*28(a3)
  RG_ST REG_SPLAT, MV_SZ*29(a3)
  RG_ST REG_SPLAT, MV_SZ*30(a3)
  RG_ST REG_SPLAT, MV_SZ*31(a3)
.option pop

  /* Update the pointer and run another full pass while short of the end */
  add a3, a3, MV_SZ * WORD_TBL_SZ
  bltu a3, t1, .Ltable_bigly

  /* Store any remaining bytes (fewer than MV_SZ) */
  and a2, a2, MV_SZ - 1
  beqz a2, .Lexit

#if __riscv_zilsd && __riscv_abi_rve
  /* Restore table size if necessary: a5 was reused as the odd pair half */
  li REG_TABLE, BYTE_TBL_SZ
#endif

.Lcopy_bytes:
  auipc t2, %pcrel_hi(.Ltable_tiny)

  /* a2 = number of table entries to skip = BYTE_TBL_SZ - n */
  sub a2, REG_TABLE, a2

  /*
     Instructions in the tables are forced to be four bytes, so scale count
     by 4
  */
#if __riscv_zba
  sh2add t2, a2, t2
#else
  sll a2, a2, 2
  add t2, t2, a2
#endif

  /* Don't save the return address because we're exiting after the jump */
  jr t2, %pcrel_lo(.Lcopy_bytes)

.p2align 2
.Ltable_tiny:
/*
   norvc is needed because the immediate is only two bits in size for c.sb,
   and without it the table would have a mix of 2- and 4-byte instructions
   when Zcb is available
*/
.option push
.option norvc
  sb a1, 30(a3)
  sb a1, 29(a3)
  sb a1, 28(a3)
  sb a1, 27(a3)
  sb a1, 26(a3)
  sb a1, 25(a3)
  sb a1, 24(a3)
  sb a1, 23(a3)
  sb a1, 22(a3)
  sb a1, 21(a3)
  sb a1, 20(a3)
  sb a1, 19(a3)
  sb a1, 18(a3)
  sb a1, 17(a3)
  sb a1, 16(a3)
  sb a1, 15(a3)
  sb a1, 14(a3)
  sb a1, 13(a3)
  sb a1, 12(a3)
  sb a1, 11(a3)
  sb a1, 10(a3)
  sb a1, 9(a3)
  sb a1, 8(a3)
#if MV_SZ == 8
/* Entry point for the alignment fix-up when stores are 8 bytes wide */
.Ltable_misaligned:
#endif
  sb a1, 7(a3)
  sb a1, 6(a3)
  sb a1, 5(a3)
  sb a1, 4(a3)
#if MV_SZ == 4
/* Entry point for the alignment fix-up when stores are 4 bytes wide */
.Ltable_misaligned:
#endif
  sb a1, 3(a3)
  sb a1, 2(a3)
  sb a1, 1(a3)
  sb a1, 0(a3)
.option pop

/* Also the landing point of a table jump that skips every entry (n == 0) */
.Lexit:
  ret
#endif

.size memset, .-memset