Diffstat (limited to 'newlib/libc/machine/riscv/memset.S')
-rw-r--r--  newlib/libc/machine/riscv/memset.S  349
1 file changed, 270 insertions, 79 deletions
diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S
index a717ae7..533f667 100644
--- a/newlib/libc/machine/riscv/memset.S
+++ b/newlib/libc/machine/riscv/memset.S
@@ -9,105 +9,296 @@
http://www.opensource.org/licenses.
*/
+#include <sys/asm.h>
+
+
+#define BYTE_TBL_SZ 31
+#define WORD_TBL_SZ 32
+
+#if __riscv_zilsd
+/* Size of each store, in bytes */
+#define MV_SZ 8
+
+/* Store instruction */
+#define RG_ST sd
+
+/* Zilsd and Zclsd require an even-numbered register */
+#define REG_SPLAT a4
+#else
+#define MV_SZ SZREG
+#define RG_ST REG_S
+#define REG_SPLAT a1
+#endif
+
+/*
+  Use an extended register for Zilsd and Zclsd if available,
+  since a5 is used as the odd-numbered register of the pair;
+  this eliminates an li instruction
+*/
+#if __riscv_zilsd && !__riscv_abi_rve
+#define REG_TABLE a6
+#else
+#define REG_TABLE a5
+#endif
+
+
.text
.global memset
-.type memset, @function
+.type memset, @function
+
+/* void *memset(void *s, int c, size_t n); */
+
+
memset:
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
- mv t1, a0
- beqz a2, 2f
+ mv a3, a0
+ beqz a2, .Ldone
-1:
- sb a1, 0(t1)
- add a2, a2, -1
- add t1, t1, 1
- bnez a2, 1b
+.Lset:
+ sb a1, 0(a3)
+ addi a2, a2, -1
+ addi a3, a3, 1
+ bnez a2, .Lset
-2:
+.Ldone:
ret
#else
- li t1, 15
- move a4, a0
- bleu a2, t1, .Ltiny
- and a5, a4, 15
- bnez a5, .Lmisaligned
+ li REG_TABLE, BYTE_TBL_SZ
+ mv a3, a0
+
+  /* If there aren't many bytes, set them individually to reduce overhead */
+ bleu a2, REG_TABLE, .Lcopy_bytes
+
+ and a4, a3, MV_SZ - 1
+ beqz a4, .Lword_check
+
+ /*
+ Jump into the byte table depending on the number of bytes that need to be
+ written
+ */
+1:
+ auipc t0, %pcrel_hi(.Ltable_misaligned)
+
+ /*
+ Instructions in the tables are forced to be four bytes, so scale count
+ by 4
+ */
+#if __riscv_zba
+ sh2add t0, a4, t0
+#else
+ sll t1, a4, 2
+ add t0, t0, t1
+#endif
-.Laligned:
- bnez a1, .Lwordify
+ /* Save the return address because we aren't exiting the function yet */
+ mv t1, ra
+ jalr t0, %pcrel_lo(1b)
-.Lwordified:
- and a3, a2, ~15
- and a2, a2, 15
- add a3, a3, a4
+ /* Update pointer and count by what was written */
+ mv ra, t1
+ add a4, a4, -MV_SZ
+ add a2, a2, a4
+ sub a3, a3, a4
+  /* Access is now aligned; check that whole words can be stored */
+ bleu a2, REG_TABLE, .Lcopy_bytes
+
+.Lword_check:
+  /* No need to splat the special case of zero */
+ bnez a1, .Lsplat_byte
+#if __riscv_zilsd
+ mv REG_SPLAT, a1
+#endif
+ j .Lcopy_words_init
+
+/*
+ Align labels to four bytes after unconditional jumps to avoid any
+ penalties when jumping to 32-bit instructions that aren't 4-byte
+ aligned
+*/
+.p2align 2
+.Lsplat_byte:
+#if __riscv_zbkb
+ packh REG_SPLAT, a1, a1
#if __riscv_xlen == 64
-1:sd a1, 0(a4)
- sd a1, 8(a4)
+ packw REG_SPLAT, REG_SPLAT, REG_SPLAT
+#endif
+ pack REG_SPLAT, REG_SPLAT, REG_SPLAT
#else
-1:sw a1, 0(a4)
- sw a1, 4(a4)
- sw a1, 8(a4)
- sw a1, 12(a4)
+ and a1, a1, 0xFF
+ sll t0, a1, 8
+ or a1, a1, t0
+ sll t0, a1, 16
+ or REG_SPLAT, a1, t0
+#if __riscv_xlen == 64
+ sll t0, REG_SPLAT, 32
+ or REG_SPLAT, REG_SPLAT, t0
+#endif
#endif
- add a4, a4, 16
- bltu a4, a3, 1b
- bnez a2, .Ltiny
- ret
+.Lcopy_words_init:
+#if __riscv_zilsd
+ /* Odd register of even-odd pair */
+ mv a5, REG_SPLAT
+#endif
+
+ /* Calculate end address */
+ and t0, a2, ~(MV_SZ - 1)
+ add t1, a3, t0
+
+  /*
+    The idea behind the table of word stores: first calculate the
+    remainder, the part of the count that doesn't fill an entire
+    table run, and write it by entering the table partway through.
+    After that, full runs of the entire table are performed.
+  */
+ and t0, t0, (WORD_TBL_SZ - 1) * MV_SZ
+
+ /* Skip if there's no remainder */
+ beqz t0, .Ltable_bigly
+ neg t0, t0
+ add t0, t0, WORD_TBL_SZ * MV_SZ
+
+ /* Adjust start address with offset */
+ sub a3, a3, t0
+
+1:
+ auipc t2, %pcrel_hi(.Ltable_bigly)
+
+#if MV_SZ == 8
+  /*
+    If eight bytes are written with each store, the table offset
+    must be halved
+  */
+ srl t0, t0, 1
+#endif
+
+ add t2, t2, t0
+ jr t2, %pcrel_lo(1b)
-.Ltiny:
- sub a3, t1, a2
- sll a3, a3, 2
-1:auipc t0, %pcrel_hi(.Ltable)
- add a3, a3, t0
+.p2align 2
+.Ltable_bigly:
+/*
+ Force the instructions to be four bytes to avoid an extra instruction
+ that would be needed to halve the offset for sw
+*/
.option push
.option norvc
-.Ltable_misaligned:
- jr a3, %pcrel_lo(1b)
-.Ltable:
- sb a1,14(a4)
- sb a1,13(a4)
- sb a1,12(a4)
- sb a1,11(a4)
- sb a1,10(a4)
- sb a1, 9(a4)
- sb a1, 8(a4)
- sb a1, 7(a4)
- sb a1, 6(a4)
- sb a1, 5(a4)
- sb a1, 4(a4)
- sb a1, 3(a4)
- sb a1, 2(a4)
- sb a1, 1(a4)
- sb a1, 0(a4)
+ RG_ST REG_SPLAT, MV_SZ*0(a3)
+ RG_ST REG_SPLAT, MV_SZ*1(a3)
+ RG_ST REG_SPLAT, MV_SZ*2(a3)
+ RG_ST REG_SPLAT, MV_SZ*3(a3)
+ RG_ST REG_SPLAT, MV_SZ*4(a3)
+ RG_ST REG_SPLAT, MV_SZ*5(a3)
+ RG_ST REG_SPLAT, MV_SZ*6(a3)
+ RG_ST REG_SPLAT, MV_SZ*7(a3)
+ RG_ST REG_SPLAT, MV_SZ*8(a3)
+ RG_ST REG_SPLAT, MV_SZ*9(a3)
+ RG_ST REG_SPLAT, MV_SZ*10(a3)
+ RG_ST REG_SPLAT, MV_SZ*11(a3)
+ RG_ST REG_SPLAT, MV_SZ*12(a3)
+ RG_ST REG_SPLAT, MV_SZ*13(a3)
+ RG_ST REG_SPLAT, MV_SZ*14(a3)
+ RG_ST REG_SPLAT, MV_SZ*15(a3)
+ RG_ST REG_SPLAT, MV_SZ*16(a3)
+ RG_ST REG_SPLAT, MV_SZ*17(a3)
+ RG_ST REG_SPLAT, MV_SZ*18(a3)
+ RG_ST REG_SPLAT, MV_SZ*19(a3)
+ RG_ST REG_SPLAT, MV_SZ*20(a3)
+ RG_ST REG_SPLAT, MV_SZ*21(a3)
+ RG_ST REG_SPLAT, MV_SZ*22(a3)
+ RG_ST REG_SPLAT, MV_SZ*23(a3)
+ RG_ST REG_SPLAT, MV_SZ*24(a3)
+ RG_ST REG_SPLAT, MV_SZ*25(a3)
+ RG_ST REG_SPLAT, MV_SZ*26(a3)
+ RG_ST REG_SPLAT, MV_SZ*27(a3)
+ RG_ST REG_SPLAT, MV_SZ*28(a3)
+ RG_ST REG_SPLAT, MV_SZ*29(a3)
+ RG_ST REG_SPLAT, MV_SZ*30(a3)
+ RG_ST REG_SPLAT, MV_SZ*31(a3)
.option pop
- ret
-.Lwordify:
- and a1, a1, 0xFF
- sll a3, a1, 8
- or a1, a1, a3
- sll a3, a1, 16
- or a1, a1, a3
-#if __riscv_xlen == 64
- sll a3, a1, 32
- or a1, a1, a3
+  /* Update the pointer and run the table again if data remains */
+ add a3, a3, MV_SZ * WORD_TBL_SZ
+ bltu a3, t1, .Ltable_bigly
+
+  /* Set any remaining bytes */
+ and a2, a2, MV_SZ - 1
+ beqz a2, .Lexit
+
+#if __riscv_zilsd && __riscv_abi_rve
+ /* Restore table size if necessary */
+ li REG_TABLE, BYTE_TBL_SZ
#endif
- j .Lwordified
-
-.Lmisaligned:
- sll a3, a5, 2
-1:auipc t0, %pcrel_hi(.Ltable_misaligned)
- add a3, a3, t0
- mv t0, ra
- jalr a3, %pcrel_lo(1b)
- mv ra, t0
-
- add a5, a5, -16
- sub a4, a4, a5
- add a2, a2, a5
- bleu a2, t1, .Ltiny
- j .Laligned
+
+.Lcopy_bytes:
+ auipc t0, %pcrel_hi(.Ltable_tiny)
+
+ sub a2, REG_TABLE, a2
+
+ /*
+ Instructions in the tables are forced to be four bytes, so scale count
+ by 4
+ */
+#if __riscv_zba
+ sh2add t0, a2, t0
+#else
+ sll a2, a2, 2
+ add t0, t0, a2
+#endif
+
+ /* Don't save the return address because we're exiting after the jump */
+ jr t0, %pcrel_lo(.Lcopy_bytes)
+
+.p2align 2
+.Ltable_tiny:
+/*
+ norvc is needed because the immediate is only two bits in size for c.sb,
+ and without it the table would have a mix of 2- and 4-byte instructions
+ when Zcb is available
+*/
+.option push
+.option norvc
+ sb a1, 30(a3)
+ sb a1, 29(a3)
+ sb a1, 28(a3)
+ sb a1, 27(a3)
+ sb a1, 26(a3)
+ sb a1, 25(a3)
+ sb a1, 24(a3)
+ sb a1, 23(a3)
+ sb a1, 22(a3)
+ sb a1, 21(a3)
+ sb a1, 20(a3)
+ sb a1, 19(a3)
+ sb a1, 18(a3)
+ sb a1, 17(a3)
+ sb a1, 16(a3)
+ sb a1, 15(a3)
+ sb a1, 14(a3)
+ sb a1, 13(a3)
+ sb a1, 12(a3)
+ sb a1, 11(a3)
+ sb a1, 10(a3)
+ sb a1, 9(a3)
+ sb a1, 8(a3)
+#if MV_SZ == 8
+.Ltable_misaligned:
+#endif
+ sb a1, 7(a3)
+ sb a1, 6(a3)
+ sb a1, 5(a3)
+ sb a1, 4(a3)
+#if MV_SZ == 4
+.Ltable_misaligned:
+#endif
+ sb a1, 3(a3)
+ sb a1, 2(a3)
+ sb a1, 1(a3)
+ sb a1, 0(a3)
+.option pop
+.Lexit:
+ ret
#endif
- .size memset, .-memset
+.size memset, .-memset
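
A few notes on the techniques above, with C sketches; every function
name below is illustrative only and is not part of the commit.

The misaligned-head fix-up: after the jalr into .Ltable_misaligned,
the table has stored MV_SZ - a4 bytes, and the next three
instructions fold that back into the pointer and count. The same
arithmetic as a minimal C sketch:

    /* Fold the head bytes written by .Ltable_misaligned back into the
       pointer and count; 'mis' is the misalignment s & (MV_SZ - 1). */
    static unsigned char *
    skip_head (unsigned char *p, unsigned long *n, long mis, long mv_sz)
    {
      long delta = mis - mv_sz; /* add a4, a4, -MV_SZ: -(bytes written) */
      *n += delta;              /* add a2, a2, a4 */
      return p - delta;         /* sub a3, a3, a4 */
    }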
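
The .Lsplat_byte fallback (when Zbkb is unavailable) broadcasts the
fill byte with a shift/OR ladder. A C sketch of the same computation,
with unsigned long standing in for an XLEN-wide register:

    #include <limits.h>

    /* Broadcast the low byte of c into every byte of a word,
       mirroring the shift/OR ladder in .Lsplat_byte. */
    static unsigned long
    splat_byte (int c)
    {
      unsigned long w = (unsigned char) c; /* and a1, a1, 0xFF */
      w |= w << 8;
      w |= w << 16;
    #if ULONG_MAX > 0xFFFFFFFFUL           /* __riscv_xlen == 64 (LP64) */
      w |= w << 32;
    #endif
      return w;
    }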
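
The word-store loop enters .Ltable_bigly partway through on its first
pass, after backing the pointer up, so that the partial run ends
exactly where full-table runs can take over; every later pass executes
all 32 stores. A C sketch of that driver, again assuming an XLEN-wide
unsigned long:

    #define WORD_TBL_SZ 32

    /* Fill nwords word-sized slots at p with w: a partial first run
       of nwords % WORD_TBL_SZ stores, then full unrolled table runs. */
    static void
    store_words (unsigned long *p, unsigned long w, unsigned long nwords)
    {
      unsigned long *end = p + nwords;
      unsigned long rem = nwords % WORD_TBL_SZ;

      /* The assembly jumps WORD_TBL_SZ - rem entries into the table so
         only the last rem stores land in bounds; C can just store them. */
      while (rem--)
        *p++ = w;

      /* Full runs of the entire table. */
      while (p < end)
        {
          for (int i = 0; i < WORD_TBL_SZ; i++)
            p[i] = w;
          p += WORD_TBL_SZ;
        }
    }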
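
Finally, .Ltable_tiny is a computed-goto tail: the jr lands
BYTE_TBL_SZ - n entries into the column of byte stores, so exactly n
of them execute. The closest C analogue is switch fallthrough, shown
here truncated to four cases instead of the table's 31:

    /* Store exactly n bytes of c at p (n <= 4 in this shortened
       sketch); case fallthrough plays the role of the jump table. */
    static void
    set_tail (unsigned char *p, int c, unsigned long n)
    {
      switch (n)
        {
        case 4: p[3] = (unsigned char) c; /* fall through */
        case 3: p[2] = (unsigned char) c; /* fall through */
        case 2: p[1] = (unsigned char) c; /* fall through */
        case 1: p[0] = (unsigned char) c; /* fall through */
        case 0: break;
        }
    }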