1 files changed, 270 insertions, 79 deletions
diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S
index a717ae7..533f667 100644
--- a/newlib/libc/machine/riscv/memset.S
+++ b/newlib/libc/machine/riscv/memset.S
@@ -9,105 +9,296 @@
    http://www.opensource.org/licenses.
 */
 
+#include <sys/asm.h>
+
+
+#define BYTE_TBL_SZ   31
+#define WORD_TBL_SZ   32
+
+#if __riscv_zilsd
+/* Move size */
+#define MV_SZ         8
+
+/* Store instruction */
+#define RG_ST         sd
+
+/* Zilsd and Zclsd require an even numbered register */
+#define REG_SPLAT     a4
+#else
+#define MV_SZ         SZREG
+#define RG_ST         REG_S
+#define REG_SPLAT     a1
+#endif
+
+/*
+  Use an extended register for Zilsd and Zclsd if available
+  since a5 is used for the odd numbered register, in order
+  to eliminate an li instruction
+*/
+#if __riscv_zilsd && !__riscv_abi_rve
+#define REG_TABLE     a6
+#else
+#define REG_TABLE     a5
+#endif
+
+
 .text
 .global memset
-.type	memset, @function
+.type  memset, @function
+
+/* void *memset(void *s, int c, size_t n); */
+
+
 memset:
 #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
-  mv t1, a0
-  beqz a2, 2f
+  mv     a3, a0
+  beqz   a2, .Ldone
 
-1:
-  sb a1, 0(t1)
-  add   a2, a2, -1
-  add   t1, t1, 1
-  bnez a2, 1b
+.Lset:
+  sb     a1, 0(a3)
+  addi   a2, a2, -1
+  addi   a3, a3, 1
+  bnez   a2, .Lset
 
-2:
+.Ldone:
   ret
 
 #else
-  li t1, 15
-  move a4, a0
-  bleu a2, t1, .Ltiny
-  and a5, a4, 15
-  bnez a5, .Lmisaligned
+  li     REG_TABLE, BYTE_TBL_SZ
+  mv     a3, a0
+
+  /* If there aren't many bytes, copy them individually to reduce overhead */
+  bleu   a2, REG_TABLE, .Lcopy_bytes
+
+  and    a4, a3, MV_SZ - 1
+  beqz   a4, .Lword_check
+
+  /*
+    Jump into the byte table depending on the number of bytes that need to be
+    written
+  */
+1:
+  auipc  t0, %pcrel_hi(.Ltable_misaligned)
+
+  /*
+    Instructions in the tables are forced to be four bytes, so scale count
+    by 4
+  */
+#if __riscv_zba
+  sh2add t0, a4, t0
+#else
+  sll    t1, a4, 2
+  add    t0, t0, t1
+#endif
 
-.Laligned:
-  bnez a1, .Lwordify
+  /* Save the return address because we aren't exiting the function yet */
+  mv     t1, ra
+  jalr   t0, %pcrel_lo(1b)
 
-.Lwordified:
-  and a3, a2, ~15
-  and a2, a2, 15
-  add a3, a3, a4
+  /* Update pointer and count by what was written */
+  mv     ra, t1
+  add    a4, a4, -MV_SZ
+  add    a2, a2, a4
+  sub    a3, a3, a4
 
+  /* Access is now aligned. Check we can copy words. */
+  bleu   a2, REG_TABLE, .Lcopy_bytes
+
+.Lword_check:
+  /* Don't need to splat special case of zero */
+  bnez   a1, .Lsplat_byte
+#if __riscv_zilsd
+  mv     REG_SPLAT, a1
+#endif
+  j      .Lcopy_words_init
+
+/*
+  Align labels to four bytes after unconditional jumps to avoid any
+  penalties when jumping to 32-bit instructions that aren't 4-byte
+  aligned
+*/
+.p2align 2
+.Lsplat_byte:
+#if __riscv_zbkb
+  packh  REG_SPLAT, a1, a1
 #if __riscv_xlen == 64
-1:sd a1, 0(a4)
-  sd a1, 8(a4)
+  packw  REG_SPLAT, REG_SPLAT, REG_SPLAT
+#endif
+  pack   REG_SPLAT, REG_SPLAT, REG_SPLAT
 #else
-1:sw a1, 0(a4)
-  sw a1, 4(a4)
-  sw a1, 8(a4)
-  sw a1, 12(a4)
+  and    a1, a1, 0xFF
+  sll    t0, a1, 8
+  or     a1, a1, t0
+  sll    t0, a1, 16
+  or     REG_SPLAT, a1, t0
+#if __riscv_xlen == 64
+  sll    t0, REG_SPLAT, 32
+  or     REG_SPLAT, REG_SPLAT, t0
+#endif
 #endif
-  add a4, a4, 16
-  bltu a4, a3, 1b
 
-  bnez a2, .Ltiny
-  ret
+.Lcopy_words_init:
+#if __riscv_zilsd
+  /* Odd register of even-odd pair */
+  mv     a5, REG_SPLAT
+#endif
+
+  /* Calculate end address */
+  and    t0, a2, ~(MV_SZ - 1)
+  add    t1, a3, t0
+
+  /*
+    The idea behind the table of word copies is that first we calculate any
+    remainder of bytes that need to be copied by the table that aren't an
+    entire table length. That's copied first. After that, runs of the entire
+    table are performed.
+  */
+  and    t0, t0, (WORD_TBL_SZ - 1) * MV_SZ
+
+  /* Skip if there's no remainder */
+  beqz   t0, .Ltable_bigly
+  neg    t0, t0
+  add    t0, t0, WORD_TBL_SZ * MV_SZ
+
+  /* Adjust start address with offset */
+  sub    a3, a3, t0
+
+1:
+  auipc  t2, %pcrel_hi(.Ltable_bigly)
+
+#if MV_SZ == 8
+  /*
+    If eight bytes are being copied with each store, we need to divide
+    the table offset in half
+  */
+  srl    t0, t0, 1
+#endif
+
+  add    t2, t2, t0
+  jr     t2, %pcrel_lo(1b)
 
-.Ltiny:
-  sub a3, t1, a2
-  sll a3, a3, 2
-1:auipc t0, %pcrel_hi(.Ltable)
-  add a3, a3, t0
+.p2align 2
+.Ltable_bigly:
+/*
+  Force the instructions to be four bytes to avoid an extra instruction
+  that would be needed to halve the offset for sw
+*/
 .option push
 .option norvc
-.Ltable_misaligned:
-  jr a3, %pcrel_lo(1b)
-.Ltable:
-  sb a1,14(a4)
-  sb a1,13(a4)
-  sb a1,12(a4)
-  sb a1,11(a4)
-  sb a1,10(a4)
-  sb a1, 9(a4)
-  sb a1, 8(a4)
-  sb a1, 7(a4)
-  sb a1, 6(a4)
-  sb a1, 5(a4)
-  sb a1, 4(a4)
-  sb a1, 3(a4)
-  sb a1, 2(a4)
-  sb a1, 1(a4)
-  sb a1, 0(a4)
+  RG_ST  REG_SPLAT, MV_SZ*0(a3)
+  RG_ST  REG_SPLAT, MV_SZ*1(a3)
+  RG_ST  REG_SPLAT, MV_SZ*2(a3)
+  RG_ST  REG_SPLAT, MV_SZ*3(a3)
+  RG_ST  REG_SPLAT, MV_SZ*4(a3)
+  RG_ST  REG_SPLAT, MV_SZ*5(a3)
+  RG_ST  REG_SPLAT, MV_SZ*6(a3)
+  RG_ST  REG_SPLAT, MV_SZ*7(a3)
+  RG_ST  REG_SPLAT, MV_SZ*8(a3)
+  RG_ST  REG_SPLAT, MV_SZ*9(a3)
+  RG_ST  REG_SPLAT, MV_SZ*10(a3)
+  RG_ST  REG_SPLAT, MV_SZ*11(a3)
+  RG_ST  REG_SPLAT, MV_SZ*12(a3)
+  RG_ST  REG_SPLAT, MV_SZ*13(a3)
+  RG_ST  REG_SPLAT, MV_SZ*14(a3)
+  RG_ST  REG_SPLAT, MV_SZ*15(a3)
+  RG_ST  REG_SPLAT, MV_SZ*16(a3)
+  RG_ST  REG_SPLAT, MV_SZ*17(a3)
+  RG_ST  REG_SPLAT, MV_SZ*18(a3)
+  RG_ST  REG_SPLAT, MV_SZ*19(a3)
+  RG_ST  REG_SPLAT, MV_SZ*20(a3)
+  RG_ST  REG_SPLAT, MV_SZ*21(a3)
+  RG_ST  REG_SPLAT, MV_SZ*22(a3)
+  RG_ST  REG_SPLAT, MV_SZ*23(a3)
+  RG_ST  REG_SPLAT, MV_SZ*24(a3)
+  RG_ST  REG_SPLAT, MV_SZ*25(a3)
+  RG_ST  REG_SPLAT, MV_SZ*26(a3)
+  RG_ST  REG_SPLAT, MV_SZ*27(a3)
+  RG_ST  REG_SPLAT, MV_SZ*28(a3)
+  RG_ST  REG_SPLAT, MV_SZ*29(a3)
+  RG_ST  REG_SPLAT, MV_SZ*30(a3)
+  RG_ST  REG_SPLAT, MV_SZ*31(a3)
 .option pop
-  ret
 
-.Lwordify:
-  and a1, a1, 0xFF
-  sll a3, a1, 8
-  or  a1, a1, a3
-  sll a3, a1, 16
-  or  a1, a1, a3
-#if __riscv_xlen == 64
-  sll a3, a1, 32
-  or  a1, a1, a3
+  /* Update the pointer and copy data if needed */
+  add    a3, a3, MV_SZ * WORD_TBL_SZ
+  bltu   a3, t1, .Ltable_bigly
+
+  /* Copy any remaining bytes */
+  and    a2, a2, MV_SZ - 1
+  beqz   a2, .Lexit
+
+#if __riscv_zilsd && __riscv_abi_rve
+  /* Restore table size if necessary */
+  li     REG_TABLE, BYTE_TBL_SZ
 #endif
-  j .Lwordified
-
-.Lmisaligned:
-  sll a3, a5, 2
-1:auipc t0, %pcrel_hi(.Ltable_misaligned)
-  add a3, a3, t0
-  mv t0, ra
-  jalr a3, %pcrel_lo(1b)
-  mv ra, t0
-
-  add a5, a5, -16
-  sub a4, a4, a5
-  add a2, a2, a5
-  bleu a2, t1, .Ltiny
-  j .Laligned
+
+.Lcopy_bytes:
+  auipc  t0, %pcrel_hi(.Ltable_tiny)
+
+  sub    a2, REG_TABLE, a2
+
+  /*
+    Instructions in the tables are forced to be four bytes, so scale count
+    by 4
+  */
+#if __riscv_zba
+  sh2add t0, a2, t0
+#else
+  sll    a2, a2, 2
+  add    t0, t0, a2
+#endif
+
+  /* Don't save the return address because we're exiting after the jump */
+  jr     t0, %pcrel_lo(.Lcopy_bytes)
+
+.p2align 2
+.Ltable_tiny:
+/*
+  norvc is needed because the immediate is only two bits in size for c.sb,
+  and without it the table would have a mix of 2- and 4-byte instructions
+  when Zcb is available
+*/
+.option push
+.option norvc
+  sb     a1, 30(a3)
+  sb     a1, 29(a3)
+  sb     a1, 28(a3)
+  sb     a1, 27(a3)
+  sb     a1, 26(a3)
+  sb     a1, 25(a3)
+  sb     a1, 24(a3)
+  sb     a1, 23(a3)
+  sb     a1, 22(a3)
+  sb     a1, 21(a3)
+  sb     a1, 20(a3)
+  sb     a1, 19(a3)
+  sb     a1, 18(a3)
+  sb     a1, 17(a3)
+  sb     a1, 16(a3)
+  sb     a1, 15(a3)
+  sb     a1, 14(a3)
+  sb     a1, 13(a3)
+  sb     a1, 12(a3)
+  sb     a1, 11(a3)
+  sb     a1, 10(a3)
+  sb     a1, 9(a3)
+  sb     a1, 8(a3)
+#if MV_SZ == 8
+.Ltable_misaligned:
+#endif
+  sb     a1, 7(a3)
+  sb     a1, 6(a3)
+  sb     a1, 5(a3)
+  sb     a1, 4(a3)
+#if MV_SZ == 4
+.Ltable_misaligned:
+#endif
+  sb     a1, 3(a3)
+  sb     a1, 2(a3)
+  sb     a1, 1(a3)
+  sb     a1, 0(a3)
+.option pop
+.Lexit:
+  ret
 #endif
-  .size	memset, .-memset
+.size  memset, .-memset