aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sysdeps/powerpc/powerpc64/le/power10/memset.S256
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/Makefile2
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/bzero.c8
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c14
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/memset-power10.S27
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/memset.c8
6 files changed, 314 insertions, 1 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memset.S b/sysdeps/powerpc/powerpc64/le/power10/memset.S
new file mode 100644
index 0000000..6b8e2cf
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memset.S
@@ -0,0 +1,256 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+ .machine power9
+ENTRY_TOCLESS (MEMSET, 5)
+ CALL_MCOUNT 3
+
+L(_memset):
+ /* Assume memset of zero length is uncommon, and just let it go
+ through the small path below. */
+ cmpldi r5,64
+
+ /* Replicate byte to quad word. */
+ mtvsrd v0+32,r4
+ vspltb v0,v0,7
+
+ li r7,16
+ sldi r8,r7,56
+
+ bgt L(large)
+
+ /* For short lengths we want to avoid as many branches as possible.
+ We use store VSX vector with length instructions to do this.
+ It takes advantage of the fact that if the length passed to stxvl
+ is zero nothing is done, effectively a no-op. */
+ sldi r5,r5,56
+
+ addi r10,r3,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11 /* Saturate the subtraction to zero. */
+
+ stxvl v0+32,r3,r5
+ stxvl v0+32,r10,r11
+
+ addi r9,r3,32
+ addi r10,r3,48
+
+ sub. r11,r11,r8
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(large):
+ mr r6,r3 /* Don't modify r3 since we need to return it. */
+
+ /* Get dest 16B aligned. */
+ neg r0,r3
+ clrldi. r7,r0,(64-4)
+ beq L(aligned)
+ rldic r9,r0,56,4 /* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56". */
+
+ stxvl v0+32,r6,r9 /* Store up to 15B until aligned address. */
+
+ add r6,r6,r7
+ sub r5,r5,r7
+
+ /* Go to tail if there is less than 64B left after alignment. */
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(aligned):
+ /* Go to tail if there is less than 128B left after alignment. */
+ srdi. r0,r5,7
+ beq L(tail_128)
+
+ /* If c == 0 && n >= 256 use dcbz to zero out full cache blocks. */
+ cmpldi cr5,r5,255
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(dcbz)
+
+ mtctr r0
+
+ .balign 32
+L(loop):
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ stxv v0+32,64(r6)
+ stxv v0+32,80(r6)
+ stxv v0+32,96(r6)
+ stxv v0+32,112(r6)
+ addi r6,r6,128
+ bdnz L(loop)
+
+ .balign 16
+L(tail):
+ /* 127B or less left, finish the tail or return. */
+ andi. r5,r5,127
+ beqlr
+
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(tail_128):
+ /* Stores a minimum of 64B and up to 128B and return. */
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+ andi. r5,r5,63
+ beqlr
+
+ .balign 16
+L(tail_64):
+ /* Stores up to 64B and return. */
+ sldi r5,r5,56
+
+ addi r10,r6,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11
+
+ stxvl v0+32,r6,r5
+ stxvl v0+32,r10,r11
+
+ sub. r11,r11,r8
+ blelr
+
+ addi r9,r6,32
+ addi r10,r6,48
+
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(dcbz):
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
+ Before using dcbz though, we need to get the destination 128-byte
+ aligned. */
+ neg r0,r6
+ clrldi. r0,r0,(64-7)
+ beq L(dcbz_aligned)
+
+ sub r5,r5,r0
+ mtocrf 0x2,r0 /* copying bits 57..59 to cr6. The ones for sizes 64,
+ 32 and 16 which need to be checked. */
+
+ /* Write 16-128 bytes until DST is aligned to 128 bytes. */
+64: bf 25,32f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+
+32: bf 26,16f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ addi r6,r6,32
+
+16: bf 27,L(dcbz_aligned)
+ stxv v0+32,0(r6)
+ addi r6,r6,16
+
+ .balign 16
+L(dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi. r0,r5,9
+ li r9,128
+ beq L(bcdz_tail)
+ li r10,256
+ li r11,384
+ mtctr r0
+
+ .balign 16
+L(dcbz_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r6
+ dcbz r9,r6
+ dcbz r10,r6
+ dcbz r11,r6
+ addi r6,r6,512
+ bdnz L(dcbz_loop)
+
+ andi. r5,r5,511
+ beqlr
+
+ .balign 16
+L(bcdz_tail):
+ /* We have 1-511 bytes remaining. */
+ srdi. r0,r5,7
+ beq L(tail)
+
+ mtocrf 0x1,r0
+
+256: bf 30,128f
+ dcbz 0,r6
+ dcbz r9,r6
+ addi r6,r6,256
+
+128: bf 31,L(tail)
+ dcbz 0,r6
+ addi r6,r6,128
+
+ b L(tail)
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY_TOCLESS (__bzero)
+ CALL_MCOUNT 2
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 9ef12d3..ea50b61 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memcpy-power10 memmove-power10 \
+sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
strlen-power10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index c3f819f..50a5320 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -27,8 +27,16 @@ extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
extern __typeof (bzero) __bzero_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (bzero) __bzero_power10 attribute_hidden;
+# endif
libc_ifunc (__bzero,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __bzero_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 1ab56bb..49d9a33 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -86,6 +86,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memset,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __memset_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -187,6 +194,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, bzero,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __bzero_power10)
+#endif
IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
new file mode 100644
index 0000000..548e997
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
@@ -0,0 +1,27 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMSET __memset_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index d483f66..6562646 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -33,10 +33,18 @@ extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
+# endif
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memset_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)