aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-evex.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcat-evex.S291
-rw-r--r--sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S110
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-evex.S1282
-rw-r--r--sysdeps/x86_64/multiarch/strncat-evex.S525
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-evex.S995
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h80
7 files changed, 2115 insertions, 1173 deletions
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a..3693491 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df94..b4207b7 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_evex
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpb $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpb $0, (%rax), %YMMZERO, %k0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- kmovd %k0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- kmovd %k4, %edx
- add $(VEC_SIZE * 4), %rax
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 5), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
- add $VEC_SIZE, %rax
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- VMOVA (%rax), %YMM0
- VMOVA (VEC_SIZE * 2)(%rax), %YMM1
- vpminub VEC_SIZE(%rax), %YMM0, %YMM0
- vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
- vpminub %YMM0, %YMM1, %YMM0
- /* If K0 != 0, there is a null byte. */
- vpcmpb $0, %YMM0, %YMMZERO, %k0
- add $(VEC_SIZE * 4), %rax
- ktestd %k0, %k0
- jz L(align_four_vec_loop)
-
- vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
- sub $(VEC_SIZE * 5), %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT __strcat_evex
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
new file mode 100644
index 0000000..9530d7b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
@@ -0,0 +1,110 @@
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+ strncat-evex and cannot stand alone. Before including it, %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen: leaves %rdi at the first null CHAR and ends at
+ L(strcat_strlen_done). Writes %r8, %rcx, VZERO, VMM(0-3), k0/k1/k3. */
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128 /* VZERO = 0: comparand for the null-CHAR search. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8 /* r8 = rdi rounded down to a VEC_SIZE boundary. */
+ VPCMPEQ (%r8), %VZERO, %k0 /* k0 = mask of null CHARs in the aligned vector containing rdi. */
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSCPY
+ subl %r8d, %edi /* edi = byte offset of the string start within that vector... */
+ shrl $2, %edi /* ...converted to a CHAR index (bytes / 4). */
+#endif
+ shrx %VRDI, %VRCX, %VRCX /* Drop mask bits for CHARs before the string start (shrx masks the count to the operand width; presumably VRDI/VRCX are mask-width GPR views from reg-macros.h -- confirm). */
+#ifdef USE_AS_WCSCPY
+ movq %rax, %rdi /* rdi was clobbered above; reload it from the copy the includer saved in rax. */
+#endif
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0) /* Null in the first vector: result is rdi + index. */
+
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 /* Check aligned vector 1. */
+ KMOV %k0, %VRCX
+ leaq (VEC_SIZE)(%r8), %rdi /* From here on rdi = r8 + VEC_SIZE is the base the exit labels add to. */
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 /* Check aligned vector 2 (= rdi + VEC_SIZE). */
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 /* Check aligned vector 3 (= rdi + VEC_SIZE * 2). */
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 /* Check aligned vector 4 (= rdi + VEC_SIZE * 3). */
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v3)
+
+ andq $-(VEC_SIZE * 4), %rdi /* Round rdi down to a VEC_SIZE * 4 boundary; the loop may re-check vectors already tested above. */
+ .p2align 4,, 8
+L(loop_2x_vec): /* Each iteration tests the four vectors at rdi + VEC_SIZE * 4..7. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) /* VMM(1) = min(vec0, vec1): has a zero CHAR iff either does. */
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) /* VMM(3) = min(vec2, vec3). */
+ VPTESTN %VMM(1), %VMM(1), %k1 /* k1 = zero CHARs of VMM(1). */
+ VPTESTN %VMM(3), %VMM(3), %k3 /* k3 = zero CHARs of VMM(3). */
+ subq $(VEC_SIZE * -4), %rdi /* rdi += VEC_SIZE * 4; the vectors just loaded are now at rdi + 0..3 * VEC_SIZE. */
+ KORTEST %k1, %k3
+ jz L(loop_2x_vec) /* No null CHAR in any of the four vectors: keep scanning. */
+
+ VPTESTN %VMM(0), %VMM(0), %k0 /* Null in vec0? */
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ KMOV %k1, %VRCX /* vec0 has no null, so any bit in k1 locates a null in vec1. */
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPTESTN %VMM(2), %VMM(2), %k0 /* Null in vec2? */
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ KMOV %k3, %VRCX /* Otherwise the null is in vec3; fall through. */
+L(bsf_and_done_v3): /* rdi += VEC_SIZE * 3 + index * CHAR_SIZE (via the v2 path below). */
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v2): /* rdi += VEC_SIZE * 2 + index * CHAR_SIZE. */
+ bsf %VRCX, %VRCX /* rcx = index of the first null CHAR in the vector. */
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1): /* rdi += VEC_SIZE + index * CHAR_SIZE (via the v0 path below). */
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0): /* rdi += index * CHAR_SIZE. */
+ bsf %VRCX, %VRCX /* rcx = index of the first null CHAR in the vector. */
+#ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+ addq %rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac..932129a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,990 +17,526 @@
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
-
#if ISA_SHOULD_BUILD (4)
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifndef STRCPY
-# define STRCPY __strcpy_evex
-# endif
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-# define VEC_SIZE 32
+# ifndef STRCPY
+# define STRCPY __strcpy_evex
# endif
-# define XMM2 xmm18
-# define XMM3 xmm19
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-# define YMM7 ymm23
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
-# ifndef USE_AS_STRCAT
+# define REP_MOVS rep movsd
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
+# define USE_WIDE_CHAR
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# define REP_MOVS rep movsb
# endif
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpb $0, (%rsi), %YMMZERO, %k0
- kmovd %k0, %edx
- shr %cl, %rdx
+# include "reg-macros.h"
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
-
- vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
- kmovd %k1, %edx
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
- VMOVU %YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- VMOVA (%rsi, %rcx), %YMM2
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx, CHAR_SIZE
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG edx
+# define PAGE_ALIGN_REG_64 rdx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
+# define PAGE_ALIGN_REG_64 rax
# endif
- VMOVU %YMM3, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
- vpcmpb $0, %YMM4, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- VMOVU %YMM4, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- VMOVU %YMM2, (%rdi, %rcx)
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea VEC_SIZE(%rsi, %rcx), %rsi
- and $-(VEC_SIZE * 4), %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea (VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
- VMOVA (%rsi), %YMM4
- VMOVA VEC_SIZE(%rsi), %YMM5
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM5, %YMM4, %YMM2
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+ movq %rdi, %rax
+# include "strcat-strlen-evex.h.S"
# endif
- test %edx, %edx
- jnz L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
- add $(VEC_SIZE * 4), %rdi
- add $(VEC_SIZE * 4), %rsi
- VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
- VMOVA (%rsi), %YMM4
- VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
- VMOVA VEC_SIZE(%rsi), %YMM5
- vpminub %YMM5, %YMM4, %YMM2
- VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVU %YMM7, -VEC_SIZE(%rdi)
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+
+ movl %esi, %PAGE_ALIGN_REG
+ andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+ cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+ ja L(page_cross)
+L(page_cross_continue):
+ VMOVU (%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- test %edx, %edx
- jz L(UnalignedFourVecSizeLoop_start)
-L(UnalignedFourVecSizeLeave):
- vpcmpb $0, %YMM4, %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_0)
- vpcmpb $0, %YMM5, %YMMZERO, %k2
- kmovd %k2, %ecx
- test %ecx, %ecx
- jnz L(CopyVecSizeUnaligned_16)
+ /* Two short string implementations. One with traditional
+ branching approach and one with masked instructions (which
+ have potential for dramatically bad perf if dst splits a
+ page and is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ VPTEST %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ subl $((1 << CHAR_PER_VEC)- 1), %VRCX
+# else
+ inc %VRCX
+# endif
+ jz L(more_1x_vec)
+ KMOV %VRCX, %k1
+ KXOR %k0, %k1, %k1
- vpcmpb $0, %YMM6, %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_32)
-
- vpcmpb $0, %YMM7, %YMMZERO, %k4
- kmovd %k4, %ecx
- bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $(VEC_SIZE * 3), %rsi
- add $(VEC_SIZE * 3), %rdi
- jmp L(CopyVecSizeExit)
-# endif
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
-/* If source address alignment == destination address alignment */
+# ifdef USE_AS_STPCPY
+ bsf %VRCX, %VRCX
+ leaq (%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
-L(SourceStringAlignmentLessTwoVecSize):
- VMOVU (%rsi), %YMM3
- VMOVU VEC_SIZE(%rsi), %YMM2
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
+# else
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jz L(more_1x_vec)
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $VEC_SIZE, %r8
+ xorl %edx, %edx
+ bsf %VRCX, %VRDX
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# endif
+
+ /* Use mask bits in rcx to detect which copy we need. If the low
+ mask is zero then there must be a bit set in the upper half.
+ I.e if rcx != 0 and ecx == 0, then match must be upper 32
+ bits so we use L(copy_32_63). */
+# if VEC_SIZE == 64
+# ifdef USE_AS_WCSCPY
+ testb %cl, %cl
+# else
+ testl %ecx, %ecx
+# endif
+ jz L(copy_32_63)
+# endif
+
+# ifdef USE_AS_WCSCPY
+ testb $0xf, %cl
# else
- cmp $(VEC_SIZE + 1), %r8
+ testw %cx, %cx
# endif
- jbe L(CopyVecSizeTail1Case2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail1)
+ jz L(copy_16_31)
- VMOVU %YMM3, (%rdi)
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $(VEC_SIZE * 2), %r8
+# ifdef USE_AS_WCSCPY
+ testb $0x3, %cl
# else
- cmp $((VEC_SIZE * 2) + 1), %r8
+ testb %cl, %cl
# endif
- jbe L(CopyTwoVecSize1Case2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize1)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
- jmp L(UnalignVecSizeBoth)
+ jz L(copy_8_15)
-/*------End of main part with loops---------------------*/
-/* Case1 */
+# ifdef USE_AS_WCSCPY
+ vmovd %VMM_128(0), (%rdi)
+ /* No need to copy, we know it's zero. */
+ movl $0, (%END_REG)
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyVecSize):
- add %rcx, %rdi
-# endif
-L(CopyVecSizeTail):
- add %rcx, %rsi
-L(CopyVecSizeTail1):
- bsf %edx, %edx
-L(CopyVecSizeExit):
- cmp $32, %edx
- jae L(Exit32_63)
- cmp $16, %edx
- jae L(Exit16_31)
- cmp $8, %edx
- jae L(Exit8_15)
- cmp $4, %edx
- jae L(Exit4_7)
- cmp $3, %edx
- je L(Exit3)
- cmp $1, %edx
- ja L(Exit2)
- je L(Exit1)
- movb $0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
ret
+# else
- .p2align 4
-L(CopyTwoVecSize1):
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $VEC_SIZE, %r8
-# endif
- jmp L(CopyVecSizeTail1)
-
- .p2align 4
-L(CopyTwoVecSize):
- bsf %edx, %edx
- add %rcx, %rsi
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- jmp L(CopyVecSizeExit)
-
- .p2align 4
-L(CopyVecSizeUnaligned_0):
- bsf %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- VMOVU %YMM4, (%rdi)
- add $((VEC_SIZE * 4) - 1), %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- jmp L(CopyVecSizeExit)
-# endif
+ testb $0x7, %cl
+ jz L(copy_4_7)
- .p2align 4
-L(CopyVecSizeUnaligned_16):
- bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea VEC_SIZE(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM5, VEC_SIZE(%rdi)
- add $((VEC_SIZE * 3) - 1), %r8
- sub %rdx, %r8
- lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
- jmp L(CopyVecSizeExit)
-# endif
- .p2align 4
-L(CopyVecSizeUnaligned_32):
- bsf %edx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- add $((VEC_SIZE * 2) - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $(VEC_SIZE * 2), %rsi
- add $(VEC_SIZE * 2), %rdi
- jmp L(CopyVecSizeExit)
-# endif
+ test %edx, %edx
+ jz L(set_null_term)
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyVecSizeUnalignedVec6):
- VMOVU %YMM6, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec5):
- VMOVU %YMM5, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec4):
- VMOVU %YMM4, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec3):
- VMOVU %YMM3, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
+ /* NB: make this `vmovw` if support for AVX512-FP16 is added.
+ */
+ vmovd %VMM_128(0), %esi
+ movw %si, (%rdi)
+
+ .p2align 4,, 1
+L(set_null_term):
+ /* No need to copy, we know it's zero. */
+ movb $0, (%END_REG)
+ ret
# endif
-/* Case2 */
-
- .p2align 4
-L(CopyVecSizeCase2):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyTwoVecSizeCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyVecSizeTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTailCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
- add $VEC_SIZE, %rdi
- add $VEC_SIZE, %rsi
- sub $VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTail1Case2)
- jmp L(StrncpyExit)
+# if VEC_SIZE == 64
+ .p2align 4,, 6
+L(copy_32_63):
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+ ret
+# endif
+
+
+ .p2align 4,, 6
+L(copy_16_31):
+ /* Use xmm1 explicitly here as it won't require a `vzeroupper`
+ and will save code size. */
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
+ ret
+
+ .p2align 4,, 8
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+ movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+# else
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+# endif
+ vmovq %VMM_128(0), (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
+ ret
# endif
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
- .p2align 4
-L(Exit1):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 12
+L(copy_4_7):
+ movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+ vmovd %VMM_128(0), (%rdi)
+ movl %ecx, -(4 - CHAR_SIZE)(%END_REG)
+ ret
# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+
+
+ .p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rdi)
# endif
- ret
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+ addq %rsi, %rdi
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
- .p2align 4
-L(Exit2):
- movzwl (%rsi), %ecx
- mov %cx, (%rdi)
- movb $0, 2(%rdi)
+ /* Ideally we store after moves to minimize impact of potential
+ false-dependencies. */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rax)
+# endif
+
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), VEC_SIZE(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+ /* Align for 4x loop. */
+ subq %rsi, %rdi
+
+ /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+ we covered before aligning. */
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $-(VEC_SIZE * 4), %rsi
+
+
+ /* Load first half of the loop before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+ subq $(VEC_SIZE * -4), %rsi
+
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ /* Restore %rdi: it held (dst - src) during the loop. */
+ addq %rsi, %rdi
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x0_end)
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ /* Place L(ret_vec_x3) here to save code size. We get a
+ meaningful benefit doing this for stpcpy. */
+ KMOV %k4, %VRDX
+L(ret_vec_x3):
+ bsf %VRDX, %VRDX
+ VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
+L(return_end):
ret
- .p2align 4
-L(Exit3):
- mov (%rsi), %edx
- mov %edx, (%rdi)
+ .p2align 4,, 6
+L(ret_vec_x0_end):
+ bsf %VRCX, %VRCX
# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
+ inc %VRCX
+ VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
- .p2align 4
-L(Exit4_7):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov -3(%rsi, %rdx), %ecx
- mov %ecx, -3(%rdi, %rdx)
+ .p2align 4,, 8
+L(ret_vec_x1):
+ bsf %VRCX, %VRCX
+ VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit8_15):
- mov (%rsi), %rcx
- mov -7(%rsi, %rdx), %r9
- mov %rcx, (%rdi)
- mov %r9, -7(%rdi, %rdx)
+ .p2align 4,, 4
+L(ret_vec_x2):
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit16_31):
- VMOVU (%rsi), %XMM2
- VMOVU -15(%rsi, %rdx), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -15(%rdi, %rdx)
+ /* ret_vec_x3 reuses return code after the loop. */
+ .p2align 4,, 6
+L(ret_vec_x4):
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit32_63):
- VMOVU (%rsi), %YMM2
- VMOVU -31(%rsi, %rdx), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
+
+ .p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ movq %rsi, %rcx
+ andq $(VEC_SIZE * -1), %rcx
+
+ VPCMPEQ (%rcx), %VZERO, %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG
+ shrl $2, %PAGE_ALIGN_REG
# endif
- ret
+ shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+ /* Optimizing more aggressively for space as this is very cold
+ code. This saves 2x cache lines. */
- .p2align 4
-L(StrncpyExit1):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 1(%rdi)
+ /* This adds one to the later result which will get correct
+ copy bounds. NB: this can never zero-out a non-zero RCX as
+ to be in the page cross case rsi cannot be aligned and we
+ already right-shift rcx by the misalignment. */
+ shl %VRCX
+ jz L(page_cross_continue)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- ret
+ bsf %VRCX, %VRCX
+ REP_MOVS
- .p2align 4
-L(StrncpyExit2):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 2(%rdi)
+ leaq -CHAR_SIZE(%rdi), %rax
# endif
ret
- .p2align 4
-L(StrncpyExit3_4):
- movzwl (%rsi), %ecx
- movzwl -2(%rsi, %r8), %edx
- mov %cx, (%rdi)
- mov %dx, -2(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
- .p2align 4
-L(StrncpyExit5_8):
- mov (%rsi), %ecx
- mov -4(%rsi, %r8), %edx
- mov %ecx, (%rdi)
- mov %edx, -4(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
+# else
+ /* Check if we found zero-char before end of page. */
+ test %VRCX, %VRCX
+ jz L(page_cross_continue)
- .p2align 4
-L(StrncpyExit9_16):
- mov (%rsi), %rcx
- mov -8(%rsi, %r8), %rdx
- mov %rcx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
+ /* Traditional copy case, essentially same as used in non-page-
+ cross case but since we can't reuse VMM(0) we need twice as
+ many loads from rsi. */
- .p2align 4
-L(StrncpyExit17_32):
- VMOVU (%rsi), %XMM2
- VMOVU -16(%rsi, %r8), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -16(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
+# ifndef USE_AS_STRCAT
+ xorl %edx, %edx
# endif
- ret
-
- .p2align 4
-L(StrncpyExit33_64):
- /* 0/32, 31/16 */
- VMOVU (%rsi), %YMM2
- VMOVU -VEC_SIZE(%rsi, %r8), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -VEC_SIZE(%rdi, %r8)
+ /* Dependency on rdi must already have been satisfied. */
+ bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# elif !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
- .p2align 4
-L(StrncpyExit65):
- /* 0/32, 32/32, 64/1 */
- VMOVU (%rsi), %YMM2
- VMOVU 32(%rsi), %YMM3
- mov 64(%rsi), %cl
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, 32(%rdi)
- mov %cl, 64(%rdi)
-# ifdef USE_AS_STPCPY
- lea 65(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 65(%rdi)
+# if VEC_SIZE == 64
+# ifdef USE_AS_WCSCPY
+ testb %cl, %cl
+# else
+ test %ecx, %ecx
+# endif
+ jz L(page_cross_copy_32_63)
# endif
- ret
-
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- ret
+# ifdef USE_AS_WCSCPY
+ testb $0xf, %cl
+# else
+ testw %cx, %cx
+# endif
+ jz L(page_cross_copy_16_31)
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- ret
+# ifdef USE_AS_WCSCPY
+ testb $0x3, %cl
+# else
+ testb %cl, %cl
+# endif
+ jz L(page_cross_copy_8_15)
- .p2align 4
-L(Fill3_4):
- mov %dx, (%rdi)
- mov %dx, -2(%rdi, %r8)
+# ifdef USE_AS_WCSCPY
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl $0, (%END_REG)
ret
+# else
- .p2align 4
-L(Fill5_8):
- mov %edx, (%rdi)
- mov %edx, -4(%rdi, %r8)
- ret
+ testb $0x7, %cl
+ jz L(page_cross_copy_4_7)
- .p2align 4
-L(Fill9_16):
- mov %rdx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
+ test %edx, %edx
+ jz L(page_cross_set_null_term)
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+L(page_cross_set_null_term):
+ movb $0, (%END_REG)
ret
- .p2align 4
-L(Fill17_32):
- VMOVU %XMMZERO, (%rdi)
- VMOVU %XMMZERO, -16(%rdi, %r8)
- ret
- .p2align 4
-L(CopyVecSizeUnalignedVec2):
- VMOVU %YMM2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyVecSizeVecExit):
- bsf %edx, %edx
- add $(VEC_SIZE - 1), %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- xor %edx, %edx
- sub $VEC_SIZE, %r8
- jbe L(StrncpyFillExit)
-
- VMOVU %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
-
- mov %rdi, %rsi
- and $(VEC_SIZE - 1), %esi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $(VEC_SIZE * 4), %r8
- jb L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE * 4), %rdi
- sub $(VEC_SIZE * 4), %r8
- jae L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
- add $(VEC_SIZE * 2), %r8
- jl L(StrncpyFillLessTwoVecSize)
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
- add $(VEC_SIZE * 2), %rdi
- sub $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillLessTwoVecSize):
- add $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillExit):
- add $VEC_SIZE, %r8
-L(Fill):
- cmp $17, %r8d
- jae L(Fill17_32)
- cmp $9, %r8d
- jae L(Fill9_16)
- cmp $5, %r8d
- jae L(Fill5_8)
- cmp $3, %r8d
- jae L(Fill3_4)
- cmp $1, %r8d
- ja L(Fill2)
- je L(Fill1)
+ .p2align 4,, 4
+L(page_cross_copy_4_7):
+ movl (%rsi), %ecx
+ movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+ movl %ecx, (%rdi)
+ movl %esi, -(4 - CHAR_SIZE)(%END_REG)
ret
-
-/* end of ifndef USE_AS_STRCAT */
# endif
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
- lea (VEC_SIZE * 4)(%r8), %rcx
- and $-VEC_SIZE, %rcx
- add $(VEC_SIZE * 3), %r8
- jl L(CopyVecSizeCase3)
- VMOVU %YMM4, (%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM5, VEC_SIZE(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 4)(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (VEC_SIZE * 4)(%rdi)
-# endif
+# if VEC_SIZE == 64
+ .p2align 4,, 4
+L(page_cross_copy_32_63):
+ VMOVU (%rsi), %VMM_256(0)
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
ret
-
- .p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
- xor %ecx, %ecx
- vpcmpb $0, %YMM4, %YMMZERO, %k1
- kmovd %k1, %edx
- add $(VEC_SIZE * 3), %r8
- jle L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
- vpcmpb $0, %YMM5, %YMMZERO, %k2
- kmovd %k2, %edx
- VMOVU %YMM4, (%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec5)
-# else
- jnz L(CopyVecSize)
# endif
- vpcmpb $0, %YMM6, %YMMZERO, %k3
- kmovd %k3, %edx
- VMOVU %YMM5, VEC_SIZE(%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec6)
-# else
- jnz L(CopyVecSize)
-# endif
-
- vpcmpb $0, %YMM7, %YMMZERO, %k4
- kmovd %k4, %edx
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- lea VEC_SIZE(%rdi, %rcx), %rdi
- lea VEC_SIZE(%rsi, %rcx), %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
-L(StrncpyExit):
- cmp $65, %r8d
- je L(StrncpyExit65)
- cmp $33, %r8d
- jae L(StrncpyExit33_64)
- cmp $17, %r8d
- jae L(StrncpyExit17_32)
- cmp $9, %r8d
- jae L(StrncpyExit9_16)
- cmp $5, %r8d
- jae L(StrncpyExit5_8)
- cmp $3, %r8d
- jae L(StrncpyExit3_4)
- cmp $1, %r8d
- ja L(StrncpyExit2)
- je L(StrncpyExit1)
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi)
-# endif
+ .p2align 4,, 4
+L(page_cross_copy_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
ret
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
+ .p2align 4,, 4
+L(page_cross_copy_8_15):
+ movq (%rsi), %rcx
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+ movq %rcx, (%rdi)
+ movq %rsi, -(8 - CHAR_SIZE)(%END_REG)
ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
# endif
+END(STRCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19b..bced4e8 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT __strncat_evex
-#endif
+/* {wcs|str}ncat with 256/512-bit EVEX.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+# define STRNCAT __strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+# define MOVCHAR movl
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
+
+# define REP_MOVS rep movsd
+
+# define VMASK_REG VR10
+# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+# define USE_WIDE_CHAR
+# else
+# define MOVCHAR movb
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
+
+# define REP_MOVS rep movsb
+
+# define VMASK_REG VRCX
+# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
+
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+ .section SECTION(.text), "ax", @progbits
+/* strncat/wcsncat entry. Register use below: rdi = dst, rsi = src,
+ rdx = n (in CHARs), rax = return value. */
+ENTRY(STRNCAT)
+ /* The return value is the original dst pointer. */
+ movq %rdi, %rax
+
+ /* NB: It's safe to filter out zero-length strings WITHOUT
+ setting null-term. Destination MUST be a null-terminated
+ string so essentially the work is already done. */
+# ifdef USE_AS_WCSCPY
+ /* rcx = n - 1. A set bit in the top byte means n was zero
+ (wrapped around) or n is implausibly large. L(zero_len)
+ tells the two cases apart. */
+ leaq -1(%rdx), %rcx
+ shrq $56, %rcx
+ jnz L(zero_len)
+# else
+ /* Signed check: taken for n == 0 and for n with the sign bit
+ set. L(zero_len) tells the two cases apart. */
+ test %rdx, %rdx
+ jle L(zero_len)
+# endif
+
+ /* NOTE(review): presumably this fragment advances rdi to the
+ terminator of dst and zeroes VZERO -- confirm in the header. */
+# include "strcat-strlen-evex.h.S"
+
+ /* ecx = offset of src inside its page. When a full VEC load
+ from src would cross into the next page take the careful
+ path. */
+ movl %esi, %ecx
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(page_cross)
+L(page_cross_continue):
+ /* Load the first VEC of src. k0 gets one bit set for every
+ zero CHAR in it. */
+ VMOVU (%rsi), %VMM(0)
+ VPTESTN %VMM(0), %VMM(0), %k0
+
+ /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+ <= CHAR_PER_VEC with masked instructions (which have
+ potential for dramatically bad perf if dst splits a page and
+ is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ KMOV %k0, %VRCX
+ FIND_FIRST_ONE (VRCX, VR8)
+ cmpq %r8, %rdx
+ jbe L(less_1x_vec)
+
+ test %VRCX, %VRCX
+ jz L(more_1x_vec)
+
+ /* Mask of every CHAR up to and including the first zero CHAR,
+ so the masked store also writes the terminator. */
+ blsmsk %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
+ ret
+
+L(less_1x_vec):
+ /* Length limited copy: build a mask of the low rdx bits, write
+ the terminator at dst[rdx] and store rdx CHARs. */
+ mov $-1, %VRCX
+ bzhi %VRDX, %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+ ret
+# else
+ KMOV %k0, %VMASK_REG
+ /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+ %VMASK_REG, %VRCX` for wcsncat. */
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ /* n <= index of the first zero CHAR: the copy is bounded by n
+ (rdx already holds the CHAR count to copy). */
+ cmpq %rcx, %rdx
+ jbe L(less_1x_vec)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ je L(more_1x_vec)
+
+ /* src ends inside this VEC before n is reached: copy exactly
+ the string length. */
+ movl %ecx, %edx
+
+L(less_1x_vec):
+ /* edx = number of CHARs to copy, terminator excluded. Dispatch
+ on the size class. Each case stores a head taken from VMM(0)
+ plus a tail ending at CHAR edx, then writes the terminator at
+ dst[edx]. */
+# if VEC_SIZE == 64
+ cmpl $(32 / CHAR_SIZE), %edx
+ jae L(copy_32_63)
+# endif
+
+ cmpl $(16 / CHAR_SIZE), %edx
+ jae L(copy_16_31)
+
+
+ cmpl $(8 / CHAR_SIZE), %edx
+ jae L(copy_8_15)
+
+# ifdef USE_AS_WCSCPY
+ /* At most one wide CHAR to copy: store CHAR 0, then the
+ terminator at dst[edx]. */
+ vmovd %VMM_128(0), (%rdi)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# else
+
+ cmpl $4, %edx
+ jae L(copy_4_7)
+
+ /* 0 to 3 bytes. Byte 0 is kept in ecx and stored at
+ L(set_null_term). For 2 or 3 bytes, bytes 1-2 are copied as
+ one 16-bit store first. */
+ movzbl (%rsi), %ecx
+ cmpl $1, %edx
+ jbe L(set_null_term)
+
+ movzwl 1(%rsi), %esi
+ movw %si, 1(%rdi)
+
+ .p2align 4,, 1
+L(set_null_term):
+ /* The terminator is written last so it overrides any byte
+ stored at dst[edx] above. */
+ movb %cl, (%rdi)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+# endif
+
+# if VEC_SIZE == 64
+ .p2align 4,, 6
+L(copy_32_63):
+ /* Head: low 32 bytes of VMM(0). Tail: the 32 bytes ending at
+ CHAR edx. */
+ VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# endif
+ .p2align 4,, 6
+L(copy_16_31):
+ /* Use xmm1 explicitly here as it won't require a `vzeroupper`
+ and will save code size. */
+ vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 2
+L(copy_8_15):
+ /* Head: low 8 bytes of VMM(0). Tail: the 8 bytes ending at
+ CHAR edx. */
+ movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+ vmovq %VMM_128(0), (%rdi)
+ movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 12
+L(copy_4_7):
+ /* Head: low 4 bytes of VMM(0). Tail: the 4 bytes ending at
+ byte edx. */
+ movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+ vmovd %VMM_128(0), (%rdi)
+ movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# endif
+
+# endif
+ .p2align 4,, 4
+L(zero_len):
+ /* Reached from the entry filter for n == 0 or an out-of-range
+ n. The byte variant still has the flags of the entry `test`
+ live, the wide variant has to test rdx again. n == 0 returns
+ dst (already in rax), any other value is handed to
+ OVERFLOW_STRCAT. */
+# ifdef USE_AS_WCSCPY
+ test %rdx, %rdx
+# endif
+ jne OVERFLOW_STRCAT
+ ret
-#define USE_AS_STRNCAT
-#define STRCAT STRNCAT
-#include "strcat-evex.S"
+ .p2align 4,, 8
+L(more_1x_vec):
+ /* First VEC holds no zero CHAR and n exceeds it: store it
+ whole. */
+ VMOVU %VMM(0), (%rdi)
+
+ /* We are going to align rsi here so will need to be able to re-
+ adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+ so rsi + rdx * CHAR_SIZE cannot overflow. */
+
+ /* rdx = src + n * CHAR_SIZE - VEC_SIZE, rdi = dst - src. Both
+ are re-based on the aligned rsi at L(loop_last_4x_vec). */
+ leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+ /* rdi = dst position matching the aligned rsi. rdx = CHARs
+ still allowed, counted from rsi + VEC_SIZE (the byte
+ distance is converted to CHARs for the wide variant). */
+ addq %rsi, %rdi
+ subq %rsi, %rdx
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+
+ /* Will need this regardless. */
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VMASK_REG
+
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ /* At most 2 VECs left. ecx = index of the first zero CHAR in
+ VEC 1. If the length limit comes first (edx <= ecx) the copy
+ is bounded by edx. */
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ jne L(ret_vec_x1)
+
+ /* VEC 1 is copied whole. For VEC 2 bzhi keeps only the mask
+ bits below the remaining length edx. ZF set means the length
+ limit is hit before any zero CHAR. */
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ addl $-CHAR_PER_VEC, %edx
+ bzhi %VRDX, %VRCX, %VR8
+ jz L(ret_vec_x2_len)
+L(ret_vec_x2):
+ /* rdx = index of the first zero CHAR in VEC 2. */
+ bsf %VRCX, %VRDX
+L(ret_vec_x2_len):
+ /* Copy the VEC_SIZE bytes that end right before CHAR rdx of
+ VEC 2 and write the terminator at that CHAR. */
+ VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 4
+L(ret_vec_x1_len):
+ /* Length limited: stop at CHAR edx of VEC 1. */
+ movl %edx, %ecx
+L(ret_vec_x1):
+ /* Copy the VEC_SIZE bytes that end right before CHAR rcx of
+ VEC 1 and write the terminator at that CHAR.
+ NOTE(review): the sibling exits use a plain `ret`. Confirm
+ that VZEROUPPER_RETURN is equivalent for the EVEX build. */
+ VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ VZEROUPPER_RETURN
+
+
+ .p2align 4,, 8
+L(last_4x_vec):
+ /* VECs 1-4 are already stored. Drop them from the remaining
+ length, preload VEC 5 and advance both pointers by 4 VECs so
+ that it becomes the new VEC 1. */
+ addl $-(CHAR_PER_VEC * 4), %edx
+ VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VMASK_REG
+ subq $-(VEC_SIZE * 4), %rsi
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(CHAR_PER_VEC * 2), %edx
+ jbe L(last_2x_vec)
+ .p2align 4,, 8
+L(more_2x_vec):
+ /* More than 2 VECs may still be copied so VEC 1 and VEC 2
+ only need the zero CHAR check, not a length check.
+ NOTE(review): the wide-only xor presumably gives ecx a
+ defined value because the mask lives in VR10 there --
+ confirm. */
+# ifdef USE_AS_WCSCPY
+ xorl %ecx, %ecx
+# endif
+ bsf %VMASK_REG, %VRCX
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VMASK_REG
+
+ cmpq $(CHAR_PER_VEC * 4), %rdx
+ ja L(more_4x_vec)
+
+ /* Adjust length before going to L(ret_vec_x3_len) or
+ L(ret_vec_x3). */
+ addl $(CHAR_PER_VEC * -2), %edx
+
+ /* Same scheme as L(last_2x_vec), applied to VEC 3 and VEC 4. */
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ jne L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ addl $-CHAR_PER_VEC, %edx
+ /* ZF set: no zero CHAR below index edx, the length limit is hit
+ first. */
+ bzhi %VRDX, %VRCX, %VR8
+ jz L(ret_vec_x4_len)
+L(ret_vec_x4):
+ /* rdx = index of the first zero CHAR in VEC 4. */
+ bsf %VRCX, %VRDX
+L(ret_vec_x4_len):
+ /* Copy the VEC_SIZE bytes that end right before CHAR rdx of
+ VEC 4 and write the terminator at that CHAR. */
+ VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 4
+L(ret_vec_x3_len):
+ /* Length limited: stop at CHAR edx of VEC 3. */
+ movl %edx, %ecx
+L(ret_vec_x3):
+ /* Copy the VEC_SIZE bytes that end right before CHAR rcx of
+ VEC 3 and write the terminator at that CHAR. */
+ VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+ /* More than 4 VECs may still be copied so VEC 3 and VEC 4
+ only need the zero CHAR check.
+ NOTE(review): the wide-only xor presumably gives ecx a
+ defined value because the mask lives in VR10 there --
+ confirm. */
+# ifdef USE_AS_WCSCPY
+ xorl %ecx, %ecx
+# endif
+ bsf %VMASK_REG, %VRCX
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+
+ /* Check if we are near the end before aligning. */
+ cmpq $(CHAR_PER_VEC * 8), %rdx
+ jbe L(last_4x_vec)
+
+
+ /* Add rsi to rdx (length) before aligning rsi. NB: Since we
+ filtered out huge lengths this cannot overflow. */
+# ifdef USE_AS_WCSCPY
+ leaq (%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rsi, %rdx
+# endif
+
+ /* Subtract rsi from rdi before aligning (add back will have
+ correct rdi for aligned rsi). */
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $(VEC_SIZE * -4), %rsi
+
+ /* Load first half of the loop before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ /* A CHAR of the minimum is zero iff it is zero in either input,
+ so k2 flags zero CHARs of VMM(0)/VMM(1) and k4 those of
+ VMM(2)/VMM(3). */
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+
+ /* Offset rsi by VEC_SIZE so that we can jump to
+ L(loop_last_4x_vec). */
+ addq $-(VEC_SIZE), %rsi
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ /* Store loop end in r9. */
+ leaq -(VEC_SIZE * 5)(%rdx), %r9
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ /* rdi still holds dst - src, so rdi + rsi is the dst position
+ matching rsi. Store the 4 VECs already known to be free of
+ zero CHARs. */
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+ /* Advance by 4 VECs. Once rsi reaches the loop end in r9 the
+ rest is finished by the length-checked tail code. */
+ subq $(VEC_SIZE * -4), %rsi
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ /* A zero CHAR is somewhere in VMM(0)-VMM(3). Check them in
+ order, storing every VEC that turns out to be clean. */
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ /* Restore rdi (dst). */
+ addq %rsi, %rdi
+
+ /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+ test with bsf. */
+ bsf %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+ /* VMM(0) is clean so any bit in k2 belongs to VMM(1).
+ L(ret_vec_x2) does its own bsf. */
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ bsf %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+ /* The zero CHAR is in VMM(3) (VMM(2) is clean so k4 holds its
+ bits). This VEC-sized copy ends on the zero CHAR itself, so
+ no separate terminator store is needed. */
+ KMOV %k4, %VRCX
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ ret
+
+
+ .p2align 4,, 4
+L(page_cross):
+ /* A VEC load from src would cross the page. Compare the
+ aligned VEC that contains src against VZERO instead (this
+ relies on VZERO being all zero at this point). k0 = zero
+ CHAR mask of that aligned VEC. */
+ movq %rsi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %k0
+
+ /* Shift out the mask bits of the CHARs that precede src. The
+ wide variant derives the CHAR shift count from the page
+ offset left in ecx by the entry code. The byte variant uses
+ rsi directly as shrx only looks at the low bits of the
+ count. */
+# ifdef USE_AS_WCSCPY
+ KMOV %k0, %VR9
+ shrl $2, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
+ shrx %VRCX, %VR9, %VRCX
+# else
+ KMOV %k0, %VRCX
+ shrx %VRSI, %VRCX, %VRCX
+# endif
+
+ /* r8 = number of CHARs from src to the end of the aligned VEC.
+ If n does not reach past that point handle the copy in
+ L(page_cross_small). */
+ subl %esi, %r8d
+ andl $(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+ shrl $2, %r8d
+# endif
+ cmpq %r8, %rdx
+ jbe L(page_cross_small)
+ /* Optimizing more for space as this is very cold code. This
+ saves 2x cache lines. */
+
+ /* This adds one to the later result which will get correct
+ copy bounds. NB: this can never zero-out a non-zero RCX as
+ to be in the page cross case rsi cannot be aligned and we
+ already right-shift rcx by the misalignment. */
+ shl %VRCX
+ jz L(page_cross_continue)
+ /* rcx = string length plus one, so the string copy below also
+ copies the terminator. */
+ bsf %VRCX, %VRCX
+ REP_MOVS
+ ret
+
+L(page_cross_small):
+ /* tzcnt sets ZF when the result is 0, i.e. src starts with a
+ zero CHAR: nothing to copy, only terminate. Otherwise copy
+ min (first zero CHAR index, n) CHARs. An empty mask gives
+ the operand width, which the cmova clamps to n. */
+ tzcnt %VRCX, %VRCX
+ jz L(page_cross_setz)
+ cmpl %edx, %ecx
+ cmova %edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+ rep movsd
+# else
+ rep movsb
+# endif
+L(page_cross_setz):
+ /* rdi now points right after the last copied CHAR. */
+ MOVCHAR $0, (%rdi)
+ ret
+END(STRNCAT)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d..49eaf4c 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY __strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+# define STRNCPY __strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define CHAR_SIZE 4
+
+# define REP_MOVS rep movsd
+# define REP_STOS rep stosl
+
+# define USE_WIDE_CHAR
+
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define CHAR_SIZE 1
+
+# define REP_MOVS rep movsb
+# define REP_STOS rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO VMM(7)
+# define VZERO_256 VMM_256(7)
+# define VZERO_128 VMM_128(7)
+
+# if VEC_SIZE == 64
+# define VZERO_HALF VZERO_256
+# else
+# define VZERO_HALF VZERO_128
+# endif
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+ /* Filter zero length strings and very long strings. Zero
+ length strings just return, very long strings are handled by
+ just running rep stos{b|l} to zero set (which will almost
+ certainly segfault), if that succeeds then just calling
+ OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
+# ifdef USE_AS_WCSCPY
+ decq %rdx
+ movq %rdx, %rax
+ /* 56 is end of max supported address space. */
+ shr $56, %rax
+ jnz L(zero_len)
+# else
+ decq %rdx
+ /* If the flag needs to become `jb` replace `dec` with `sub`.
+ */
+ jl L(zero_len)
+# endif
+
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
+ movl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+L(page_cross_continue):
+ VMOVU (%rsi), %VMM(0)
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+
+ /* If not STPCPY just set the return value (dst) ahead of time. */
+# ifndef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+
+ cmpq $(CHAR_PER_VEC), %rdx
+
+ /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+ <= CHAR_PER_VEC with masked instructions (which have
+ potential for dramatically bad perf if dst splits a page and
+ is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ /* `jae` because length rdx is now length - 1. */
+ jae L(more_1x_vec)
+
+ /* If there were multiple zero-CHAR matches in the first VEC,
+ VRCX will be overset but that's fine since any oversets were
+ at zero-positions anyway. */
+
+# ifdef USE_AS_STPCPY
+ tzcnt %VRCX, %VRAX
+ cmpl %eax, %edx
+ cmovb %edx, %eax
+# ifdef USE_AS_WCSCPY
+ adcl $0, %eax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ adcq %rdi, %rax
+# endif
+# endif
+ dec %VRCX
+
+ /* Zero out all non-zero CHAR's after the first zero match. */
+ KMOV %VRCX, %k1
+
+ /* Use VZERO as destination so this can be reused for
+ L(zfill_less_vec) (which if jumped to by subsequent logic
+ will have zeroed out VZERO). */
+ VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+ /* Get mask for what we need to set. */
+ incl %edx
+ mov $-1, %VRCX
+ bzhi %VRDX, %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ VMOVU_MASK %VZERO, (%rdi){%k1}
+ ret
+
+ .p2align 4,, 4
+L(zero_len):
+ cmpq $-1, %rdx
+ jne L(best_effort_strncpy)
+ movq %rdi, %rax
+ ret
+
+ .p2align 4,, 8
+L(more_1x_vec):
+# else
+ /* `jb` because length rdx is now length - 1. */
+ jb L(less_1x_vec)
+# endif
+
+
+ /* This may overset but that's fine because we still need to zero
+ fill. */
+ VMOVU %VMM(0), (%rdi)
+
+
+ /* Length must be >= CHAR_PER_VEC so match here means we must
+ zero-fill. */
+ test %VRCX, %VRCX
+ jnz L(zfill)
+
+
+ /* We are going to align rsi here so will need to be able to re-
+ adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+ so rsi + rdx * CHAR_SIZE cannot overflow. */
+ leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+ addq %rsi, %rdi
+ subq %rsi, %rdx
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+
+ /* -1 because of the `dec %rdx` earlier. */
+ cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ /* This will need to be computed no matter what. We do it
+ ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+ the value of `tzcnt` with a shift. */
+# if CHAR_PER_VEC == 64
+ tzcntq %rcx, %rcx
+# endif
+
+ cmpl $(CHAR_PER_VEC), %edx
+ jb L(ret_vec_x1_len)
+
+ /* Separate logic for CHAR_PER_VEC == 64 because we already did
+ `tzcnt` on VRCX. */
+# if CHAR_PER_VEC == 64
+ /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
+ cmpb $CHAR_PER_VEC, %cl
+ jnz L(ret_vec_x1_no_bsf)
+# else
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+# endif
+
+
+
+ VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ KMOV %k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+ /* This essentially adds CHAR_PER_VEC to computed result. */
+ shlq $CHAR_PER_VEC, %rcx
+# else
+ tzcntq %rcx, %rcx
+ addl $CHAR_PER_VEC, %ecx
+# endif
+
+ .p2align 4,, 4
+L(ret_vec_x1_len):
+ /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+ already been done. */
+# if CHAR_PER_VEC < 64
+ tzcntq %rcx, %rcx
+# endif
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* clear flags. */
+ xorl %ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+ VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+ .p2align 4,, 10
+L(ret_vec_x1):
+ bsf %VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+ VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ subl %ecx, %edx
+ cmpl $CHAR_PER_VEC, %edx
+ jb L(ret_vec_x1_len_no_zfill_mov)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(last_4x_vec):
+ /* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+ $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+ using `movzbl`. */
+# if CHAR_PER_VEC == 64
+ movzbl %dl, %edx
+# else
+ andl $(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+ VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ subq $-(VEC_SIZE * 4), %rsi
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(CHAR_PER_VEC * 2 - 1), %edx
+ jbe L(last_2x_vec)
+ .p2align 4,, 8
+L(more_2x_vec):
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ test %VRCX, %VRCX
+ /* Must fill at least 2x VEC. */
+ jnz L(zfill_vec1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ /* Must fill at least 1x VEC. */
+ jnz L(zfill_vec2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRCX
+
+ /* Check if len is more than 4x VEC. -1 because rdx is len - 1. */
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
+ ja L(more_4x_vec)
+
+ subl $(CHAR_PER_VEC * 3), %edx
+ jb L(ret_vec_x3_len)
+
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ KMOV %k0, %VRCX
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x4_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ movl %ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 4 + 0)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+L(ret_vec_x3_len):
+ addl $(CHAR_PER_VEC * 1), %edx
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* clear flags. */
+ xorl %ecx, %ecx
+# endif
+ .p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+ VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 3 + 0)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsf %VRCX, %VRCX
+ VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+ subl %ecx, %edx
+ jl L(ret_vec_x3_len_no_zfill_mov)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ test %VRCX, %VRCX
+ jnz L(zfill_vec3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec4)
-#define USE_AS_STRNCPY
-#define STRCPY STRNCPY
-#include "strcpy-evex.S"
+ /* Recheck length before aligning. */
+ cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
+ jbe L(last_4x_vec)
+
+ /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */
+# ifdef USE_AS_WCSCPY
+ leaq (%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rsi, %rdx
+# endif
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $(VEC_SIZE * -4), %rsi
+
+
+ /* Load first half of the loop before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+
+
+ /* Offset rsi by VEC_SIZE so that we can jump to
+ L(loop_last_4x_vec). */
+ addq $-(VEC_SIZE), %rsi
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ /* Store loop end in r9. */
+ leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+ subq $(VEC_SIZE * -4), %rsi
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ /* Restore rdx (length). */
+ subq %rsi, %rdx
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ /* Restore rdi (dst). */
+ addq %rsi, %rdi
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec1)
+
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec2)
+
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec3)
+
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+ KMOV %k4, %VRCX
+ // Zfill more....
+
+ .p2align 4,, 4
+L(zfill_vec4):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+ /* VRCX must be non-zero. */
+ bsf %VRCX, %VRCX
+
+ /* Adjust length / dst for zfill. */
+ subq %rcx, %rdx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+ /* From here on out it's just memset(rdi, 0, rdx). */
+ cmpq $CHAR_PER_VEC, %rdx
+ jb L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+ VMOVU %VZERO, (%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
+ ja L(zfill_more_2x_vec)
+L(zfill_done0):
+ ret
+
+ /* Coming from vec1/vec2 we must be able to zfill at least 2x
+ VEC. */
+ .p2align 4,, 8
+L(zfill_vec3):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -2), %rdx
+ .p2align 4,, 2
+L(zfill_vec1):
+ bsfq %rcx, %rcx
+ /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+ */
+ leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+
+ VMOVU %VZERO, (%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jb L(zfill_done0)
+L(zfill_more_2x_vec):
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VZERO, (VEC_SIZE)(%rdi)
+ subq $(CHAR_PER_VEC * 4 - 1), %rdx
+ jbe L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rdi, %rdx
+# endif
+
+ VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+ VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+ VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdi, %rdx
+ jbe L(zfill_done)
+
+ /* Align rdi and zfill loop. */
+ andq $-(VEC_SIZE), %rdi
+ .p2align 4,, 12
+L(zfill_loop_4x_vec):
+ VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdi, %rdx
+ ja L(zfill_loop_4x_vec)
+L(zfill_done):
+ ret
+
+
+ /* Less than 1x VEC case if we are not using evex masked store. */
+# if !USE_EVEX_MASKED_STORE
+ .p2align 4,, 8
+L(copy_1x):
+ /* Special case for copy 1x. It can be handled quickly and many
+ buffer sizes have convenient alignment. */
+ VMOVU %VMM(0), (%rdi)
+ /* If no zeros then we are done. */
+ testl %ecx, %ecx
+ jz L(ret_1x_1x)
+
+ /* Need to zfill. Note we know that length <= CHAR_PER_VEC so
+ we only handle the small case here. */
+ bsf %VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+ /* Adjust length / dst then just zfill less_vec. */
+ subq %rcx, %rdx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+L(zfill_less_vec):
+ cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
+ jb L(zfill_less_half)
+
+ VMOVU %VZERO_HALF, (%rdi)
+ VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ ret
+# ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+ leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+ ret
+# endif
+
+
+# if VEC_SIZE == 64
+ .p2align 4,, 4
+L(copy_32_63):
+ /* Overfill to avoid branches. */
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+ /* We are taking advantage of the fact that to be here we must
+ be writing null-term at (%rdi, %rcx), so we have a byte of
+ leeway for overwriting. */
+ cmpl %ecx, %edx
+ ja L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+# else
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# endif
+
+ .p2align 4,, 4
+L(copy_16_31):
+ /* Overfill to avoid branches. */
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpl %ecx, %edx
+
+ /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+ we have a larger copy block for 32-63 so this just falls
+ through to zfill 16-31. If VEC_SIZE == 32 then we check for
+ full zfill of less than 1x VEC. */
+# if VEC_SIZE == 64
+ jbe L(ret_16_31)
+ subl %ecx, %edx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_less_half):
+L(zfill_less_32):
+ cmpl $(16 / CHAR_SIZE), %edx
+ jb L(zfill_less_16)
+ VMOVU %VZERO_128, (%rdi)
+ VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ ret
+# endif
+L(ret_16_31):
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# else
+ /* VEC_SIZE == 32 begins. */
+ ja L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+# else
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# endif
+
+
+ .p2align 4,, 4
+L(copy_8_15):
+ /* Overfill to avoid branches. */
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+ vmovq %VMM_128(0), (%rdi)
+ movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpl %ecx, %edx
+ jbe L(ret_8_15)
+ subl %ecx, %edx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ .p2align 4,, 8
+# if VEC_SIZE == 32
+L(zfill_less_half):
+# endif
+L(zfill_less_16):
+ xorl %ecx, %ecx
+ cmpl $(8 / CHAR_SIZE), %edx
+ jb L(zfill_less_8)
+ movq %rcx, (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+ ret
+
+ .p2align 4,, 8
+L(less_1x_vec):
+ je L(copy_1x)
+
+ /* We will need `tzcnt` result for all other copy sizes. */
+ tzcnt %VRCX, %VRCX
+# if VEC_SIZE == 64
+ cmpl $(32 / CHAR_SIZE), %edx
+ jae L(copy_32_63)
+# endif
+
+ cmpl $(16 / CHAR_SIZE), %edx
+ jae L(copy_16_31)
+
+ cmpl $(8 / CHAR_SIZE), %edx
+ jae L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+ testl %ecx, %ecx
+ jz L(zfill_less_8_set_ret)
+
+ movl (%rsi, %rdx, CHAR_SIZE), %esi
+ vmovd %VMM_128(0), (%rdi)
+ movl %esi, (%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ cmpl %ecx, %edx
+L(ret_8_15):
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# endif
+ ret
+L(zfill_less_8_set_ret):
+ xorl %ecx, %ecx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_less_8):
+ movl %ecx, (%rdi)
+ movl %ecx, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# else
+ cmpl $3, %edx
+ jb L(copy_0_3)
+ /* Overfill to avoid branches. */
+ movl -3(%rsi, %rdx), %esi
+ vmovd %VMM_128(0), (%rdi)
+ movl %esi, -3(%rdi, %rdx)
+ cmpl %ecx, %edx
+ jbe L(ret_4_7)
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ xorl %ecx, %ecx
+ .p2align 4,, 8
+L(zfill_less_8):
+ cmpl $3, %edx
+ jb L(zfill_less_3)
+ movl %ecx, (%rdi)
+ movl %ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ ret
+# endif
+
+L(ret_4_7):
+# ifdef USE_AS_STPCPY
+L(ret_8_15):
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+ ret
+
+ .p2align 4,, 4
+L(zfill_less_3):
+ testl %edx, %edx
+ jz L(zfill_1)
+ movw %cx, (%rdi)
+L(zfill_1):
+ movb %cl, (%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_0_3):
+ vmovd %VMM_128(0), %r8d
+ testl %edx, %edx
+ jz L(copy_1)
+ movw %r8w, (%rdi)
+ cmpl %ecx, %edx
+ ja L(zfill_from_1)
+ movzbl (%rsi, %rdx), %r8d
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ adcq %rdi, %rax
+ movb %r8b, (%rdi, %rdx)
+ ret
+# endif
+
+L(copy_1):
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ cmpl %ecx, %edx
+ adcq %rdi, %rax
+# endif
+# ifdef USE_AS_WCSCPY
+ vmovd %VMM_128(0), (%rdi)
+# else
+ movb %r8b, (%rdi, %rdx)
+# endif
+ ret
+# endif
+
+
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 8
+L(zfill_from_1):
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rcx), %rax
+# endif
+ movw $0, -1(%rdi, %rdx)
+ ret
+# endif
+
+ .p2align 4,, 4
+L(zero_len):
+ incq %rdx
+ jne L(best_effort_strncpy)
+ movq %rdi, %rax
+ ret
+# endif
+
+
+ .p2align 4,, 4
+ .p2align 6,, 8
+L(page_cross):
+ movq %rsi, %rax
+ andq $(VEC_SIZE * -1), %rax
+ VPCMPEQ (%rax), %VZERO, %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ movl %esi, %r8d
+ shrl $2, %r8d
+ andl $(CHAR_PER_VEC - 1), %r8d
+ shrx %VR8, %VRCX, %VRCX
+# else
+ shrx %VRSI, %VRCX, %VRCX
+# endif
+
+ /* Compute amount of bytes we checked. */
+ subl %esi, %eax
+ andl $(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+ shrl $2, %eax
+# endif
+
+ /* If rax > rdx then we are finishing the copy at the end of the
+ page. */
+ cmpq %rax, %rdx
+ jb L(page_cross_small)
+
+
+ /* If rcx is zero (no zero-CHAR found) then continue. */
+ test %VRCX, %VRCX
+ jz L(page_cross_continue)
+
+ /* We found zero-CHAR so need to copy then zfill (we know we
+ didn't cover all of length here). */
+ bsf %VRCX, %VRCX
+L(movsb_and_zfill):
+ incl %ecx
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+ movq %rdi, %rax
+# endif
+
+ REP_MOVS
+# ifdef USE_AS_WCSCPY
+ movl $0, (%rdi)
+# else
+ movb $0, (%rdi)
+# endif
+ jmp L(zfill_from_page_cross)
+
+L(page_cross_small):
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(page_cross_copy_only)
+
+ /* Do a zfill of the tail before copying. */
+ movq %rdi, %r9
+ xorl %eax, %eax
+
+ movl %ecx, %r8d
+
+ subl %ecx, %edx
+ leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+ movl %edx, %ecx
+ REP_STOS
+ movq %r9, %rdi
+ movl %r8d, %edx
+L(page_cross_copy_only):
+ leal 1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcl $0, %edx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# else
+ movq %rdi, %rax
+# endif
+ REP_MOVS
+ ret
+
+
+L(best_effort_strncpy):
+ movq %rdx, %rcx
+ xorl %eax, %eax
+ movq %rdi, %r8
+ /* The length is >= 2^63. We very much expect to segfault at
+ rep stos. If that doesn't happen then just strcpy to finish.
+ */
+ REP_STOS
+ movq %r8, %rdi
+ jmp OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000..d4f4d6c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,80 @@
+/* Helper for getting proper name of overflow fallback function for
+ {wc|st}{p|r|s}n{cat|cpy}
+
+ Defines OVERFLOW_STRCPY and OVERFLOW_STRCAT (unless the includer
+ has already defined them) by token-pasting
+ UNDERSCORES + prefix + postfix + ISA_EXT. For example this yields
+ `__strcpy_evex` when USE_MULTIARCH && IS_IN(libc) holds and
+ USE_WITH_EVEX256 is defined, and plain `strcpy` when the
+ multiarch condition does not hold. USE_AS_WCSCPY selects the
+ wide-character names and USE_AS_STPCPY selects the `p` (stpcpy /
+ wcpcpy) copy variant.
+
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc) /* Names get a `__` prefix and an ISA suffix. */
+# define UNDERSCORES __
+# ifdef USE_WITH_SSE2
+# define ISA_EXT _sse2
+# elif defined USE_WITH_AVX2
+# ifdef USE_WITH_RTM
+# define ISA_EXT _avx2_rtm
+# else
+# define ISA_EXT _avx2
+# endif
+
+# elif defined USE_WITH_EVEX256
+# define ISA_EXT _evex
+# elif defined USE_WITH_EVEX512
+# define ISA_EXT _evex512
+# endif /* NOTE(review): ISA_EXT stays undefined here if no USE_WITH_* matched. */
+#else /* Otherwise: no prefix and no ISA suffix, i.e. the plain name. */
+# define UNDERSCORES
+# define ISA_EXT
+#endif /* USE_MULTIARCH && IS_IN(libc) */
+
+#ifdef USE_AS_WCSCPY /* Wide-char: wc + {pcpy|scpy} and wcs + cat. */
+# define STRCPY_PREFIX wc
+# define STRCAT_PREFIX wcs
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX scpy
+# endif
+#else /* Byte strings: st + {pcpy|rcpy} and str + cat. */
+# define STRCPY_PREFIX st
+# define STRCAT_PREFIX str
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX rcpy
+# endif
+#endif /* USE_AS_WCSCPY */
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
+ underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__) /* Extra level so the arguments are macro-expanded before `##` pastes them. */
+
+#ifndef OVERFLOW_STRCPY
+# define OVERFLOW_STRCPY \
+ OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+# define OVERFLOW_STRCAT \
+ OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif /* _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ */