author | Raoni Fassina Firmino <raoni@linux.ibm.com> | 2022-02-25 10:45:14 -0300 |
---|---|---|
committer | Raoni Fassina Firmino <raoni@linux.ibm.com> | 2022-02-25 10:45:14 -0300 |
commit | 821f0a9cb24486842f16f9b0d81d9c28e876c6ae | |
tree | 690876c3a706cb222b159e2ebff02d218d2fd1fb | |
parent | 889122cbfacedab8fdb62a9d3fd6244d528ea99c | |
parent | 178292fe011d5745aa0dd7ea99b388c30512fbdb | |
Merge branch release/2.30/master into ibm/2.30/master
119 files changed, 7797 insertions, 1432 deletions
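The bulk of this merge is the upstream handling of GLIBC_TUNABLES in AT_SECURE (setuid/setgid) processes, the x86 RTM/EVEX string-function work (bug 27457), and the wcsncmp/wmemchr length fixes (bug 28755). For orientation only, here is a minimal stand-alone sketch — not the glibc source, and using a made-up allow-list — of the in-place filtering idea that the parse_tunables () change in elf/dl-tunables.c below implements: allowed name=value pairs are copied back to the start of the tunables string, and everything else (SXID_ERASE tunables, unrecognized or malformed entries) is dropped.

```c
/* Minimal sketch of in-place tunables filtering.  The allow-list below is
   hypothetical; in glibc the decision comes from each tunable's
   security_level, not from a string table.  */
#include <stdio.h>
#include <string.h>

static const char *const kept[] = { "glibc.malloc.mmap_threshold",
                                    "glibc.malloc.perturb" };

static int
is_kept (const char *name, size_t len)
{
  for (size_t i = 0; i < sizeof (kept) / sizeof (kept[0]); i++)
    if (strlen (kept[i]) == len && memcmp (kept[i], name, len) == 0)
      return 1;
  return 0;
}

/* Rewrite TUNESTR in place, keeping only allowed NAME=VALUE pairs and
   truncating the rest.  */
static void
filter_tunables (char *tunestr)
{
  size_t off = 0;		/* Write position in TUNESTR.  */
  char *p = tunestr;

  while (*p != '\0')
    {
      char *eq = strchr (p, '=');
      char *end = strchr (p, ':');
      if (end == NULL)
	end = p + strlen (p);

      if (eq != NULL && eq < end && is_kept (p, (size_t) (eq - p)))
	{
	  if (off > 0)
	    tunestr[off++] = ':';
	  /* Source and destination overlap inside TUNESTR, so memmove.  */
	  memmove (tunestr + off, p, (size_t) (end - p));
	  off += (size_t) (end - p);
	}

      p = (*end == ':') ? end + 1 : end;
    }
  tunestr[off] = '\0';
}

int
main (void)
{
  char env[] = "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096";
  filter_tunables (env);
  /* Prints "glibc.malloc.mmap_threshold=4096".  */
  puts (env);
  return 0;
}
```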
@@ -73,6 +73,8 @@ The following bugs are resolved with this release: [25976] nss_compat: internal_end*ent may clobber errno, hiding ERANGE [27130] "rep movsb" performance issue [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work + [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts + [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex Version 2.30 diff --git a/elf/Makefile b/elf/Makefile index 63416b2..b0d0f60 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -1514,8 +1514,6 @@ $(objpfx)tst-nodelete-dlclose.out: $(objpfx)tst-nodelete-dlclose-dso.so \ tst-env-setuid-ENV = MALLOC_CHECK_=2 MALLOC_MMAP_THRESHOLD_=4096 \ LD_HWCAP_MASK=0x1 -tst-env-setuid-tunables-ENV = \ - GLIBC_TUNABLES=glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096 $(objpfx)tst-debug1: $(libdl) $(objpfx)tst-debug1.out: $(objpfx)tst-debug1mod1.so diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c index b0980c5..a8425dd 100644 --- a/elf/dl-tunables.c +++ b/elf/dl-tunables.c @@ -178,6 +178,7 @@ parse_tunables (char *tunestr, char *valstring) return; char *p = tunestr; + size_t off = 0; while (true) { @@ -191,7 +192,11 @@ parse_tunables (char *tunestr, char *valstring) /* If we reach the end of the string before getting a valid name-value pair, bail out. */ if (p[len] == '\0') - return; + { + if (__libc_enable_secure) + tunestr[off] = '\0'; + return; + } /* We did not find a valid name-value pair before encountering the colon. */ @@ -217,35 +222,28 @@ parse_tunables (char *tunestr, char *valstring) if (tunable_is_name (cur->name, name)) { - /* If we are in a secure context (AT_SECURE) then ignore the tunable - unless it is explicitly marked as secure. Tunable values take - precendence over their envvar aliases. */ + /* If we are in a secure context (AT_SECURE) then ignore the + tunable unless it is explicitly marked as secure. Tunable + values take precedence over their envvar aliases. We write + the tunables that are not SXID_ERASE back to TUNESTR, thus + dropping all SXID_ERASE tunables and any invalid or + unrecognized tunables. */ if (__libc_enable_secure) { - if (cur->security_level == TUNABLE_SECLEVEL_SXID_ERASE) + if (cur->security_level != TUNABLE_SECLEVEL_SXID_ERASE) { - if (p[len] == '\0') - { - /* Last tunable in the valstring. Null-terminate and - return. */ - *name = '\0'; - return; - } - else - { - /* Remove the current tunable from the string. We do - this by overwriting the string starting from NAME - (which is where the current tunable begins) with - the remainder of the string. We then have P point - to NAME so that we continue in the correct - position in the valstring. 
*/ - char *q = &p[len + 1]; - p = name; - while (*q != '\0') - *name++ = *q++; - name[0] = '\0'; - len = 0; - } + if (off > 0) + tunestr[off++] = ':'; + + const char *n = cur->name; + + while (*n != '\0') + tunestr[off++] = *n++; + + tunestr[off++] = '='; + + for (size_t j = 0; j < len; j++) + tunestr[off++] = value[j]; } if (cur->security_level != TUNABLE_SECLEVEL_NONE) @@ -258,9 +256,7 @@ parse_tunables (char *tunestr, char *valstring) } } - if (p[len] == '\0') - return; - else + if (p[len] != '\0') p += len + 1; } } diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index e92e9f5..0e9aede 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -25,35 +25,76 @@ #include "config.h" #undef _LIBC -#define test_parent test_parent_tunables -#define test_child test_child_tunables - -static int test_child_tunables (void); -static int test_parent_tunables (void); - -#include "tst-env-setuid.c" - -#define CHILD_VALSTRING_VALUE "glibc.malloc.mmap_threshold=4096" -#define PARENT_VALSTRING_VALUE \ - "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096" +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <unistd.h> +#include <intprops.h> +#include <array_length.h> + +#include <support/check.h> +#include <support/support.h> +#include <support/test-driver.h> +#include <support/capture_subprocess.h> + +const char *teststrings[] = +{ + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096", + "not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096", + ":glibc.malloc.garbage=2:glibc.malloc.check=1", + "glibc.malloc.check=1:glibc.malloc.check=2", + "not_valid.malloc.check=2", + "glibc.not_valid.check=2", +}; + +const char *resultstrings[] = +{ + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "", + "", + "", + "", + "", + "", +}; static int -test_child_tunables (void) +test_child (int off) { const char *val = getenv ("GLIBC_TUNABLES"); #if HAVE_TUNABLES - if (val != NULL && strcmp (val, CHILD_VALSTRING_VALUE) == 0) + if (val != NULL && strcmp (val, resultstrings[off]) == 0) return 0; if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val); return 1; #else if (val != NULL) { - printf ("GLIBC_TUNABLES not cleared\n"); + printf ("[%d] GLIBC_TUNABLES not cleared\n", off); return 1; } return 0; @@ -61,15 +102,48 @@ test_child_tunables (void) } static int -test_parent_tunables (void) +do_test (int argc, char **argv) { - const char *val = getenv ("GLIBC_TUNABLES"); + /* Setgid child process. 
*/ + if (argc == 2) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); - if (val != NULL && strcmp (val, PARENT_VALSTRING_VALUE) == 0) - return 0; + int ret = test_child (atoi (argv[1])); - if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + if (ret != 0) + exit (1); - return 1; + exit (EXIT_SUCCESS); + } + else + { + int ret = 0; + + /* Spawn tests. */ + for (int i = 0; i < array_length (teststrings); i++) + { + char buf[INT_BUFSIZE_BOUND (int)]; + + printf ("Spawned test for %s (%d)\n", teststrings[i], i); + snprintf (buf, sizeof (buf), "%d\n", i); + if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0) + exit (1); + + int status = support_capture_subprogram_self_sgid (buf); + + /* Bail out early if unsupported. */ + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + ret |= status; + } + return ret; + } } + +#define TEST_FUNCTION_ARGV do_test +#include <support/test-driver.c> diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index a080d2f..2350976 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -29,173 +29,12 @@ #include <sys/wait.h> #include <unistd.h> +#include <support/check.h> #include <support/support.h> #include <support/test-driver.h> +#include <support/capture_subprocess.h> static char SETGID_CHILD[] = "setgid-child"; -#define CHILD_STATUS 42 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. */ -static gid_t -choose_gid (void) -{ - const int count = 64; - gid_t groups[count]; - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - return groups[i]; - } - return 0; -} - -/* Spawn and execute a program and verify that it returns the CHILD_STATUS. */ -static pid_t -do_execve (char **args) -{ - pid_t kid = vfork (); - - if (kid < 0) - { - printf ("vfork: %m\n"); - return -1; - } - - if (kid == 0) - { - /* Child process. */ - execve (args[0], args, environ); - _exit (-errno); - } - - if (kid < 0) - return 1; - - int status; - - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - return 1; - } - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status) || WEXITSTATUS (status) != CHILD_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - WEXITSTATUS (status)); - return 1; - } - return 0; -} - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. 
*/ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 0; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - char *args[] = {execname, SETGID_CHILD, NULL}; - - ret = do_execve (args); - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} #ifndef test_child static int @@ -256,40 +95,32 @@ do_test (int argc, char **argv) if (argc == 2 && strcmp (argv[1], SETGID_CHILD) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (EXIT_UNSUPPORTED); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); int ret = test_child (); if (ret != 0) exit (1); - exit (CHILD_STATUS); + exit (EXIT_SUCCESS); } else { if (test_parent () != 0) exit (1); - /* Try running a setgid program. */ - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } + int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - return run_executable_sgid (target); - } + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); - /* Something went wrong and our argv was corrupted. */ - _exit (1); + return 0; + } } #define TEST_FUNCTION_ARGV do_test diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index 94de199..ab793ae 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -30,167 +30,12 @@ #include <sys/wait.h> #include <unistd.h> +#include <support/check.h> #include <support/support.h> +#include <support/capture_subprocess.h> #include <support/test-driver.h> static char MAGIC_ARGUMENT[] = "run-actual-test"; -#define MAGIC_STATUS 19 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. 
*/ -static gid_t -choose_gid (void) -{ - int count = getgroups (0, NULL); - if (count < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t *groups; - groups = xcalloc (count, sizeof (*groups)); - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - gid_t not_current = 0; - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - { - not_current = groups[i]; - break; - } - } - free (groups); - return not_current; -} - - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. */ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/secure-getenv.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = -1; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - int kid = fork (); - if (kid < 0) - { - printf ("fork: %m\n"); - goto err; - } - if (kid == 0) - { - /* Child process. 
*/ - char *args[] = { execname, MAGIC_ARGUMENT, NULL }; - execve (execname, args, environ); - printf ("execve (%s): %m\n", execname); - _exit (1); - } - int status; - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - goto err; - } - if (!WIFEXITED (status) || WEXITSTATUS (status) != MAGIC_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - status); - goto err; - } - ret = 0; - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} static int do_test (void) @@ -212,15 +57,15 @@ do_test (void) exit (1); } - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } - return run_executable_sgid (target); + int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); + + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + + return 0; } static void @@ -229,23 +74,15 @@ alternative_main (int argc, char **argv) if (argc == 2 && strcmp (argv[1], MAGIC_ARGUMENT) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (MAGIC_STATUS); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); if (getenv ("PATH") == NULL) - { - printf ("PATH variable not present\n"); - exit (3); - } + FAIL_EXIT (3, "PATH variable not present\n"); if (secure_getenv ("PATH") != NULL) - { - printf ("PATH variable not filtered out\n"); - exit (4); - } - exit (MAGIC_STATUS); + FAIL_EXIT (4, "PATH variable not filtered out\n"); + + exit (EXIT_SUCCESS); } } diff --git a/string/test-strnlen.c b/string/test-strnlen.c index 8c75338..f7d0896 100644 --- a/string/test-strnlen.c +++ b/string/test-strnlen.c @@ -27,6 +27,7 @@ #ifndef WIDE # define STRNLEN strnlen +# define MEMSET memset # define CHAR char # define BIG_CHAR CHAR_MAX # define MIDDLE_CHAR 127 @@ -34,6 +35,7 @@ #else # include <wchar.h> # define STRNLEN wcsnlen +# define MEMSET wmemset # define CHAR wchar_t # define BIG_CHAR WCHAR_MAX # define MIDDLE_CHAR 1121 @@ -153,7 +155,7 @@ do_page_tests (void) size_t last_offset = (page_size / sizeof (CHAR)) - 1; CHAR *s = (CHAR *) buf2; - memset (s, 65, (last_offset - 1)); + MEMSET (s, 65, (last_offset - 1)); s[last_offset] = 0; /* Place short strings ending at page boundary. */ @@ -196,6 +198,35 @@ do_page_tests (void) } } +/* Tests meant to unveil fail on implementations that access bytes + beyond the maxium length. */ + +static void +do_page_2_tests (void) +{ + size_t i, exp_len, offset; + size_t last_offset = page_size / sizeof (CHAR); + + CHAR *s = (CHAR *) buf2; + MEMSET (s, 65, last_offset); + + /* Place short strings ending at page boundary without the null + byte. */ + offset = last_offset; + for (i = 0; i < 128; i++) + { + /* Decrease offset to stress several sizes and alignments. */ + offset--; + exp_len = last_offset - offset; + FOR_EACH_IMPL (impl, 0) + { + /* If an implementation goes beyond EXP_LEN, it will trigger + the segfault. 
*/ + do_one_test (impl, (CHAR *) (s + offset), exp_len, exp_len); + } + } +} + int test_main (void) { @@ -242,6 +273,7 @@ test_main (void) do_random_tests (); do_page_tests (); + do_page_2_tests (); return ret; } diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h index 2832cfc..e42a84e 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -41,6 +41,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[]); +/* Copy the running program into a setgid binary and run it with CHILD_ID + argument. If execution is successful, return the exit status of the child + program, otherwise return a non-zero failure exit code. */ +int support_capture_subprogram_self_sgid + (char *child_id); + /* Deallocate the subprocess data captured by support_capture_subprocess. */ void support_capture_subprocess_free (struct support_capture_subprocess *); diff --git a/support/subprocess.h b/support/subprocess.h index c031878d..a19335e 100644 --- a/support/subprocess.h +++ b/support/subprocess.h @@ -38,6 +38,11 @@ struct support_subprocess support_subprocess struct support_subprocess support_subprogram (const char *file, char *const argv[]); +/* Invoke program FILE with ARGV arguments by using posix_spawn and wait for it + to complete. Return program exit status. */ +int support_subprogram_wait + (const char *file, char *const argv[]); + /* Wait for the subprocess indicated by PROC::PID. Return the status indicate by waitpid call. */ int support_process_wait (struct support_subprocess *proc); diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index 948ce5a..492b76d 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -20,11 +20,14 @@ #include <support/capture_subprocess.h> #include <errno.h> +#include <fcntl.h> #include <stdlib.h> #include <support/check.h> #include <support/xunistd.h> #include <support/xsocket.h> #include <support/xspawn.h> +#include <support/support.h> +#include <support/test-driver.h> static void transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) @@ -36,7 +39,7 @@ transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) if (ret < 0) { support_record_failure (); - printf ("error: reading from subprocess %s: %m", what); + printf ("error: reading from subprocess %s: %m\n", what); pfd->events = 0; pfd->revents = 0; } @@ -102,6 +105,129 @@ support_capture_subprogram (const char *file, char *const argv[]) return result; } +/* Copies the executable into a restricted directory, so that we can + safely make it SGID with the TARGET group ID. Then runs the + executable. 
*/ +static int +copy_and_spawn_sgid (char *child_id, gid_t gid) +{ + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", + test_dir, (intmax_t) getpid ()); + char *execname = xasprintf ("%s/bin", dirname); + int infd = -1; + int outfd = -1; + int ret = 1, status = 1; + + TEST_VERIFY (mkdir (dirname, 0700) == 0); + if (support_record_failure_is_failed ()) + goto err; + + infd = open ("/proc/self/exe", O_RDONLY); + if (infd < 0) + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); + + outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); + TEST_VERIFY (outfd >= 0); + if (support_record_failure_is_failed ()) + goto err; + + char buf[4096]; + for (;;) + { + ssize_t rdcount = read (infd, buf, sizeof (buf)); + TEST_VERIFY (rdcount >= 0); + if (support_record_failure_is_failed ()) + goto err; + if (rdcount == 0) + break; + char *p = buf; + char *end = buf + rdcount; + while (p != end) + { + ssize_t wrcount = write (outfd, buf, end - p); + if (wrcount == 0) + errno = ENOSPC; + TEST_VERIFY (wrcount > 0); + if (support_record_failure_is_failed ()) + goto err; + p += wrcount; + } + } + TEST_VERIFY (fchown (outfd, getuid (), gid) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (fchmod (outfd, 02750) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (outfd) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (infd) == 0); + if (support_record_failure_is_failed ()) + goto err; + + /* We have the binary, now spawn the subprocess. Avoid using + support_subprogram because we only want the program exit status, not the + contents. */ + ret = 0; + + char * const args[] = {execname, child_id, NULL}; + + status = support_subprogram_wait (args[0], args); + +err: + if (outfd >= 0) + close (outfd); + if (infd >= 0) + close (infd); + if (execname != NULL) + { + unlink (execname); + free (execname); + } + if (dirname != NULL) + { + rmdir (dirname); + free (dirname); + } + + if (ret != 0) + FAIL_EXIT1("Failed to make sgid executable for test\n"); + + return status; +} + +int +support_capture_subprogram_self_sgid (char *child_id) +{ + gid_t target = 0; + const int count = 64; + gid_t groups[count]; + + /* Get a GID which is not our current GID, but is present in the + supplementary group list. 
*/ + int ret = getgroups (count, groups); + if (ret < 0) + FAIL_UNSUPPORTED("Could not get group list for user %jd\n", + (intmax_t) getuid ()); + + gid_t current = getgid (); + for (int i = 0; i < ret; ++i) + { + if (groups[i] != current) + { + target = groups[i]; + break; + } + } + + if (target == 0) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + + return copy_and_spawn_sgid (child_id, target); +} + void support_capture_subprocess_free (struct support_capture_subprocess *p) { diff --git a/support/support_subprocess.c b/support/support_subprocess.c index 0c8cc6a..97e481e 100644 --- a/support/support_subprocess.c +++ b/support/support_subprocess.c @@ -27,7 +27,7 @@ #include <support/subprocess.h> static struct support_subprocess -support_suprocess_init (void) +support_subprocess_init (void) { struct support_subprocess result; @@ -48,7 +48,7 @@ support_suprocess_init (void) struct support_subprocess support_subprocess (void (*callback) (void *), void *closure) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); result.pid = xfork (); if (result.pid == 0) @@ -71,7 +71,7 @@ support_subprocess (void (*callback) (void *), void *closure) struct support_subprocess support_subprogram (const char *file, char *const argv[]) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); posix_spawn_file_actions_t fa; /* posix_spawn_file_actions_init does not fail. */ @@ -84,7 +84,7 @@ support_subprogram (const char *file, char *const argv[]) xposix_spawn_file_actions_addclose (&fa, result.stdout_pipe[1]); xposix_spawn_file_actions_addclose (&fa, result.stderr_pipe[1]); - result.pid = xposix_spawn (file, &fa, NULL, argv, NULL); + result.pid = xposix_spawn (file, &fa, NULL, argv, environ); xclose (result.stdout_pipe[1]); xclose (result.stderr_pipe[1]); @@ -93,6 +93,19 @@ support_subprogram (const char *file, char *const argv[]) } int +support_subprogram_wait (const char *file, char *const argv[]) +{ + posix_spawn_file_actions_t fa; + + posix_spawn_file_actions_init (&fa); + struct support_subprocess res = support_subprocess_init (); + + res.pid = xposix_spawn (file, &fa, NULL, argv, environ); + + return support_process_wait (&res); +} + +int support_process_wait (struct support_subprocess *proc) { xclose (proc->stdout_pipe[0]); diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure index fa46e9e..e7f5763 100644 --- a/sysdeps/s390/configure +++ b/sysdeps/s390/configure @@ -123,7 +123,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode \"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c @@ -271,7 +273,9 @@ else void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac index 3ed5a8e..5c3479e 100644 --- a/sysdeps/s390/configure.ac +++ b/sysdeps/s390/configure.ac @@ -88,7 +88,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode 
\"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 instructions @@ -195,7 +197,9 @@ cat > conftest.c <<\EOF void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 zarch instructions as default diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c index fb6b69a..dc27b1d 100644 --- a/sysdeps/s390/memmove.c +++ b/sysdeps/s390/memmove.c @@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden; s390_libc_ifunc_expr (__redirect_memmove, memmove, ({ s390_libc_ifunc_expr_stfle_init (); - (HAVE_MEMMOVE_ARCH13 + (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2) && S390_IS_ARCH13_MIE3 (stfle_bits)) ? MEMMOVE_ARCH13 : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX)) diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c index 1948436..6d05652 100644 --- a/sysdeps/s390/multiarch/ifunc-impl-list.c +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c @@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memmove, # if HAVE_MEMMOVE_ARCH13 IFUNC_IMPL_ADD (array, i, memmove, - S390_IS_ARCH13_MIE3 (stfle_bits), + ((dl_hwcap & HWCAP_S390_VXRS_EXT2) + && S390_IS_ARCH13_MIE3 (stfle_bits)), MEMMOVE_ARCH13) # endif # if HAVE_MEMMOVE_Z13 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index a5112ef..a93139b 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -20,6 +20,33 @@ endif endif endif +ifeq ($(subdir),string) +sysdep_routines += cacheinfo + +tests += \ + tst-memchr-rtm \ + tst-memcmp-rtm \ + tst-memmove-rtm \ + tst-memrchr-rtm \ + tst-memset-rtm \ + tst-strchr-rtm \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ + tst-strrchr-rtm + +CFLAGS-tst-memchr-rtm.c += -mrtm +CFLAGS-tst-memcmp-rtm.c += -mrtm +CFLAGS-tst-memmove-rtm.c += -mrtm +CFLAGS-tst-memrchr-rtm.c += -mrtm +CFLAGS-tst-memset-rtm.c += -mrtm +CFLAGS-tst-strchr-rtm.c += -mrtm +CFLAGS-tst-strcpy-rtm.c += -mrtm +CFLAGS-tst-strlen-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strrchr-rtm.c += -mrtm +endif + ifeq ($(enable-cet),yes) ifeq ($(subdir),elf) sysdep-dl-routines += dl-cet diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 4bab154..a4d1eac 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -424,8 +424,24 @@ init_cpu_features (struct cpu_features *cpu_features) cpu_features->feature[index_arch_Prefer_No_VZEROUPPER] |= bit_arch_Prefer_No_VZEROUPPER; else - cpu_features->feature[index_arch_Prefer_No_AVX512] - |= bit_arch_Prefer_No_AVX512; + { + cpu_features->feature[index_arch_Prefer_No_AVX512] + |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + cpu_features->feature[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + + /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp + requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp + requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, + AVX2 strcmp is faster than EVEX strcmp. 
*/ + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + cpu_features->feature[index_arch_Prefer_AVX2_STRCMP] + |= bit_arch_Prefer_AVX2_STRCMP; + } } /* This spells out "AuthenticAMD" or "HygonGenuine". */ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index 03a9b2a..ca2924b 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -897,6 +897,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_arch_Prefer_FSRM (1u << 13) #define bit_arch_Prefer_No_AVX512 (1u << 14) #define bit_arch_MathVec_Prefer_No_AVX512 (1u << 15) +#define bit_arch_Prefer_AVX2_STRCMP (1u << 16) #define index_arch_Fast_Rep_String FEATURE_INDEX_2 #define index_arch_Fast_Copy_Backward FEATURE_INDEX_2 @@ -914,6 +915,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_arch_Prefer_No_AVX512 FEATURE_INDEX_2 #define index_arch_MathVec_Prefer_No_AVX512 FEATURE_INDEX_2 #define index_arch_Prefer_FSRM FEATURE_INDEX_2 +#define index_arch_Prefer_AVX2_STRCMP FEATURE_INDEX_2 /* XCR0 Feature flags. */ #define bit_XMM_state (1u << 1) diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index 2cb3151..d4d5e45 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -282,6 +282,9 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) CHECK_GLIBC_IFUNC_ARCH_BOTH (n, cpu_features, Fast_Copy_Backward, disable, 18); + CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH + (n, cpu_features, Prefer_AVX2_STRCMP, AVX2_Usable, + disable, 18); } break; case 19: diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c new file mode 100644 index 0000000..e474940 --- /dev/null +++ b/sysdeps/x86/tst-memchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c new file mode 100644 index 0000000..e4c8a62 --- /dev/null +++ b/sysdeps/x86/tst-memcmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for memcmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + memset (string2, 'a', STRING_SIZE); + if (memcmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memcmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memcmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c new file mode 100644 index 0000000..4bf97ef --- /dev/null +++ b/sysdeps/x86/tst-memmove-rtm.c @@ -0,0 +1,53 @@ +/* Test case for memmove inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memmove", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c new file mode 100644 index 0000000..a57a5a8 --- /dev/null +++ b/sysdeps/x86/tst-memrchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c new file mode 100644 index 0000000..bf343a4 --- /dev/null +++ b/sysdeps/x86/tst-memset-rtm.c @@ -0,0 +1,45 @@ +/* Test case for memset inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + return EXIT_SUCCESS; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + memset (string1, 'a', STRING_SIZE); + return 0; +} + +static int +do_test (void) +{ + return do_test_1 ("memset", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c new file mode 100644 index 0000000..a82e29c --- /dev/null +++ b/sysdeps/x86/tst-strchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for strchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c new file mode 100644 index 0000000..2b2a583 --- /dev/null +++ b/sysdeps/x86/tst-strcpy-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strcpy inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strcpy", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h new file mode 100644 index 0000000..6ed9eca --- /dev/null +++ b/sysdeps/x86/tst-string-rtm.h @@ -0,0 +1,72 @@ +/* Test string function in a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <string.h> +#include <x86intrin.h> +#include <cpu-features.h> +#include <support/check.h> +#include <support/test-driver.h> + +static int +do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), + int (*function) (void)) +{ + if (!CPU_FEATURE_USABLE (RTM)) + return EXIT_UNSUPPORTED; + + int status = prepare (); + if (status != EXIT_SUCCESS) + return status; + + unsigned int i; + unsigned int naborts = 0; + unsigned int failed = 0; + for (i = 0; i < loop; i++) + { + failed |= function (); + if (_xbegin() == _XBEGIN_STARTED) + { + failed |= function (); + _xend(); + } + else + { + failed |= function (); + ++naborts; + } + } + + if (failed) + FAIL_EXIT1 ("%s() failed", name); + + if (naborts) + { + /* NB: Low single digit (<= 5%) noise-level aborts are normal for + TSX. */ + double rate = 100 * ((double) naborts) / ((double) loop); + if (rate > 5) + FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", + rate, naborts, loop); + } + + return EXIT_SUCCESS; +} + +static int do_test (void); + +#include <support/test-driver.c> diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c new file mode 100644 index 0000000..0dcf14db --- /dev/null +++ b/sysdeps/x86/tst-strlen-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strlen inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = '\0'; + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strlen", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c new file mode 100644 index 0000000..236ad95 --- /dev/null +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for strncmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + memset (string2, 'a', STRING_SIZE - 1); + if (strncmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strncmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strncmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c new file mode 100644 index 0000000..e32bfaf --- /dev/null +++ b/sysdeps/x86/tst-strrchr-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = 'c'; + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index ec96365..f2217b2 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -21,9 +21,11 @@ #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 #else # define MEMCHR memchr # define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ @@ -33,15 +35,14 @@ ENTRY(MEMCHR) movd %esi, %xmm1 mov %edi, %ecx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif #ifdef USE_AS_WMEMCHR test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %RDX_LP #else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif punpcklbw %xmm1, %xmm1 test %RDX_LP, %RDX_LP jz L(return_null) @@ -60,13 +61,16 @@ ENTRY(MEMCHR) test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $CHAR_PER_VEC, %rdx jbe L(return_null) add $16, %rdi and $15, %ecx and $-16, %rdi +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) jmp L(loop_prolog) @@ -77,16 +81,21 @@ L(crosscache): movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 -/* Check if there is a match. */ + /* Check if there is a match. */ pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ + /* Remove the leading bytes. */ sar %cl, %eax test %eax, %eax je L(unaligned_no_match) -/* Check which byte is a match. */ + /* Check which byte is a match. */ bsf %eax, %eax - +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax add %rcx, %rax @@ -94,15 +103,18 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void possible addition overflow. 
*/ neg %rcx add $16, %rcx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif sub %rcx, %rdx jbe L(return_null) add $16, %rdi - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) .p2align 4 @@ -135,7 +147,7 @@ L(loop_prolog): test $0x3f, %rdi jz L(align64_loop) - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 @@ -167,11 +179,14 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi and $63, %ecx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx .p2align 4 L(align64_loop): - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 @@ -218,7 +233,7 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 @@ -238,7 +253,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jle L(return_null) PCMPEQ 48(%rdi), %xmm1 @@ -250,13 +265,13 @@ L(exit_loop): .p2align 4 L(exit_loop_32): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jbe L(return_null) PCMPEQ 16(%rdi), %xmm1 @@ -293,7 +308,13 @@ L(matches32): .p2align 4 L(matches_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax ret @@ -301,7 +322,13 @@ L(matches_1): .p2align 4 L(matches16_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 16(%rdi, %rax), %rax ret @@ -309,7 +336,13 @@ L(matches16_1): .p2align 4 L(matches32_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 32(%rdi, %rax), %rax ret @@ -317,7 +350,13 @@ L(matches32_1): .p2align 4 L(matches48_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 48(%rdi, %rax), %rax ret diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 395e432..da1446d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -43,7 +43,45 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memmove-avx512-unaligned-erms \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ - memset-avx512-unaligned-erms + memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + stpcpy-avx2-rtm \ + stpncpy-avx2-rtm \ + strcat-avx2-rtm \ + strcpy-avx2-rtm \ + strlen-avx2-rtm \ + strncat-avx2-rtm \ + strncmp-avx2-rtm \ + strncpy-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ + strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ + strcpy-evex \ + strlen-evex \ + strncat-evex \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ + strrchr-evex CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 @@ -59,8 +97,24 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcscpy-ssse3 wcscpy-c \ wcschr-sse2 
wcschr-avx2 \ wcsrchr-sse2 wcsrchr-avx2 \ - wcsnlen-sse4_1 wcsnlen-c \ - wcslen-sse2 wcslen-avx2 wcsnlen-avx2 + wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ + wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ + wmemcmp-evex-movbe endif ifeq ($(subdir),debug) diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index a70b62d..12cff84 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,16 +21,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 39a45d0..e57fb42 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -43,6 +43,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memchr, HAS_ARCH_FEATURE (AVX2_Usable), __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __memchr_evex) IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. 
*/ @@ -51,6 +60,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), @@ -64,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memmove_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -76,6 +95,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, @@ -98,13 +131,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memmove, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memmove, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3_back) @@ -121,6 +168,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memrchr, HAS_ARCH_FEATURE (AVX2_Usable), __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memrchr_evex) + IFUNC_IMPL_ADD (array, i, memrchr, 1, 
__memrchr_sse2)) #ifdef SHARED @@ -139,10 +195,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __memset_chk_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), @@ -164,10 +238,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __memset_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX512F_Usable), @@ -179,20 +271,51 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __rawmemchr_evex) IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strlen_evex) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ IFUNC_IMPL (i, name, strnlen, IFUNC_IMPL_ADD (array, i, strnlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strnlen_evex) IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ @@ -201,6 +324,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable), __stpncpy_avx2) + IFUNC_IMPL_ADD (array, i, stpncpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __stpncpy_evex) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) @@ -211,6 +342,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable), __stpcpy_avx2) + IFUNC_IMPL_ADD (array, i, stpcpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __stpcpy_evex) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) @@ -245,6 +384,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable), __strcat_avx2) + IFUNC_IMPL_ADD (array, i, strcat, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strcat_evex) IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) @@ -255,6 +402,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchr, HAS_ARCH_FEATURE (AVX2_Usable), __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strchr_evex) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) @@ -263,6 +419,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchrnul, HAS_ARCH_FEATURE (AVX2_Usable), __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strchrnul_evex) IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) /* Support sysdeps/x86_64/multiarch/strrchr.c. 
*/ @@ -270,6 +435,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strrchr, HAS_ARCH_FEATURE (AVX2_Usable), __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strrchr_evex) IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) /* Support sysdeps/x86_64/multiarch/strcmp.c. */ @@ -277,6 +450,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, HAS_ARCH_FEATURE (AVX2_Usable), __strcmp_avx2) + IFUNC_IMPL_ADD (array, i, strcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), @@ -288,6 +470,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable), __strcpy_avx2) + IFUNC_IMPL_ADD (array, i, strcpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strcpy_evex) IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) @@ -331,6 +521,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable), __strncat_avx2) + IFUNC_IMPL_ADD (array, i, strncat, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncat_evex) IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, @@ -341,6 +539,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable), __strncpy_avx2) + IFUNC_IMPL_ADD (array, i, strncpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncpy_evex) IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, @@ -370,6 +576,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcschr, HAS_ARCH_FEATURE (AVX2_Usable), __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcschr_evex) IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) /* Support sysdeps/x86_64/multiarch/wcsrchr.c. 
*/ @@ -377,6 +592,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_ARCH_FEATURE (AVX2_Usable), __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsrchr_evex) IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ @@ -384,6 +608,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcscmp_evex) IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ @@ -391,6 +624,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsncmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsncmp_evex) IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ @@ -402,16 +644,41 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcslen.c. */ IFUNC_IMPL (i, name, wcslen, IFUNC_IMPL_ADD (array, i, wcslen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ IFUNC_IMPL (i, name, wcsnlen, IFUNC_IMPL_ADD (array, i, wcsnlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __wcsnlen_avx2) IFUNC_IMPL_ADD (array, i, wcsnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, HAS_CPU_FEATURE (SSE4_1), __wcsnlen_sse4_1) IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) @@ -421,6 +688,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wmemchr_evex) IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ @@ -429,6 +705,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), @@ -443,7 +729,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __wmemset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, wmemset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512VL_Usable), __wmemset_avx512_unaligned)) #ifdef SHARED @@ -453,10 +746,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -465,6 +758,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + 
__memcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, @@ -486,6 +793,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), @@ -494,10 +815,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, 1, @@ -511,10 +832,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -523,6 +844,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, @@ -542,10 +877,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, mempcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), @@ -553,6 +888,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned_erms) + 
IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), @@ -568,6 +917,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, HAS_ARCH_FEATURE (AVX2_Usable), __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), __strncmp_sse42) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), @@ -583,6 +940,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __wmemset_chk_avx2_unaligned) IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __wmemset_chk_avx512_unaligned)) #endif diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 945529f..963d327 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,17 +23,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_CPU_P (cpu_features, MOVBE) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2_movbe); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex_movbe); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 9d63ee5..01fc6b9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof 
(REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -48,21 +56,42 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx_unaligned_erms); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms); - return OPTIMIZE (avx_unaligned); + return OPTIMIZE (avx_unaligned); + } } if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index c98d757..198c8c6 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -45,21 +53,44 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURES_ARCH_P 
(cpu_features, AVX2_Usable)) { - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx2_unaligned_erms); - else - return OPTIMIZE (avx2_unaligned); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } } if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index bc6a70f..b05d119 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h new file mode 100644 index 0000000..564cc8c --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h @@ -0,0 +1,52 @@ +/* Common definition for ifunc selections for wcslen and wcsnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index f480b82..2913a36 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,9 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; static inline void * @@ -27,14 +30,21 @@ IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) - && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) - return OPTIMIZE (avx512_unaligned); - else + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000..87b076c --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index cfec165..9ddeab2 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -26,319 +26,407 @@ # ifdef USE_AS_WMEMCHR # define VPCMPEQ vpcmpeqd +# define VPBROADCAST vpbroadcastd +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb +# define VPBROADCAST vpbroadcastb +# define CHAR_SIZE 
1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define ERAW_PTR_REG ecx +# define RRAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define ERAW_PTR_REG edi +# define RRAW_PTR_REG rdi +# define ALGN_PTR_REG rcx # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else test %RDX_LP, %RDX_LP +# endif jz L(null) # endif - movl %edi, %ecx - /* Broadcast CHAR to YMM0. */ + /* Broadcast CHAR to YMMMATCH. */ vmovd %esi, %xmm0 -# ifdef USE_AS_WMEMCHR - shl $2, %RDX_LP - vpbroadcastd %xmm0, %ymm0 -# else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif - vpbroadcastb %xmm0, %ymm0 -# endif + VPBROADCAST %xmm0, %ymm0 /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 + VPCMPEQ (%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - # ifndef USE_AS_RAWMEMCHR - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rdx - jbe L(zero) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + addq %rdi, %rax + VZEROUPPER_RETURN # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ - addq %rcx, %rdx + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax + cmovle %rcx, %rax + VZEROUPPER_RETURN - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) +L(null): + xorl %eax, %eax + ret # endif - jmp L(more_4x_vec) - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr + and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) - tzcntl %eax, %eax + sarxl %ERAW_PTR_REG, %eax, %eax # ifndef USE_AS_RAWMEMCHR /* Check the end of data. 
*/ - cmpq %rax, %rdx - jbe L(zero) + cmpq %rsi, %rdx + jbe L(first_vec_x0) # endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax + addq %RRAW_PTR_REG, %rax +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + incq %rdi addq %rdi, %rax - addq %rcx, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): -# ifndef USE_AS_RAWMEMCHR - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition - overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx +L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - /* Check the end of data. */ - subq %rcx, %rdx - jbe L(zero) -# endif + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 4 +L(aligned_more): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +# ifndef USE_AS_RAWMEMCHR +L(cross_page_continue): + /* Align data to VEC_SIZE - 1. */ + xorl %ecx, %ecx + subl %edi, %ecx + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. */ + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif addq %rcx, %rdx +# else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. 
*/ - VPCMPEQ (%rdi), %ymm0, %ymm1 - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 - + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 vpor %ymm1, %ymm2, %ymm5 vpor %ymm3, %ymm4, %ymm6 vpor %ymm5, %ymm6, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - + vpmovmskb %ymm5, %ecx # ifdef USE_AS_RAWMEMCHR - jmp L(loop_4x_vec) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) # else - subq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec) + testl %ecx, %ecx + jnz L(loop_4x_vec_end) -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %edx - jle L(last_2x_vec) + subq $-(VEC_SIZE * 4), %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + /* Fall through into less than 4 remaining vectors of length + case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax + .p2align 4 +L(last_4x_vec_or_less): +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + /* Check if first VEC contained match. */ testl %eax, %eax - jnz L(first_vec_x1) + jnz L(first_vec_x1_check) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax + /* If remaining length > VEC_SIZE * 2. */ + addl $(VEC_SIZE * 2), %edx + jg L(last_4x_vec) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %edx - jle L(zero) +L(last_2x_vec): + /* If remaining length < VEC_SIZE. */ + addl $VEC_SIZE, %edx + jle L(zero_end) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + /* Check VEC2 and compare any match with remaining length. */ + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - - jnz L(first_vec_x3_check) - xorl %eax, %eax - VZEROUPPER - ret + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax +L(zero_end): + VZEROUPPER_RETURN .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %edx - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + vpmovmskb %ymm1, %eax testl %eax, %eax + jnz L(last_vec_x1_return) - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %edx - jle L(zero) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm2, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - xorl %eax, %eax - VZEROUPPER - ret + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x0_check): - tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) + vpmovmskb %ymm3, %eax + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 2 - 1), %rdi +# else + subq $-(VEC_SIZE * 2 + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 L(first_vec_x1_check): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $VEC_SIZE, %rax + /* Adjust length. */ + subl $-(VEC_SIZE * 4), %edx + /* Check if match within remaining length. 
*/ + cmpl %eax, %edx + jbe L(set_zero_end) + incq %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN + .p2align 4 +L(set_zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x2_check): +L(last_vec_x1_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 2), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 4 - 1), %rdi +# else + incq %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x2_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 3), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 3 - 1), %rdi +# else + subq $-(VEC_SIZE + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 -L(zero): - VZEROUPPER -L(null): - xorl %eax, %eax - ret -# endif +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + /* If remaining length <= CHAR_PER_VEC * 2. */ + addl $(VEC_SIZE * 2), %edx + jle L(last_2x_vec) .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax - addq %rdi, %rax - VZEROUPPER - ret +L(last_4x_vec): + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - addq $VEC_SIZE, %rax - addq %rdi, %rax - VZEROUPPER - ret + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax - .p2align 4 -L(first_vec_x2): + /* Create mask for possible matches within remaining length. */ + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx + + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= VEC_SIZE * 3 (Note this is after + remaining length was found to be > VEC_SIZE * 2. */ + subl $VEC_SIZE, %edx + jbe L(zero_end2) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Shift remaining length mask for last VEC. */ + shrq $32, %rcx + andl %ecx, %eax + jz L(zero_end2) tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + addq $(VEC_SIZE * 3 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret +L(zero_end2): + VZEROUPPER_RETURN .p2align 4 -L(4x_vec_end): - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(first_vec_x1) - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(first_vec_x2) - vpmovmskb %ymm4, %eax - testl %eax, %eax -L(first_vec_x3): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax + subq $-(VEC_SIZE * 2 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S new file mode 100644 index 0000000..f3fdad4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -0,0 +1,478 @@ +/* memchr/wmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCHR +# define MEMCHR __memchr_evex +# endif + +# ifdef USE_AS_WMEMCHR +# define VPBROADCAST vpbroadcastd +# define VPMINU vpminud +# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 +# else +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define RAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define RAW_PTR_REG rdi +# define ALGN_PTR_REG rcx +# endif + +# define XMMZERO xmm23 +# define YMMZERO ymm23 +# define XMMMATCH xmm16 +# define YMMMATCH ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (MEMCHR) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ + VPCMP $0, (%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret + + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax, CHAR_SIZE), %rax + cmovle %rcx, %rax + ret +# else + /* NB: first_vec_x0 is 17 bytes which will leave + cross_page_boundary (which is relatively cold) close enough + to ideal alignment. So only realign L(cross_page_boundary) if + rawmemchr. */ + .p2align 4 +# endif +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi + for rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 + kmovd %k0, %r8d +# ifdef USE_AS_WMEMCHR + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. 
*/ + sarl $2, %eax +# endif +# ifndef USE_AS_RAWMEMCHR + movl $(PAGE_SIZE / CHAR_SIZE), %esi + subl %eax, %esi +# endif +# ifdef USE_AS_WMEMCHR + andl $(CHAR_PER_VEC - 1), %eax +# endif + /* Remove the leading bytes. */ + sarxl %eax, %r8d, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax +# else + addq %RAW_PTR_REG, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5 +L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + +# ifndef USE_AS_RAWMEMCHR + /* Align data to VEC_SIZE. */ +L(cross_page_continue): + xorl %ecx, %ecx + subl %edi, %ecx + andq $-VEC_SIZE, %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + andq $-VEC_SIZE, %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + +# ifndef USE_AS_RAWMEMCHR + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + addq $VEC_SIZE, %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ +# ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +# else + addq %rdi, %rdx + andq $-(4 * VEC_SIZE), %rdi + subq %rdi, %rdx +# endif +# else + addq $VEC_SIZE, %rdi + andq $-(4 * VEC_SIZE), %rdi +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 +L(loop_4x_vec): + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 + /* xor will set bytes match esi to zero. */ + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. 
*/ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 +# ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi + kortestd %k2, %k3 + jz L(loop_4x_vec) +# else + kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi + + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + + /* Fall through into less than 4 remaining vectors of length case. + */ + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + addq $(VEC_SIZE * 3), %rdi + .p2align 4 +L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length > CHAR_PER_VEC * 2. */ + addl $(CHAR_PER_VEC * 2), %edx + jg L(last_4x_vec) + +L(last_2x_vec): + /* If remaining length < CHAR_PER_VEC. */ + addl $CHAR_PER_VEC, %edx + jle L(zero_end) + + /* Check VEC2 and compare any match with remaining length. */ + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end): + ret + + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax + /* Adjust length. */ + subl $-(CHAR_PER_VEC * 4), %edx + /* Check if match within remaining length. */ + cmpl %eax, %edx + jbe L(set_zero_end) + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret +L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +# else + incl %eax +# endif + jnz L(last_vec_x1_return) + + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +# else + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x1_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x2_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x3_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + +# ifndef USE_AS_RAWMEMCHR +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length <= CHAR_PER_VEC * 2. 
*/ + addl $(CHAR_PER_VEC * 2), %edx + jle L(last_2x_vec) + + .p2align 4 +L(last_4x_vec): + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Create mask for possible matches within remaining length. */ +# ifdef USE_AS_WMEMCHR + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx + bzhil %edx, %ecx, %ecx +# else + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx +# endif + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after + remaining length was found to be > CHAR_PER_VEC * 2. */ + subl $CHAR_PER_VEC, %edx + jbe L(zero_end2) + + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Shift remaining length mask for last VEC. */ +# ifdef USE_AS_WMEMCHR + shrl $CHAR_PER_VEC, %ecx +# else + shrq $CHAR_PER_VEC, %rcx +# endif + andl %ecx, %eax + jz L(zero_end2) + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end2): + ret + +L(last_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(last_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret +# endif + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..cf4eff5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index d779639..0a9eab7 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. 
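    (This is because wchar_t is a signed 32-bit type on x86-64, so wmemcmp
     must order elements as signed integers, whereas memcmp is defined to
     compare the buffers as arrays of unsigned char.)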
*/ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S new file mode 100644 index 0000000..9c09397 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -0,0 +1,440 @@ +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_evex_movbe +# endif + +# define VMOVU vmovdqu64 + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# define XMM1 xmm17 +# define XMM2 xmm18 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# ifdef USE_AS_WMEMCMP +# define VEC_MASK 0xff +# define XMM_MASK 0xf +# else +# define VEC_MASK 0xffffffff +# define XMM_MASK 0xffff +# endif + +/* Warning! 
+ wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.evex,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k1 + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_vec) + + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + + kandd %k1, %k2, %k5 + kandd %k3, %k4, %k6 + kandd %k5, %k6, %k6 + + kmovd %k6, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k1, %k2, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + xorl %eax, %eax + ret + + .p2align 4 +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. */ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx, 4), %edx + cmpl (%rsi, %rcx, 4), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movb -1(%rdi, %rdx), %al + movb -1(%rsi, %rdx), %cl + /* Subtraction is okay because the upper 8 bits are zero. 
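+	   (At this point each register holds the big-endian composition
+	        s[0] << 16 | s[1] << 8 | s[n-1]
+	    of its buffer, a value that fits in 24 bits, so the 32-bit
+	    subtraction cannot overflow and its sign orders the buffers the
+	    same way an unsigned byte-wise comparison would.)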
*/ + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k2, %k1, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. 
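+	   (The remaining data is addressed relative to the end of the
+	    buffers instead of branching on the exact length; bytes that
+	    were already compared may be compared a second time, which
+	    cannot change the result once the earlier vectors were equal.)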
*/ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(4x_vec_end): + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + kmovd %k3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + kmovd %k4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx, 4), %edx + cmpl VEC_SIZE(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000..1ec1962 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S index aac1515..7dad1ad 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -1,11 +1,25 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 zmm16 +# define VEC1 zmm17 +# define VEC2 zmm18 +# define VEC3 zmm19 +# define VEC4 zmm20 +# define VEC5 zmm21 +# define VEC6 zmm22 +# define VEC7 zmm23 +# define VEC8 zmm24 +# define VEC(i) VEC##i # define VMOVNT vmovntdq # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s # include "memmove-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S new file mode 100644 index 0000000..b879007 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S @@ -0,0 +1,26 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 
ymm16 +# define VEC1 ymm17 +# define VEC2 ymm18 +# define VEC3 ymm19 +# define VEC4 ymm20 +# define VEC5 ymm21 +# define VEC6 ymm22 +# define VEC7 ymm23 +# define VEC8 ymm24 +# define VEC(i) VEC##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define SECTION(p) p##.evex +# define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 95e9bb0..6f599ef 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -48,6 +48,14 @@ # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -150,11 +158,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -247,8 +256,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP @@ -312,21 +324,20 @@ L(less_vec): #if VEC_SIZE > 32 L(between_32_63): /* From 32 to 63. No branch when size == 32. */ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) - VZEROUPPER - ret + VMOVU (%rsi), %YMM0 + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) - ret + VMOVU (%rsi), %XMM0 + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -379,8 +390,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. */ VMOVU (%rsi), %VEC(0) @@ -391,8 +401,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -448,8 +457,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -500,8 +508,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -536,8 +543,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. 
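 	   (VEC(4) presumably still holds the first VEC_SIZE bytes of the
 	    source, loaded before the destination was aligned, and %r11 the
 	    original destination pointer, so this last regular store fills
 	    in the possibly-unaligned head of the copy.)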
*/ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -571,8 +577,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000..cea2d2a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index e19a732..a9c33e4 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include <sysdep.h> +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S new file mode 100644 index 0000000..16bf8e0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -0,0 +1,337 @@ +/* memrchr optimized with 256-bit EVEX instructions. 
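+   (memrchr is the GNU extension
+        void *memrchr (const void *s, int c, size_t n);
+    it returns a pointer to the last byte equal to (unsigned char) c
+    within the first n bytes of s, or NULL if there is none.)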
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define VMOVA vmovdqa64 + +# define YMMMATCH ymm16 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (__memrchr_evex) + /* Broadcast CHAR to YMMMATCH. */ + vpbroadcastb %esi, %YMMMATCH + + sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + + add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + subq $(VEC_SIZE * 4), %rdi + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(aligned_more) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rdx + andq $-VEC_SIZE, %rdi + subq %rcx, %rdx + + .p2align 4 +L(aligned_more): + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + /* Align data to 4 * VEC_SIZE for loop with fewer branches. + There are some overlaps with above if data isn't aligned + to 4 * VEC_SIZE. */ + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + jz L(loop_4x_vec) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rdi + subq %rcx, %rdx + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + subq $(VEC_SIZE * 4), %rdi + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 + kord %k1, %k2, %k5 + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 + + kord %k3, %k4, %k6 + kortestd %k5, %k6 + jz L(loop_4x_vec) + + /* There is a match. 
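+	   (The masks are inspected from K4 down to K1, i.e. from the
+	    highest addresses of this 4-VEC block towards the lowest,
+	    because memrchr must report the occurrence closest to the end;
+	    within a vector, bsrl then picks the highest set bit for the
+	    same reason.)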
*/ + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + kmovd %k1, %eax + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_4x_vec_or_less): + addl $(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1_check) + cmpl $(VEC_SIZE * 3), %edx + jbe L(zero) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax + ret + + .p2align 4 +L(last_2x_vec): + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx + jbe L(zero) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 2), %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3): + bsrl %eax, %eax + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1_check): + bsrl %eax, %eax + subq $(VEC_SIZE * 3), %rdx + addq %rax, %rdx + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3_check): + bsrl %eax, %eax + subq $VEC_SIZE, %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(last_vec_or_less_aligned): + movl %edx, %ecx + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl $1, %edx + /* Support rdx << 32. */ + salq %cl, %rdx + subq $1, %rdx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_or_less): + addl $VEC_SIZE, %edx + + /* Check for zero length. */ + testl %edx, %edx + jz L(zero) + + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(last_vec_or_less_aligned) + + movl %ecx, %esi + movl %ecx, %r8d + addl %edx, %esi + andq $-VEC_SIZE, %rdi + + subl $VEC_SIZE, %esi + ja L(last_vec_2x_aligned) + + /* Check the last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + + /* Remove the leading and trailing bytes. */ + sarl %cl, %eax + movl %edx, %ecx + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret + + .p2align 4 +L(last_vec_2x_aligned): + movl %esi, %ecx + + /* Check the last VEC. */ + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + + testl %eax, %eax + jnz L(last_vec_x1) + + /* Check the second last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl %r8d, %ecx + + kmovd %k1, %eax + + /* Remove the leading bytes. 
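+	   (Bits below the original misalignment saved in %r8d correspond to
+	    memory before the start of the buffer and have to be discarded.)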
Must use unsigned right shift for + bsrl below. */ + shrl %cl, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret +END (__memrchr_evex) +#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000..8ac3e47 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89..ae0860f 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index 0783979..22e7b18 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -1,22 +1,22 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 zmm16 +# define VEC(i) VEC##i # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastb %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastb d, %VEC0 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastd %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastd d, %VEC0 -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s # define WMEMSET_SYMBOL(p,s) p##_avx512_##s diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S new file mode 100644 index 0000000..ae0a4d6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 ymm16 +# define VEC(i) VEC##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastb d, %VEC0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastd d, %VEC0 + +# define SECTION(p) p##.evex +# define MEMSET_SYMBOL(p,s) p##_evex_##s +# define WMEMSET_SYMBOL(p,s) p##_evex_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 5e0d307..375844f 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ 
b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -34,20 +34,25 @@ # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -77,7 +82,7 @@ ENTRY (__bzero) mov %RDI_LP, %RAX_LP /* Set return value. */ mov %RSI_LP, %RDX_LP /* Set n. */ - pxor %xmm0, %xmm0 + pxor %XMM0, %XMM0 jmp L(entry_from_bzero) END (__bzero) weak_alias (__bzero, bzero) @@ -119,8 +124,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -143,14 +147,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -177,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmpq $REP_STOSB_THRESHOLD, %rdx @@ -192,8 +193,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -219,7 +223,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -233,7 +236,7 @@ L(less_vec): cmpb $16, %dl jae L(between_16_31) # endif - MOVQ %xmm0, %rcx + MOVQ %XMM0, %rcx cmpb $8, %dl jae L(between_8_15) cmpb $4, %dl @@ -243,40 +246,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - vmovdqu %ymm0, -32(%rdi,%rdx) - vmovdqu %ymm0, (%rdi) - VZEROUPPER - ret + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu %xmm0, -16(%rdi,%rdx) - vmovdqu %xmm0, (%rdi) - VZEROUPPER - ret + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. */ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. 
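 	   (A 2-byte store at the start plus a 2-byte store ending at the
 	    last byte cover every length in [2, 3]; for a length of 2 the
 	    two stores simply coincide.)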
*/ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000..acc5f6e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S new file mode 100644 index 0000000..ec942b7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_evex +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S new file mode 100644 index 0000000..2b9c07a --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S new file mode 100644 index 0000000..7c6f26c --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000..60a2ccf --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S new file mode 100644 index 0000000..1570014 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000..637fb55 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index b062356..aa48c05 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -30,7 +30,11 @@ /* Number of bytes in a vector register */ # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + + .section SECTION(.text),"ax",@progbits ENTRY (STRCAT) mov %rdi, %r9 # ifdef USE_AS_STRNCAT diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S new file mode 100644 index 0000000..97c3d85 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -0,0 +1,283 @@ +/* strcat with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 + +# define USE_AS_STRCAT + +/* Number of bytes in a vector register */ +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + xor %eax, %eax + mov %edi, %ecx + and $((VEC_SIZE * 4) - 1), %ecx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + cmp $(VEC_SIZE * 3), %ecx + ja L(fourth_vector_boundary) + vpcmpb $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_first_vector) + mov %rdi, %rax + and $-VEC_SIZE, %rax + jmp L(align_vec_size_start) +L(fourth_vector_boundary): + mov %rdi, %rax + and $-VEC_SIZE, %rax + vpcmpb $0, (%rax), %YMMZERO, %k0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + kmovd %k0, %edx + and %r10d, %edx + jnz L(exit) + +L(align_vec_size_start): + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + kmovd %k4, %edx + add $(VEC_SIZE * 4), %rax + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 
3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 5), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 + add $VEC_SIZE, %rax + kmovd %k1, %edx + test %edx, %edx + jnz L(exit) + + add $VEC_SIZE, %rax + + .p2align 4 +L(align_four_vec_loop): + VMOVA (%rax), %YMM0 + VMOVA (VEC_SIZE * 2)(%rax), %YMM1 + vpminub VEC_SIZE(%rax), %YMM0, %YMM0 + vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 + vpminub %YMM0, %YMM1, %YMM0 + /* If K0 != 0, there is a null byte. */ + vpcmpb $0, %YMM0, %YMMZERO, %k0 + add $(VEC_SIZE * 4), %rax + ktestd %k0, %k0 + jz L(align_four_vec_loop) + + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 + sub $(VEC_SIZE * 5), %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_null_on_first_vector): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_second_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $VEC_SIZE, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_third_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 2), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fourth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 3), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fifth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-evex.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000..81f20d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 71b1a1d..352074e 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ 
b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -38,9 +38,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx /* Broadcast CHAR to YMM0. */ @@ -93,8 +97,8 @@ L(cros_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -190,8 +194,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -205,8 +208,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -220,8 +222,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -247,8 +248,7 @@ L(first_vec_x3): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S new file mode 100644 index 0000000..ddc86a7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -0,0 +1,335 @@ +/* strchr/strchrnul optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCHR +# define STRCHR __strchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define VPMINU vpminud +# define CHAR_REG esi +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define VPMINU vpminub +# define CHAR_REG sil +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 +# define YMM8 ymm25 + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (STRCHR) + movl %edi, %ecx +# ifndef USE_AS_STRCHRNUL + xorl %edx, %edx +# endif + + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we cross page boundary with one vector load. */ + andl $(PAGE_SIZE - 1), %ecx + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ + VMOVU (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. 
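+	   (Per element x with search character c, the two instructions
+	    below compute
+	        t = x ^ c;                0 iff x == c
+	        t = min_unsigned (t, x);  0 iff x == c or x == 0
+	    so one compare of t against zero finds both CHAR and the
+	    terminating null.)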
*/ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(more_vecs) + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(more_vecs): + /* Align data for aligned loads in the loop. */ + andq $-VEC_SIZE, %rdi +L(aligned_more): + + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + VMOVA VEC_SIZE(%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VMOVA VEC_SIZE(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(prep_loop_4x) + + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq VEC_SIZE(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +L(prep_loop_4x): + /* Align data to 4 * VEC_SIZE. */ + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. 
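+	   (Each of the four vectors gets the same xor/min treatment as
+	    above, and the per-vector results are folded with three more
+	    unsigned mins, so a single compare and branch decide whether the
+	    whole 4-VEC block contains CHAR or a null byte.)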
*/ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM5 + vpxorq %YMM2, %YMM0, %YMM6 + vpxorq %YMM3, %YMM0, %YMM7 + vpxorq %YMM4, %YMM0, %YMM8 + + VPMINU %YMM5, %YMM1, %YMM5 + VPMINU %YMM6, %YMM2, %YMM6 + VPMINU %YMM7, %YMM3, %YMM7 + VPMINU %YMM8, %YMM4, %YMM8 + + VPMINU %YMM5, %YMM6, %YMM1 + VPMINU %YMM7, %YMM8, %YMM2 + + VPMINU %YMM1, %YMM2, %YMM1 + + /* Each bit in K0 represents a CHAR or a null byte. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + + addq $(VEC_SIZE * 4), %rdi + + ktestd %k0, %k0 + jz L(loop_4x_vec) + + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM5, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ + VPCMP $0, %YMMZERO, %YMM6, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ + VPCMP $0, %YMMZERO, %YMM7, %k2 + /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ + VPCMP $0, %YMMZERO, %YMM8, %k3 + +# ifdef USE_AS_WCSCHR + /* NB: Each bit in K2/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k1 +# else + kshiftlq $32, %k3, %k1 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rax + + tzcntq %rax, %rax +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 +L(cross_page_boundary): + andq $-VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + + VMOVA (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + +# ifdef USE_AS_WCSCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax + + jz L(aligned_more) + tzcntl %eax, %eax + addq %rcx, %rdi +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +END (STRCHR) +# endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index 5bc6eb3..bcecec9 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,16 +29,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) return OPTIMIZE (sse2_no_bsf); diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000..cdcf818 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S new file mode 100644 index 0000000..064fe7c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_evex +#define USE_AS_STRCHRNUL 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000..aecd30d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index c9f2464..f199c91 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. 
*/ @@ -83,6 +87,16 @@ ENTRY (STRCMP) je L(char0) jb L(zero) # ifdef USE_AS_WCSCMP +# ifndef __ILP32__ + movq %rdx, %rcx + /* Check if length could overflow when multiplied by + sizeof(wchar_t). Checking top 8 bits will cover all potential + overflow cases as well as redirect cases where its impossible to + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx + jnz __wcscmp_avx2 +# endif /* Convert units: from wide to byte char. */ shl $2, %RDX_LP # endif @@ -127,8 +141,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -161,8 +175,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -195,8 +208,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -229,8 +241,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -356,8 +367,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -400,8 +410,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -444,8 +453,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -486,8 +494,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -556,8 +563,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -631,8 +637,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -674,8 +679,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -685,16 +689,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -708,8 +710,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -734,8 +735,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -856,7 +856,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S new file mode 100644 index 0000000..459eeed --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -0,0 +1,1043 @@ +/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCMP +# define STRCMP __strcmp_evex +# endif + +# define PAGE_SIZE 4096 + +/* VEC_SIZE = Number of bytes in a ymm register */ +# define VEC_SIZE 32 + +/* Shift for dividing by (VEC_SIZE * 4). */ +# define DIVIDE_BY_VEC_4_SHIFT 7 +# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCMP +/* Compare packed dwords. */ +# define VPCMP vpcmpd +# define SHIFT_REG32 r8d +# define SHIFT_REG64 r8 +/* 1 dword char == 4 bytes. */ +# define SIZE_OF_CHAR 4 +# else +/* Compare packed bytes. */ +# define VPCMP vpcmpb +# define SHIFT_REG32 ecx +# define SHIFT_REG64 rcx +/* 1 byte char == 1 byte. */ +# define SIZE_OF_CHAR 1 +# endif + +# define XMMZERO xmm16 +# define XMM0 xmm17 +# define XMM1 xmm18 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 + +/* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +*/ + +/* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on + USE_AS_WCSCMP. In order to check the null char, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd + instructions. Main loop (away from from page boundary) compares 4 + vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128 + bytes) on each loop. + + The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic + is the same as strcmp, except that an a maximum offset is tracked. If + the maximum offset is reached before a difference is found, zero is + returned. */ + + .section .text.evex,"ax",@progbits +ENTRY (STRCMP) +# ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ + cmp $1, %RDX_LP + je L(char0) + jb L(zero) +# ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP +# endif + /* Register %r11 tracks the maximum offset. */ + mov %RDX_LP, %R11_LP +# endif + movl %edi, %eax + xorl %edx, %edx + /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 + VMOVU (%rsi), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + + /* Check for NULL in YMM0. */ + VPCMP $0, %YMMZERO, %YMM0, %k1 + /* Check for NULL in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + + /* Each bit in K1 represents: + 1. 
A mismatch in YMM0 and YMM1. Or + 2. A NULL in YMM0 or YMM1. + */ + kord %k0, %k1, %k1 + + ktestd %k1, %k1 + je L(next_3_vectors) + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx) is after the maximum + offset (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + je L(return) +L(wcscmp_return): + setl %al + negl %eax + orl $1, %eax +L(return): +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + .p2align 4 +L(return_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after + the maximum offset (%r11). */ + addq $VEC_SIZE, %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rdx), %ecx + cmpl VEC_SIZE(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rdi, %rdx), %eax + movzbl VEC_SIZE(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_2_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 2), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_3_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 3), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 + VMOVU VEC_SIZE(%rsi), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. 
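
Per 32-byte block, the three VPCMPs plus two KORDs above answer a single question: what is the first position that mismatches or holds a null in either operand? A scalar sketch of that reduction (illustrative helper, not the library routine):

#include <stddef.h>

/* Index of the first mismatch-or-null in a block of N elements, or N if
   the block is clean; the asm obtains the same index with tzcnt on the
   combined k-mask.  Testing s1[i] == '\0' alone is enough: if only s2[i]
   were null, the mismatch test already fires.  */
static size_t
first_event (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (s1[i] != s2[i] || s1[i] == '\0')
      return i;
  return n;
}
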
*/ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 + VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 + VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 + + /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ + VPCMP $4, %YMM2, %YMM4, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_2_vec_size) + + /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ + VPCMP $4, %YMM3, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM3, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_3_vec_size) +L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx + movl $PAGE_SIZE, %ecx + /* Align load via RAX. */ + andq $-(VEC_SIZE * 4), %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax +# ifdef USE_AS_STRNCMP + /* Starting from this point, the maximum offset, or simply the + 'offset', DECREASES by the same amount when base pointers are + moved forward. Return 0 when: + 1) On match: offset <= the matched vector index. + 2) On mistmach, offset is before the mistmatched index. + */ + subq %rdx, %r11 + jbe L(zero) +# endif + addq %rsi, %rdx + movq %rdx, %rsi + andl $(PAGE_SIZE - 1), %esi + /* Number of bytes before page crossing. */ + subq %rsi, %rcx + /* Number of VEC_SIZE * 4 blocks before page crossing. */ + shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx + /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ + movl %ecx, %esi + jmp L(loop_start) + + .p2align 4 +L(loop): +# ifdef USE_AS_STRNCMP + /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease + the maximum offset (%r11) by the same amount. */ + subq $(VEC_SIZE * 4), %r11 + jbe L(zero) +# endif + addq $(VEC_SIZE * 4), %rax + addq $(VEC_SIZE * 4), %rdx +L(loop_start): + testl %esi, %esi + leal -1(%esi), %esi + je L(loop_cross_page) +L(back_to_loop): + /* Main loop, comparing 4 vectors are a time. */ + VMOVA (%rax), %YMM0 + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 + VMOVU (%rdx), %YMM1 + VMOVU VEC_SIZE(%rdx), %YMM3 + VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 + VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 + + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + kord %k1, %k2, %k1 + /* Each bit in K4 represents a NULL or a mismatch in YMM0 and + YMM1. */ + kord %k0, %k1, %k4 + + VPCMP $4, %YMM2, %YMM3, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM3, %k2 + kord %k1, %k2, %k1 + /* Each bit in K5 represents a NULL or a mismatch in YMM2 and + YMM3. */ + kord %k0, %k1, %k5 + + VPCMP $4, %YMM4, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM4, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + kord %k1, %k2, %k1 + /* Each bit in K6 represents a NULL or a mismatch in YMM4 and + YMM5. */ + kord %k0, %k1, %k6 + + VPCMP $4, %YMM6, %YMM7, %k0 + VPCMP $0, %YMMZERO, %YMM6, %k1 + VPCMP $0, %YMMZERO, %YMM7, %k2 + kord %k1, %k2, %k1 + /* Each bit in K7 represents a NULL or a mismatch in YMM6 and + YMM7. 
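
The main_loop_header above bounds how many 4*VEC_SIZE iterations can run before the unaligned operand would cross a page, so the page-cross handling executes at most once per page. A sketch of the count it computes (helper name invented; constants as used in this file, with DIVIDE_BY_VEC_4_SHIFT == 7 doing the division):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Whole 4*VEC_SIZE (128-byte) blocks between S2 and the next page
   boundary.  */
static inline unsigned int
blocks_before_page_cross (uintptr_t s2)
{
  return (unsigned int) ((PAGE_SIZE - (s2 & (PAGE_SIZE - 1))) >> 7);
}
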
*/ + kord %k0, %k1, %k7 + + kord %k4, %k5, %k0 + kord %k6, %k7, %k1 + + /* Test each mask (32 bits) individually because for VEC_SIZE + == 32 is not possible to OR the four masks and keep all bits + in a 64-bit integer register, differing from SSE2 strcmp + where ORing is possible. */ + kortestd %k0, %k1 + je L(loop) + ktestd %k4, %k4 + je L(test_vec) + kmovd %k4, %edi + tzcntl %edi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_vec): +# ifdef USE_AS_STRNCMP + /* The first vector matched. Return 0 if the maximum offset + (%r11) <= VEC_SIZE. */ + cmpq $VEC_SIZE, %r11 + jbe L(zero) +# endif + ktestd %k5, %k5 + je L(test_2_vec) + kmovd %k5, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $VEC_SIZE, %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl VEC_SIZE(%rsi, %rdi), %ecx + cmpl VEC_SIZE(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rax, %rdi), %eax + movzbl VEC_SIZE(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_2_vec): +# ifdef USE_AS_STRNCMP + /* The first 2 vectors matched. Return 0 if the maximum offset + (%r11) <= 2 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) +# endif + ktestd %k6, %k6 + je L(test_3_vec) + kmovd %k6, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx + cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_3_vec): +# ifdef USE_AS_STRNCMP + /* The first 3 vectors matched. Return 0 if the maximum offset + (%r11) <= 3 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) +# endif + kmovd %k7, %esi + tzcntl %esi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 3), %rcx + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %esi + cmpl (%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rsi, %rcx), %esi + cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page): + xorl %r10d, %r10d + movq %rdx, %rcx + /* Align load via RDX. We load the extra ECX bytes which should + be ignored. */ + andl $((VEC_SIZE * 4) - 1), %ecx + /* R10 is -RCX. */ + subq %rcx, %r10 + + /* This works only if VEC_SIZE * 2 == 64. */ +# if (VEC_SIZE * 2) != 64 +# error (VEC_SIZE * 2) != 64 +# endif + + /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ + cmpl $(VEC_SIZE * 2), %ecx + jge L(loop_cross_page_2_vec) + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 + VMOVU (%rdx, %r10), %YMM4 + VMOVU VEC_SIZE(%rdx, %r10), %YMM5 + + VPCMP $4, %YMM4, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM2 and + YMM4. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM5, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM3, %k4 + VPCMP $0, %YMMZERO, %YMM5, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM3 and + YMM5. */ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi + testq %rdi, %rdi + je L(loop_cross_page_2_vec) + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 + VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 + + VPCMP $4, %YMM0, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM2, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM0 and + YMM2. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM1, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM1, %k4 + VPCMP $0, %YMMZERO, %YMM3, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM1 and + YMM3. 
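
For byte elements the masks of two adjacent vectors are merged by shifting the second one left by 32 (kshiftlq) and OR-ing (korq), so one tzcnt locates the first event across 64 bytes; the wide-char variant shifts by 8 instead because each mask bit covers a dword. A sketch of the byte case (illustrative helper only):

#include <stdint.h>

/* kshiftlq $32, %k3, %k2; korq %k1, %k2, %k1; kmovq; tzcntq.  Callers
   only reach this once at least one bit is known to be set.  */
static inline unsigned int
first_event_in_two_vecs (uint32_t mask_lo, uint32_t mask_hi)
{
  uint64_t m = ((uint64_t) mask_hi << 32) | mask_lo;
  return (unsigned int) __builtin_ctzll (m);
}
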
*/ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ + subl $(VEC_SIZE * 2), %ecx + jle 1f + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d +# ifdef USE_AS_WCSCMP + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + sarl $2, %ecx +# endif + /* Skip ECX bytes. */ + shrq %cl, %rdi +1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ + movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + + testq %rdi, %rdi +# ifdef USE_AS_STRNCMP + /* At this point, if %rdi value is 0, it already tested + VEC_SIZE*4+%r10 byte starting from %rax. This label + checks whether strncmp maximum offset reached or not. */ + je L(string_nbyte_offset_check) +# else + je L(back_to_loop) +# endif + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif + addq %r10, %rcx + /* Adjust for number of bytes skipped. */ + addq %r8, %rcx +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rcx + subq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rcx), %edi + cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + +# ifdef USE_AS_STRNCMP +L(string_nbyte_offset_check): + leaq (VEC_SIZE * 4)(%r10), %r10 + cmpq %r10, %r11 + jbe L(zero) + jmp L(back_to_loop) +# endif + + .p2align 4 +L(cross_page_loop): + /* Check one byte/dword at a time. */ +# ifdef USE_AS_WCSCMP + cmpl %ecx, %eax +# else + subl %ecx, %eax +# endif + jne L(different) + addl $SIZE_OF_CHAR, %edx + cmpl $(VEC_SIZE * 4), %edx + je L(main_loop_header) +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + /* Check null char. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED + comparisons. */ + subl %ecx, %eax +# ifndef USE_AS_WCSCMP +L(different): +# endif + ret + +# ifdef USE_AS_WCSCMP + .p2align 4 +L(different): + /* Use movl to avoid modifying EFLAGS. */ + movl $0, %eax + setl %al + negl %eax + orl $1, %eax + ret +# endif + +# ifdef USE_AS_STRNCMP + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(char0): +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi), %ecx + cmpl (%rsi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax +# endif + ret +# endif + + .p2align 4 +L(last_vector): + addq %rdx, %rdi + addq %rdx, %rsi +# ifdef USE_AS_STRNCMP + subq %rdx, %r11 +# endif + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
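
The wide-char L(different) path above only has to produce the sign of a signed dword comparison, not the actual difference; movl $0 (rather than xor) keeps the flags from the earlier cmpl alive for setl, and setl/negl/orl then map "less" to -1 and the remaining (greater) case to +1. Scalar equivalent for illustration (helper name invented):

#include <stdint.h>

static inline int
wcscmp_sign (int32_t a, int32_t b)
{
  int r = (a < b);   /* setl %al                          */
  r = -r;            /* negl: 0 or -1                     */
  r |= 1;            /* orl $1: -1 stays -1, 0 becomes +1 */
  return r;
}
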
*/ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider + ymm vector if possible, if not, with xmm. If fetching 16 bytes + (xmm) still passes the boundary, byte comparison must be done. + */ + .p2align 4 +L(cross_page): + /* Try one ymm vector at a time. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_1_vector) +L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $VEC_SIZE, %edx + + addl $VEC_SIZE, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jle L(loop_1_vector) +L(cross_page_1_vector): + /* Less than 32 bytes to check, try one xmm vector. */ + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 + VMOVU (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + korw %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + korw %k0, %k1, %k1 + kmovw %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $16, %edx +# ifndef USE_AS_WCSCMP + addl $16, %eax +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_1_xmm): +# ifndef USE_AS_WCSCMP + /* Less than 16 bytes to check, try 8 byte vector. NB: No need + for wcscmp nor wcsncmp since wide char is 4 bytes. */ + cmpl $(PAGE_SIZE - 8), %eax + jg L(cross_page_8bytes) + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only last 2 bits are valid. */ + andl $0x3, %ecx +# else + /* Only last 8 bits are valid. */ + andl $0xff, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx + addl $8, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_8bytes): + /* Less than 8 bytes to check, try 4 byte vector. */ + cmpl $(PAGE_SIZE - 4), %eax + jg L(cross_page_4bytes) + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. 
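
The cross_page block above steps down through progressively narrower accesses so no load reads past the current page: a full ymm while 32 bytes remain in the page, then xmm, then 8- and 4-byte loads, and finally the per-element loop. A C sketch of the size choice (helper invented; the asm actually uses the OR of both operands' page offsets as its conservative bound):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

static inline size_t
widest_safe_access (uintptr_t combined_page_offset)
{
  size_t left = PAGE_SIZE - (combined_page_offset & (PAGE_SIZE - 1));
  if (left >= 32) return 32;   /* one ymm vector      */
  if (left >= 16) return 16;   /* one xmm vector      */
  if (left >= 8)  return 8;
  if (left >= 4)  return 4;
  return 1;                    /* byte/dword stepping */
}
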
*/ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only the last bit is valid. */ + andl $0x1, %ecx +# else + /* Only last 4 bits are valid. */ + andl $0xf, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_4bytes): +# endif + /* Less than 4 bytes to check, try one byte/dword at a time. */ +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax + ret +END (STRCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index ed31970..41d670b 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000..c2c581e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index e72a0ca..19f4d5f 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -37,6 +37,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* zero register */ #define xmmZ xmm0 #define ymmZ ymm0 @@ -46,7 +50,7 @@ # ifndef USE_AS_STRCAT - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCPY) # 
ifdef USE_AS_STRNCPY mov %RDX_LP, %R8_LP @@ -369,8 +373,8 @@ L(CopyVecSizeExit): lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(CopyTwoVecSize1): @@ -553,8 +557,7 @@ L(Exit1): lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit2): @@ -569,8 +572,7 @@ L(Exit2): lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit3): @@ -584,8 +586,7 @@ L(Exit3): lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit4_7): @@ -602,8 +603,7 @@ L(Exit4_7): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit8_15): @@ -620,8 +620,7 @@ L(Exit8_15): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit16_31): @@ -638,8 +637,7 @@ L(Exit16_31): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit32_63): @@ -656,8 +654,7 @@ L(Exit32_63): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCPY @@ -671,8 +668,7 @@ L(StrncpyExit1): # ifdef USE_AS_STRCAT movb $0, 1(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit2): @@ -684,8 +680,7 @@ L(StrncpyExit2): # ifdef USE_AS_STRCAT movb $0, 2(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit3_4): @@ -699,8 +694,7 @@ L(StrncpyExit3_4): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit5_8): @@ -714,8 +708,7 @@ L(StrncpyExit5_8): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit9_16): @@ -729,8 +722,7 @@ L(StrncpyExit9_16): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit17_32): @@ -744,8 +736,7 @@ L(StrncpyExit17_32): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit33_64): @@ -760,8 +751,7 @@ L(StrncpyExit33_64): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit65): @@ -778,50 +768,43 @@ L(StrncpyExit65): # ifdef USE_AS_STRCAT movb $0, 65(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifndef USE_AS_STRCAT .p2align 4 L(Fill1): mov %dl, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill2): mov %dx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill3_4): mov %dx, (%rdi) mov %dx, -2(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill5_8): mov %edx, (%rdi) mov %edx, -4(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill9_16): mov %rdx, (%rdi) mov %rdx, -8(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill17_32): vmovdqu %xmmZ, (%rdi) vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(CopyVecSizeUnalignedVec2): @@ -898,8 +881,7 @@ L(Fill): cmp $1, %r8d ja L(Fill2) je L(Fill1) - VZEROUPPER - ret + VZEROUPPER_RETURN /* end of ifndef USE_AS_STRCAT */ # endif @@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): # ifdef USE_AS_STRCAT movb $0, (VEC_SIZE * 4)(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(UnalignedFourVecSizeLeaveCase2): @@ 
-1001,16 +982,14 @@ L(StrncpyExit): # ifdef USE_AS_STRCAT movb $0, (%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(ExitZero): # ifndef USE_AS_STRCAT mov %rdi, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S new file mode 100644 index 0000000..a343a1a --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -0,0 +1,1003 @@ +/* strcpy with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_evex +# endif + +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* Number of bytes in a vector register */ +# ifndef VEC_SIZE +# define VEC_SIZE 32 +# endif + +# define XMM2 xmm18 +# define XMM3 xmm19 + +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 +# define YMM7 ymm23 + +# ifndef USE_AS_STRCAT + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 + + .section .text.evex,"ax",@progbits +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# endif + + and $((VEC_SIZE * 4) - 1), %ecx + cmp $(VEC_SIZE * 2), %ecx + jbe L(SourceStringAlignmentLessTwoVecSize) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + + vpcmpb $0, (%rsi), %YMMZERO, %k0 + kmovd %k0, %edx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $VEC_SIZE, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $(VEC_SIZE + 1), %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyVecSizeTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail) + + vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 + kmovd %k1, %edx + +# ifdef USE_AS_STRNCPY + add $VEC_SIZE, %r10 + cmp %r10, %r8 + jbe L(CopyTwoVecSizeCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize) + + VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ + VMOVU %YMM2, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(UnalignVecSizeBoth): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $VEC_SIZE, %rcx + VMOVA (%rsi, %rcx), %YMM2 + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 3), %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# 
else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 + vpcmpb $0, %YMM4, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM4, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + VMOVU %YMM2, (%rdi, %rcx) + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + mov %rsi, %rdx + lea VEC_SIZE(%rsi, %rcx), %rsi + and $-(VEC_SIZE * 4), %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea (VEC_SIZE * 8)(%r8, %rdx), %r8 +# endif +L(UnalignedFourVecSizeLoop): + VMOVA (%rsi), %YMM4 + VMOVA VEC_SIZE(%rsi), %YMM5 + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM5, %YMM4, %YMM2 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(UnalignedFourVecSizeLeave) + +L(UnalignedFourVecSizeLoop_start): + add $(VEC_SIZE * 4), %rdi + add $(VEC_SIZE * 4), %rsi + VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) + VMOVA (%rsi), %YMM4 + VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) + VMOVA VEC_SIZE(%rsi), %YMM5 + vpminub %YMM5, %YMM4, %YMM2 + VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVU %YMM7, -VEC_SIZE(%rdi) + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. 
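
The 4-vector copy loop above tests for a terminating null anywhere in 4*VEC_SIZE bytes with a chain of vpminub reductions followed by a single compare against the zero vector: the unsigned minimum of all bytes is zero iff some byte is zero. Scalar sketch (illustrative helper only):

#include <stdint.h>
#include <stddef.h>

static inline int
has_zero_byte (const uint8_t *p, size_t len)   /* len == 4 * VEC_SIZE */
{
  uint8_t m = 0xff;
  for (size_t i = 0; i < len; i++)
    m = p[i] < m ? p[i] : m;   /* the vpminub reduction tree */
  return m == 0;               /* vpcmpb $0 against YMMZERO  */
}
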
*/ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(UnalignedFourVecSizeLoop_start) + +L(UnalignedFourVecSizeLeave): + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_0) + + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %ecx + test %ecx, %ecx + jnz L(CopyVecSizeUnaligned_16) + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_32) + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %ecx + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 3)(%rdi, %rdx), %rax +# endif + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 3), %rsi + add $(VEC_SIZE * 3), %rdi + jmp L(CopyVecSizeExit) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLessTwoVecSize): + VMOVU (%rsi), %YMM3 + VMOVU VEC_SIZE(%rsi), %YMM2 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $VEC_SIZE, %r8 +# else + cmp $(VEC_SIZE + 1), %r8 +# endif + jbe L(CopyVecSizeTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail1) + + VMOVU %YMM3, (%rdi) + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $(VEC_SIZE * 2), %r8 +# else + cmp $((VEC_SIZE * 2) + 1), %r8 +# endif + jbe L(CopyTwoVecSize1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize1) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + jmp L(UnalignVecSizeBoth) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyVecSize): + add %rcx, %rdi +# endif +L(CopyVecSizeTail): + add %rcx, %rsi +L(CopyVecSizeTail1): + bsf %edx, %edx +L(CopyVecSizeExit): + cmp $32, %edx + jae L(Exit32_63) + cmp $16, %edx + jae L(Exit16_31) + cmp $8, %edx + jae L(Exit8_15) + cmp $4, %edx + jae L(Exit4_7) + cmp $3, %edx + je L(Exit3) + cmp $1, %edx + ja L(Exit2) + je L(Exit1) + movb $0, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(CopyTwoVecSize1): + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $VEC_SIZE, %r8 +# endif + jmp L(CopyVecSizeTail1) + + .p2align 4 +L(CopyTwoVecSize): + bsf %edx, %edx + add %rcx, %rsi + add $VEC_SIZE, %edx + sub %ecx, %edx + jmp L(CopyVecSizeExit) + + .p2align 4 +L(CopyVecSizeUnaligned_0): + bsf %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + VMOVU %YMM4, (%rdi) + add $((VEC_SIZE * 4) - 1), %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_16): + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea VEC_SIZE(%rdi, %rdx), %rax +# endif + VMOVU %YMM5, VEC_SIZE(%rdi) + 
add $((VEC_SIZE * 3) - 1), %r8 + sub %rdx, %r8 + lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_32): + bsf %edx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 2)(%rdi, %rdx), %rax +# endif + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + add $((VEC_SIZE * 2) - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 2), %rsi + add $(VEC_SIZE * 2), %rdi + jmp L(CopyVecSizeExit) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyVecSizeUnalignedVec6): + VMOVU %YMM6, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec5): + VMOVU %YMM5, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec4): + VMOVU %YMM4, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec3): + VMOVU %YMM3, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) +# endif + +/* Case2 */ + + .p2align 4 +L(CopyVecSizeCase2): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2): + add %rcx, %rsi + bsf %edx, %edx + add $VEC_SIZE, %edx + sub %ecx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTailCase2): + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTail1Case2): + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeCase2) +L(CopyVecSizeCase3): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyTwoVecSizeCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyVecSizeTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTailCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSize1Case2OrCase3): + add $VEC_SIZE, %rdi + add $VEC_SIZE, %rsi + sub $VEC_SIZE, %r8 +L(CopyVecSizeTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTail1Case2) + jmp L(StrncpyExit) +# endif + +/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ + + .p2align 4 +L(Exit1): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + movzwl (%rsi), %ecx + mov %cx, (%rdi) + movb $0, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4_7): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov -3(%rsi, %rdx), %ecx + mov %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8_15): + mov (%rsi), %rcx + mov -7(%rsi, %rdx), %r9 + mov %rcx, (%rdi) + mov %r9, -7(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16_31): + VMOVU (%rsi), %XMM2 + VMOVU -15(%rsi, %rdx), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -15(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32_63): + VMOVU (%rsi), %YMM2 + VMOVU -31(%rsi, %rdx), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -31(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movzbl (%rsi), %edx + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3_4): + movzwl (%rsi), %ecx + movzwl -2(%rsi, %r8), %edx + mov %cx, (%rdi) + mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit5_8): + mov (%rsi), %ecx + mov -4(%rsi, %r8), %edx + mov %ecx, (%rdi) + mov %edx, -4(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit9_16): + mov (%rsi), %rcx + mov -8(%rsi, %r8), %rdx + mov %rcx, (%rdi) + mov %rdx, -8(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit17_32): + VMOVU (%rsi), %XMM2 + VMOVU -16(%rsi, %r8), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -16(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit33_64): + /* 0/32, 31/16 */ + VMOVU (%rsi), %YMM2 + VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit65): + /* 0/32, 32/32, 64/1 */ + VMOVU (%rsi), %YMM2 + VMOVU 32(%rsi), %YMM3 + mov 64(%rsi), %cl + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, 32(%rdi) + mov %cl, 64(%rdi) +# ifdef USE_AS_STPCPY + lea 65(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 65(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) + ret + + .p2align 4 +L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) + ret + + .p2align 4 +L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) + ret + + .p2align 4 +L(Fill17_32): + 
VMOVU %XMMZERO, (%rdi) + VMOVU %XMMZERO, -16(%rdi, %r8) + ret + + .p2align 4 +L(CopyVecSizeUnalignedVec2): + VMOVU %YMM2, (%rdi, %rcx) + + .p2align 4 +L(CopyVecSizeVecExit): + bsf %edx, %edx + add $(VEC_SIZE - 1), %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + xor %edx, %edx + sub $VEC_SIZE, %r8 + jbe L(StrncpyFillExit) + + VMOVU %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + + mov %rdi, %rsi + and $(VEC_SIZE - 1), %esi + sub %rsi, %rdi + add %rsi, %r8 + sub $(VEC_SIZE * 4), %r8 + jb L(StrncpyFillLessFourVecSize) + +L(StrncpyFillLoopVmovdqa): + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE * 4), %rdi + sub $(VEC_SIZE * 4), %r8 + jae L(StrncpyFillLoopVmovdqa) + +L(StrncpyFillLessFourVecSize): + add $(VEC_SIZE * 2), %r8 + jl L(StrncpyFillLessTwoVecSize) + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + add $(VEC_SIZE * 2), %rdi + sub $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillLessTwoVecSize): + add $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillExit): + add $VEC_SIZE, %r8 +L(Fill): + cmp $17, %r8d + jae L(Fill17_32) + cmp $9, %r8d + jae L(Fill9_16) + cmp $5, %r8d + jae L(Fill5_8) + cmp $3, %r8d + jae L(Fill3_4) + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) + ret + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(UnalignedFourVecSizeLeaveCase2) +L(UnalignedFourVecSizeLeaveCase3): + lea (VEC_SIZE * 4)(%r8), %rcx + and $-VEC_SIZE, %rcx + add $(VEC_SIZE * 3), %r8 + jl L(CopyVecSizeCase3) + VMOVU %YMM4, (%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM5, VEC_SIZE(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 4)(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) +# endif + ret + + .p2align 4 +L(UnalignedFourVecSizeLeaveCase2): + xor %ecx, %ecx + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + add $(VEC_SIZE * 3), %r8 + jle L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %edx + VMOVU %YMM4, (%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec5) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + VMOVU %YMM5, VEC_SIZE(%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec6) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %edx + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + lea VEC_SIZE(%rdi, %rcx), %rdi + lea VEC_SIZE(%rsi, %rcx), %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) +L(StrncpyExit): + cmp $65, %r8d + je L(StrncpyExit65) + cmp $33, %r8d + jae L(StrncpyExit33_64) + cmp $17, %r8d + jae L(StrncpyExit17_32) + cmp $9, %r8d + jae L(StrncpyExit9_16) + cmp $5, %r8d + jae L(StrncpyExit5_8) + cmp $3, %r8d + jae 
L(StrncpyExit3_4) + cmp $1, %r8d + ja L(StrncpyExit2) + je L(StrncpyExit1) +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi) +# endif + ret + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000..75b4b76 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 3e7f14a..f5d058a 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -27,370 +27,475 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd # define VPMINU vpminud +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb # define VPMINU vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP # ifdef USE_AS_WCSLEN shl $2, %RSI_LP # elif defined __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif - mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx + movl %edi, %eax movq %rdi, %rdx vpxor %xmm0, %xmm0, %xmm0 - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < VEC_SIZE handle special. */ + cmpq $VEC_SIZE, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + /* If empty continue to aligned_more. Otherwise return bit + position of first match. */ + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. 
*/ + btsq %rsi, %rax + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) +L(first_vec_x1): tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + incl %edi + addl %edi, %eax # endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN - shrq $2, %rax + shrl $2, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax # endif + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 5 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq with -VEC_SIZE but saves 4 bytes of + code on the x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax +# ifdef USE_AS_STRNLEN + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because + it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. 
*/ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Before adjusting length check if at last VEC_SIZE * 4. */ + cmpq $(VEC_SIZE * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* Readjust length. */ addq %rcx, %rsi +# else + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif - + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 - - VPCMPEQ %ymm5, %ymm0, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 + vpmovmskb %ymm5, %ecx -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + subq %rdx, %rdi testl %eax, %eax - jnz L(first_vec_x1) + jnz L(last_vec_return_x0) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. 
*/ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER_RETURN - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax +# ifdef USE_AS_STRNLEN + .p2align 4 +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ + subq $-(VEC_SIZE * 4), %rdi +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +L(last_4x_vec_or_less): - jnz L(first_vec_x3_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER - ret + vpmovmskb %ymm1, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) - .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + /* length may have been negative or positive by an offset of + VEC_SIZE * 4 depending on where this was called from. This fixes + that. */ + andl $(VEC_SIZE * 4 - 1), %esi testl %eax, %eax + jnz L(last_vec_x1_check) - jnz L(first_vec_x0_check) subl $VEC_SIZE, %esi - jle L(max) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER - ret + jb L(max) - .p2align 4 -L(first_vec_x0_check): + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x1_check): +L(last_vec_return_x0): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax + subq $(VEC_SIZE * 4 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2_check): +L(last_vec_return_x1): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax + subq $(VEC_SIZE * 3 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN +# ifdef USE_AS_STRNLEN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x1_check): + tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 L(max): movq %r8, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + subl $(VEC_SIZE * 3), %esi + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. 
*/ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 -L(zero): - xorl %eax, %eax - ret -# endif .p2align 4 -L(first_vec_x0): +L(last_vec_x1): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x1): +L(last_vec_x2): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax - addq $VEC_SIZE, %rax + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + subl $(VEC_SIZE * 2), %esi + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi + addl $(VEC_SIZE * 2 + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax +# endif + VZEROUPPER_RETURN +L(max_end): + movq %r8, %rax + VZEROUPPER_RETURN # endif - VZEROUPPER - ret + /* Cold case for crossing page with first load. */ .p2align 4 -L(4x_vec_end): - VPCMPEQ %ymm1, %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ %ymm2, %ymm0, %ymm2 - vpmovmskb %ymm2, %eax +L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +# ifdef USE_AS_STRNLEN testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ %ymm3, %ymm0, %ymm3 - vpmovmskb %ymm3, %eax + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx + /* Check length. */ + cmpq %rsi, %rcx + jb L(cross_page_continue) + movq %r8, %rax +# else testl %eax, %eax - jnz L(first_vec_x2) - VPCMPEQ %ymm4, %ymm0, %ymm4 - vpmovmskb %ymm4, %eax -L(first_vec_x3): + jz L(cross_page_continue) tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif +# endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +# ifdef USE_AS_STRNLEN + .p2align 4 +L(cross_page_less_vec): + tzcntl %eax, %eax + cmpq %rax, %rsi + cmovb %esi, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - VZEROUPPER - ret END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S new file mode 100644 index 0000000..0583819 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -0,0 +1,436 @@ +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRLEN +# define STRLEN __strlen_evex +# endif + +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSLEN +# define VPCMP vpcmpd +# define VPMINU vpminud +# define SHIFT_REG r9d +# else +# define VPCMP vpcmpb +# define VPMINU vpminub +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRLEN) +# ifdef USE_AS_STRNLEN + /* Check for zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +# ifdef USE_AS_WCSLEN + shl $2, %RSI_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + mov %RSI_LP, %R8_LP +# endif + movl %edi, %ecx + movq %rdi, %rdx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + +# ifdef USE_AS_STRNLEN + jnz L(first_vec_x0_check) + /* Adjust length and check the end of data. */ + subq $VEC_SIZE, %rsi + jbe L(max) +# else + jnz L(first_vec_x0) +# endif + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi + + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + jmp L(more_4x_vec) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_WCSLEN + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + + /* Remove the leading bytes. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif +# ifdef USE_AS_STRNLEN + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +# endif + addq %rdi, %rax + addq %rcx, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(aligned_more): +# ifdef USE_AS_STRNLEN + /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" + with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" + to void possible addition overflow. */ + negq %rcx + addq $VEC_SIZE, %rcx + + /* Check the end of data. */ + subq %rcx, %rsi + jbe L(max) +# endif + + addq $VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + +L(more_4x_vec): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + addq $(VEC_SIZE * 4), %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + + /* Align data to 4 * VEC_SIZE. */ + movq %rdi, %rcx + andl $(4 * VEC_SIZE - 1), %ecx + andq $-(4 * VEC_SIZE), %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi +# endif + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVA (%rdi), %YMM1 + VMOVA VEC_SIZE(%rdi), %YMM2 + VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 + + VPMINU %YMM1, %YMM2, %YMM5 + VPMINU %YMM3, %YMM4, %YMM6 + + VPMINU %YMM5, %YMM6, %YMM5 + VPCMP $0, %YMM5, %YMMZERO, %k0 + ktestd %k0, %k0 + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + +# ifndef USE_AS_STRNLEN + jmp L(loop_4x_vec) +# else + subq $(VEC_SIZE * 4), %rsi + ja L(loop_4x_vec) + +L(last_4x_vec_or_less): + /* Less than 4 * VEC and aligned to VEC_SIZE. */ + addl $(VEC_SIZE * 2), %esi + jle L(last_2x_vec) + + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(last_2x_vec): + addl $(VEC_SIZE * 2), %esi + + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x0_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x3_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. 
*/ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(max): + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(4x_vec_end): + VPCMP $0, %YMM1, %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + VPCMP $0, %YMM2, %YMMZERO, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + VPCMP $0, %YMM3, %YMMZERO, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(first_vec_x2) + VPCMP $0, %YMM4, %YMMZERO, %k3 + kmovd %k3, %eax +L(first_vec_x3): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + +END (STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S index 6f5bfc7..8d748aa 100644 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -20,4 +20,4 @@ # define strlen __strlen_sse2 #endif -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S new file mode 100644 index 0000000..8f660bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-vec.S @@ -0,0 +1,257 @@ +/* SSE2 version of strlen and SSE4.1 version of wcslen. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text +ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN + shl $2, %RSI_LP +# endif + +/* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. 
*/ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#endif + +END(strlen) diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000..0dcea18 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S new file mode 100644 index 0000000..8884f02 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_evex +#include "strcat-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000..37d1224 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S new file mode 100644 index 0000000..a1d53e8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-evex.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_evex +#define USE_AS_STRNCMP 1 +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index e7b29de..52f208c 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P 
(cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000..79e7083 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S new file mode 100644 index 0000000..40e391f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000..04f1626 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S new file mode 100644 index 0000000..722022f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_evex +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000..5def14e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index c66df12..9116472 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S new file mode 100644 index 0000000..f920b5a --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -0,0 +1,265 @@ +/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRRCHR +# define STRRCHR __strrchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSRCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMMMATCH ymm17 +# define YMM1 ymm18 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + VMOVU (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + + addq $VEC_SIZE, %rdi + + testl %eax, %eax + jnz L(first_vec) + + testl %ecx, %ecx + jnz L(return_null) + + andq $-VEC_SIZE, %rdi + xorl %edx, %edx + jmp L(aligned_loop) + + .p2align 4 +L(first_vec): + /* Check if there is a null byte. */ + testl %ecx, %ecx + jnz L(char_and_nul_in_first_vec) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + jmp L(aligned_loop) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_WCSRCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + VMOVA (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %edx + kmovd %k1, %eax + + shrxl %SHIFT_REG, %edx, %edx + shrxl %SHIFT_REG, %eax, %eax + addq $VEC_SIZE, %rdi + + /* Check if there is a CHAR. */ + testl %eax, %eax + jnz L(found_char) + + testl %edx, %edx + jnz L(return_null) + + jmp L(aligned_loop) + + .p2align 4 +L(found_char): + testl %edx, %edx + jnz L(char_and_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + leaq (%rdi, %rcx), %rsi + + .p2align 4 +L(aligned_loop): + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + add $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. 
*/ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jz L(aligned_loop) + + .p2align 4 +L(char_nor_null): + /* Find a CHAR or a null byte in a loop. */ + testl %eax, %eax + jnz L(match) +L(return_value): + testl %edx, %edx + jz L(return_null) + movl %edx, %eax + movq %rsi, %rdi + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(match): + /* Find a CHAR. Check if there is a null byte. */ + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(find_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + jmp L(aligned_loop) + + .p2align 4 +L(find_nul): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* If there is no CHAR here, return the remembered one. */ + jz L(return_value) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(char_and_nul): + /* Find both a CHAR and a null byte. */ + addq %rcx, %rdi + movl %edx, %ecx +L(char_and_nul_in_first_vec): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* Return null pointer if the null byte comes first. */ + jz L(return_null) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(return_null): + xorl %eax, %eax + ret + +END (STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000..d49dbbf --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S new file mode 100644 index 0000000..7cb8f1e --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_evex +#define USE_AS_WCSCHR 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000..d6ca2b8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S new file mode 100644 index 0000000..42e73e5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_evex +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000..35658d7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S new file mode 100644 index 0000000..bdafa83 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_evex +#define USE_AS_WCSLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S new file mode 100644 index 0000000..7e62621 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -0,0 +1,4 @@ +#define AS_WCSLEN +#define strlen __wcslen_sse4_1 + +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c index c23ce45..13070fd 100644 --- a/sysdeps/x86_64/multiarch/wcslen.c +++ b/sysdeps/x86_64/multiarch/wcslen.c @@ -24,7 +24,7 @@ # undef __wcslen # define SYMBOL_NAME wcslen -# include "ifunc-avx2.h" +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); weak_alias (__wcslen, wcslen); diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000..4e88c70 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S new file mode 100644 index 0000000..8a8e310 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_evex +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000..7437ebe --- /dev/null +++ 
b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S new file mode 100644 index 0000000..24773bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_evex +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index a8cab0c..5fa51fe 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -2,4 +2,4 @@ #define AS_STRNLEN #define strlen __wcsnlen_sse4_1 -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index 3da1197..f1b6bc8 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -24,27 +24,7 @@ # undef __wcsnlen # define SYMBOL_NAME wcsnlen -# include <init-arch.h> - -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; - -static inline void * -IFUNC_SELECTOR (void) -{ - const struct cpu_features* cpu_features = __get_cpu_features (); - - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); - - if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) - return OPTIMIZE (sse4_1); - - return OPTIMIZE (sse2); -} +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); weak_alias (__wcsnlen, wcsnlen); diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000..9bf7608 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S new file mode 100644 index 0000000..c64602f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_evex +#define USE_AS_WCSRCHR 1 +#include "strrchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000..58ed21d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S new file mode 100644 index 0000000..06cd0f9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_evex +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..31104d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git 
a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S new file mode 100644 index 0000000..4726d74 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_evex_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-evex-movbe.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 9ab357f..ad047d8 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,5 @@ -/* SSE2 version of strlen/wcslen. - Copyright (C) 2012-2019 Free Software Foundation, Inc. +/* SSE2 version of strlen. + Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,243 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#include "multiarch/strlen-vec.S" -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - -.text -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN - shl $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes. */ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. 
*/ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. */ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index 7b64be9..7f5cd1b 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -95,6 +95,28 @@ lose: \ #define R14_LP r14 #define R15_LP r15 +/* Zero upper vector registers and return with xtest. NB: Use VZEROALL + to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ +#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + ret; \ +1: \ + vzeroupper; \ + ret + +/* Zero upper vector registers and return. */ +#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + VZEROUPPER; \ + ret +#endif + +#ifndef VZEROUPPER_RETURN +# define VZEROUPPER_RETURN VZEROUPPER; ret +#endif + #else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ |
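
Editor's note (not part of the merged patch): the sysdep.h hunk above introduces ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST, which the new *-rtm variants (e.g. strlen-avx2-rtm.S earlier in this diff) substitute for the plain VZEROUPPER return. Per the comment in that hunk, executing VZEROUPPER while a hardware transaction is active triggers an RTM abort, so the return path first checks XTEST and uses VZEROALL when transactionally executing. The C sketch below only illustrates that decision logic; it is not glibc code, the function name is invented, and it assumes a compiler providing <immintrin.h> with AVX and RTM support enabled (e.g. -mavx -mrtm).

    #include <immintrin.h>

    /* Sketch of the branch that ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST emits
       in assembly: pick how to clear AVX upper state depending on whether an
       RTM transaction is currently active.  */
    static inline void
    clear_avx_upper_state_rtm_safe (void)
    {
      if (_xtest ())
        /* Transactionally executing: VZEROUPPER would abort the transaction
           (per the comment in the sysdep.h hunk), so zero all vector
           registers with VZEROALL instead.  */
        _mm256_zeroall ();
      else
        /* Normal path: VZEROUPPER clears only the upper halves, avoiding
           AVX/SSE transition penalties after returning to SSE code.  */
        _mm256_zeroupper ();
    }

In the assembly macro the same test is spelled "xtest; jz 1f; vzeroall; ret; 1: vzeroupper; ret", while non-RTM builds keep the ordinary VZEROUPPER return through the default ZERO_UPPER_VEC_REGISTERS_RETURN and VZEROUPPER_RETURN definitions.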