author | Raoni Fassina Firmino <raoni@linux.ibm.com> | 2022-02-25 10:45:14 -0300 |
---|---|---|
committer | Raoni Fassina Firmino <raoni@linux.ibm.com> | 2022-02-25 10:45:14 -0300 |
commit | 821f0a9cb24486842f16f9b0d81d9c28e876c6ae | |
tree | 690876c3a706cb222b159e2ebff02d218d2fd1fb | |
parent | 889122cbfacedab8fdb62a9d3fd6244d528ea99c | |
parent | 178292fe011d5745aa0dd7ea99b388c30512fbdb | |
Merge branch release/2.30/master into ibm/2.30/master
119 files changed, 7797 insertions, 1432 deletions
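The bulk of this merge is the upstream handling of GLIBC_TUNABLES in AT_SECURE (setuid/setgid) processes, the x86 RTM/EVEX string-function work (bug 27457), and the wcsncmp/wmemchr length fixes (bug 28755). For orientation only, here is a minimal stand-alone sketch — not the glibc source, and using a made-up allow-list — of the in-place filtering idea that the parse_tunables () change in elf/dl-tunables.c below implements: allowed name=value pairs are copied back to the start of the tunables string, and everything else (SXID_ERASE tunables, unrecognized or malformed entries) is dropped.

```c
/* Minimal sketch of in-place tunables filtering.  The allow-list below is
   hypothetical; in glibc the decision comes from each tunable's
   security_level, not from a string table.  */
#include <stdio.h>
#include <string.h>

static const char *const kept[] = { "glibc.malloc.mmap_threshold",
                                    "glibc.malloc.perturb" };

static int
is_kept (const char *name, size_t len)
{
  for (size_t i = 0; i < sizeof (kept) / sizeof (kept[0]); i++)
    if (strlen (kept[i]) == len && memcmp (kept[i], name, len) == 0)
      return 1;
  return 0;
}

/* Rewrite TUNESTR in place, keeping only allowed NAME=VALUE pairs and
   truncating the rest.  */
static void
filter_tunables (char *tunestr)
{
  size_t off = 0;		/* Write position in TUNESTR.  */
  char *p = tunestr;

  while (*p != '\0')
    {
      char *eq = strchr (p, '=');
      char *end = strchr (p, ':');
      if (end == NULL)
	end = p + strlen (p);

      if (eq != NULL && eq < end && is_kept (p, (size_t) (eq - p)))
	{
	  if (off > 0)
	    tunestr[off++] = ':';
	  /* Source and destination overlap inside TUNESTR, so memmove.  */
	  memmove (tunestr + off, p, (size_t) (end - p));
	  off += (size_t) (end - p);
	}

      p = (*end == ':') ? end + 1 : end;
    }
  tunestr[off] = '\0';
}

int
main (void)
{
  char env[] = "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096";
  filter_tunables (env);
  /* Prints "glibc.malloc.mmap_threshold=4096".  */
  puts (env);
  return 0;
}
```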
@@ -73,6 +73,8 @@ The following bugs are resolved with this release: [25976] nss_compat: internal_end*ent may clobber errno, hiding ERANGE [27130] "rep movsb" performance issue [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work + [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts + [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex Version 2.30 diff --git a/elf/Makefile b/elf/Makefile index 63416b2..b0d0f60 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -1514,8 +1514,6 @@ $(objpfx)tst-nodelete-dlclose.out: $(objpfx)tst-nodelete-dlclose-dso.so \ tst-env-setuid-ENV = MALLOC_CHECK_=2 MALLOC_MMAP_THRESHOLD_=4096 \ LD_HWCAP_MASK=0x1 -tst-env-setuid-tunables-ENV = \ - GLIBC_TUNABLES=glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096 $(objpfx)tst-debug1: $(libdl) $(objpfx)tst-debug1.out: $(objpfx)tst-debug1mod1.so diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c index b0980c5..a8425dd 100644 --- a/elf/dl-tunables.c +++ b/elf/dl-tunables.c @@ -178,6 +178,7 @@ parse_tunables (char *tunestr, char *valstring) return; char *p = tunestr; + size_t off = 0; while (true) { @@ -191,7 +192,11 @@ parse_tunables (char *tunestr, char *valstring) /* If we reach the end of the string before getting a valid name-value pair, bail out. */ if (p[len] == '\0') - return; + { + if (__libc_enable_secure) + tunestr[off] = '\0'; + return; + } /* We did not find a valid name-value pair before encountering the colon. */ @@ -217,35 +222,28 @@ parse_tunables (char *tunestr, char *valstring) if (tunable_is_name (cur->name, name)) { - /* If we are in a secure context (AT_SECURE) then ignore the tunable - unless it is explicitly marked as secure. Tunable values take - precendence over their envvar aliases. */ + /* If we are in a secure context (AT_SECURE) then ignore the + tunable unless it is explicitly marked as secure. Tunable + values take precedence over their envvar aliases. We write + the tunables that are not SXID_ERASE back to TUNESTR, thus + dropping all SXID_ERASE tunables and any invalid or + unrecognized tunables. */ if (__libc_enable_secure) { - if (cur->security_level == TUNABLE_SECLEVEL_SXID_ERASE) + if (cur->security_level != TUNABLE_SECLEVEL_SXID_ERASE) { - if (p[len] == '\0') - { - /* Last tunable in the valstring. Null-terminate and - return. */ - *name = '\0'; - return; - } - else - { - /* Remove the current tunable from the string. We do - this by overwriting the string starting from NAME - (which is where the current tunable begins) with - the remainder of the string. We then have P point - to NAME so that we continue in the correct - position in the valstring. 
*/ - char *q = &p[len + 1]; - p = name; - while (*q != '\0') - *name++ = *q++; - name[0] = '\0'; - len = 0; - } + if (off > 0) + tunestr[off++] = ':'; + + const char *n = cur->name; + + while (*n != '\0') + tunestr[off++] = *n++; + + tunestr[off++] = '='; + + for (size_t j = 0; j < len; j++) + tunestr[off++] = value[j]; } if (cur->security_level != TUNABLE_SECLEVEL_NONE) @@ -258,9 +256,7 @@ parse_tunables (char *tunestr, char *valstring) } } - if (p[len] == '\0') - return; - else + if (p[len] != '\0') p += len + 1; } } diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index e92e9f5..0e9aede 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -25,35 +25,76 @@ #include "config.h" #undef _LIBC -#define test_parent test_parent_tunables -#define test_child test_child_tunables - -static int test_child_tunables (void); -static int test_parent_tunables (void); - -#include "tst-env-setuid.c" - -#define CHILD_VALSTRING_VALUE "glibc.malloc.mmap_threshold=4096" -#define PARENT_VALSTRING_VALUE \ - "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096" +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <unistd.h> +#include <intprops.h> +#include <array_length.h> + +#include <support/check.h> +#include <support/support.h> +#include <support/test-driver.h> +#include <support/capture_subprocess.h> + +const char *teststrings[] = +{ + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096", + "not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2", + "glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096", + ":glibc.malloc.garbage=2:glibc.malloc.check=1", + "glibc.malloc.check=1:glibc.malloc.check=2", + "not_valid.malloc.check=2", + "glibc.not_valid.check=2", +}; + +const char *resultstrings[] = +{ + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "glibc.malloc.mmap_threshold=4096", + "", + "", + "", + "", + "", + "", +}; static int -test_child_tunables (void) +test_child (int off) { const char *val = getenv ("GLIBC_TUNABLES"); #if HAVE_TUNABLES - if (val != NULL && strcmp (val, CHILD_VALSTRING_VALUE) == 0) + if (val != NULL && strcmp (val, resultstrings[off]) == 0) return 0; if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val); return 1; #else if (val != NULL) { - printf ("GLIBC_TUNABLES not cleared\n"); + printf ("[%d] GLIBC_TUNABLES not cleared\n", off); return 1; } return 0; @@ -61,15 +102,48 @@ test_child_tunables (void) } static int -test_parent_tunables (void) +do_test (int argc, char **argv) { - const char *val = getenv ("GLIBC_TUNABLES"); + /* Setgid child process. 
*/ + if (argc == 2) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); - if (val != NULL && strcmp (val, PARENT_VALSTRING_VALUE) == 0) - return 0; + int ret = test_child (atoi (argv[1])); - if (val != NULL) - printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val); + if (ret != 0) + exit (1); - return 1; + exit (EXIT_SUCCESS); + } + else + { + int ret = 0; + + /* Spawn tests. */ + for (int i = 0; i < array_length (teststrings); i++) + { + char buf[INT_BUFSIZE_BOUND (int)]; + + printf ("Spawned test for %s (%d)\n", teststrings[i], i); + snprintf (buf, sizeof (buf), "%d\n", i); + if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0) + exit (1); + + int status = support_capture_subprogram_self_sgid (buf); + + /* Bail out early if unsupported. */ + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + ret |= status; + } + return ret; + } } + +#define TEST_FUNCTION_ARGV do_test +#include <support/test-driver.c> diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index a080d2f..2350976 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -29,173 +29,12 @@ #include <sys/wait.h> #include <unistd.h> +#include <support/check.h> #include <support/support.h> #include <support/test-driver.h> +#include <support/capture_subprocess.h> static char SETGID_CHILD[] = "setgid-child"; -#define CHILD_STATUS 42 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. */ -static gid_t -choose_gid (void) -{ - const int count = 64; - gid_t groups[count]; - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - return groups[i]; - } - return 0; -} - -/* Spawn and execute a program and verify that it returns the CHILD_STATUS. */ -static pid_t -do_execve (char **args) -{ - pid_t kid = vfork (); - - if (kid < 0) - { - printf ("vfork: %m\n"); - return -1; - } - - if (kid == 0) - { - /* Child process. */ - execve (args[0], args, environ); - _exit (-errno); - } - - if (kid < 0) - return 1; - - int status; - - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - return 1; - } - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status) || WEXITSTATUS (status) != CHILD_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - WEXITSTATUS (status)); - return 1; - } - return 0; -} - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. 
*/ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 0; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - char *args[] = {execname, SETGID_CHILD, NULL}; - - ret = do_execve (args); - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} #ifndef test_child static int @@ -256,40 +95,32 @@ do_test (int argc, char **argv) if (argc == 2 && strcmp (argv[1], SETGID_CHILD) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (EXIT_UNSUPPORTED); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); int ret = test_child (); if (ret != 0) exit (1); - exit (CHILD_STATUS); + exit (EXIT_SUCCESS); } else { if (test_parent () != 0) exit (1); - /* Try running a setgid program. */ - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } + int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - return run_executable_sgid (target); - } + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); - /* Something went wrong and our argv was corrupted. */ - _exit (1); + return 0; + } } #define TEST_FUNCTION_ARGV do_test diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index 94de199..ab793ae 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -30,167 +30,12 @@ #include <sys/wait.h> #include <unistd.h> +#include <support/check.h> #include <support/support.h> +#include <support/capture_subprocess.h> #include <support/test-driver.h> static char MAGIC_ARGUMENT[] = "run-actual-test"; -#define MAGIC_STATUS 19 - -/* Return a GID which is not our current GID, but is present in the - supplementary group list. 
*/ -static gid_t -choose_gid (void) -{ - int count = getgroups (0, NULL); - if (count < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t *groups; - groups = xcalloc (count, sizeof (*groups)); - int ret = getgroups (count, groups); - if (ret < 0) - { - printf ("getgroups: %m\n"); - exit (1); - } - gid_t current = getgid (); - gid_t not_current = 0; - for (int i = 0; i < ret; ++i) - { - if (groups[i] != current) - { - not_current = groups[i]; - break; - } - } - free (groups); - return not_current; -} - - -/* Copies the executable into a restricted directory, so that we can - safely make it SGID with the TARGET group ID. Then runs the - executable. */ -static int -run_executable_sgid (gid_t target) -{ - char *dirname = xasprintf ("%s/secure-getenv.%jd", - test_dir, (intmax_t) getpid ()); - char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = -1; - if (mkdir (dirname, 0700) < 0) - { - printf ("mkdir: %m\n"); - goto err; - } - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) - { - printf ("open (/proc/self/exe): %m\n"); - goto err; - } - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - if (outfd < 0) - { - printf ("open (%s): %m\n", execname); - goto err; - } - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - if (rdcount < 0) - { - printf ("read: %m\n"); - goto err; - } - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - if (wrcount <= 0) - { - printf ("write: %m\n"); - goto err; - } - p += wrcount; - } - } - if (fchown (outfd, getuid (), target) < 0) - { - printf ("fchown (%s): %m\n", execname); - goto err; - } - if (fchmod (outfd, 02750) < 0) - { - printf ("fchmod (%s): %m\n", execname); - goto err; - } - if (close (outfd) < 0) - { - printf ("close (outfd): %m\n"); - goto err; - } - if (close (infd) < 0) - { - printf ("close (infd): %m\n"); - goto err; - } - - int kid = fork (); - if (kid < 0) - { - printf ("fork: %m\n"); - goto err; - } - if (kid == 0) - { - /* Child process. 
*/ - char *args[] = { execname, MAGIC_ARGUMENT, NULL }; - execve (execname, args, environ); - printf ("execve (%s): %m\n", execname); - _exit (1); - } - int status; - if (waitpid (kid, &status, 0) < 0) - { - printf ("waitpid: %m\n"); - goto err; - } - if (!WIFEXITED (status) || WEXITSTATUS (status) != MAGIC_STATUS) - { - printf ("Unexpected exit status %d from child process\n", - status); - goto err; - } - ret = 0; - -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname) - { - unlink (execname); - free (execname); - } - if (dirname) - { - rmdir (dirname); - free (dirname); - } - return ret; -} static int do_test (void) @@ -212,15 +57,15 @@ do_test (void) exit (1); } - gid_t target = choose_gid (); - if (target == 0) - { - fprintf (stderr, - "Could not find a suitable GID for user %jd, skipping test\n", - (intmax_t) getuid ()); - exit (0); - } - return run_executable_sgid (target); + int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); + + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) + return EXIT_UNSUPPORTED; + + if (!WIFEXITED (status)) + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + + return 0; } static void @@ -229,23 +74,15 @@ alternative_main (int argc, char **argv) if (argc == 2 && strcmp (argv[1], MAGIC_ARGUMENT) == 0) { if (getgid () == getegid ()) - { - /* This can happen if the file system is mounted nosuid. */ - fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n", - (intmax_t) getgid ()); - exit (MAGIC_STATUS); - } + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); if (getenv ("PATH") == NULL) - { - printf ("PATH variable not present\n"); - exit (3); - } + FAIL_EXIT (3, "PATH variable not present\n"); if (secure_getenv ("PATH") != NULL) - { - printf ("PATH variable not filtered out\n"); - exit (4); - } - exit (MAGIC_STATUS); + FAIL_EXIT (4, "PATH variable not filtered out\n"); + + exit (EXIT_SUCCESS); } } diff --git a/string/test-strnlen.c b/string/test-strnlen.c index 8c75338..f7d0896 100644 --- a/string/test-strnlen.c +++ b/string/test-strnlen.c @@ -27,6 +27,7 @@ #ifndef WIDE # define STRNLEN strnlen +# define MEMSET memset # define CHAR char # define BIG_CHAR CHAR_MAX # define MIDDLE_CHAR 127 @@ -34,6 +35,7 @@ #else # include <wchar.h> # define STRNLEN wcsnlen +# define MEMSET wmemset # define CHAR wchar_t # define BIG_CHAR WCHAR_MAX # define MIDDLE_CHAR 1121 @@ -153,7 +155,7 @@ do_page_tests (void) size_t last_offset = (page_size / sizeof (CHAR)) - 1; CHAR *s = (CHAR *) buf2; - memset (s, 65, (last_offset - 1)); + MEMSET (s, 65, (last_offset - 1)); s[last_offset] = 0; /* Place short strings ending at page boundary. */ @@ -196,6 +198,35 @@ do_page_tests (void) } } +/* Tests meant to unveil fail on implementations that access bytes + beyond the maxium length. */ + +static void +do_page_2_tests (void) +{ + size_t i, exp_len, offset; + size_t last_offset = page_size / sizeof (CHAR); + + CHAR *s = (CHAR *) buf2; + MEMSET (s, 65, last_offset); + + /* Place short strings ending at page boundary without the null + byte. */ + offset = last_offset; + for (i = 0; i < 128; i++) + { + /* Decrease offset to stress several sizes and alignments. */ + offset--; + exp_len = last_offset - offset; + FOR_EACH_IMPL (impl, 0) + { + /* If an implementation goes beyond EXP_LEN, it will trigger + the segfault. 
*/ + do_one_test (impl, (CHAR *) (s + offset), exp_len, exp_len); + } + } +} + int test_main (void) { @@ -242,6 +273,7 @@ test_main (void) do_random_tests (); do_page_tests (); + do_page_2_tests (); return ret; } diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h index 2832cfc..e42a84e 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -41,6 +41,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[]); +/* Copy the running program into a setgid binary and run it with CHILD_ID + argument. If execution is successful, return the exit status of the child + program, otherwise return a non-zero failure exit code. */ +int support_capture_subprogram_self_sgid + (char *child_id); + /* Deallocate the subprocess data captured by support_capture_subprocess. */ void support_capture_subprocess_free (struct support_capture_subprocess *); diff --git a/support/subprocess.h b/support/subprocess.h index c031878d..a19335e 100644 --- a/support/subprocess.h +++ b/support/subprocess.h @@ -38,6 +38,11 @@ struct support_subprocess support_subprocess struct support_subprocess support_subprogram (const char *file, char *const argv[]); +/* Invoke program FILE with ARGV arguments by using posix_spawn and wait for it + to complete. Return program exit status. */ +int support_subprogram_wait + (const char *file, char *const argv[]); + /* Wait for the subprocess indicated by PROC::PID. Return the status indicate by waitpid call. */ int support_process_wait (struct support_subprocess *proc); diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index 948ce5a..492b76d 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -20,11 +20,14 @@ #include <support/capture_subprocess.h> #include <errno.h> +#include <fcntl.h> #include <stdlib.h> #include <support/check.h> #include <support/xunistd.h> #include <support/xsocket.h> #include <support/xspawn.h> +#include <support/support.h> +#include <support/test-driver.h> static void transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) @@ -36,7 +39,7 @@ transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream) if (ret < 0) { support_record_failure (); - printf ("error: reading from subprocess %s: %m", what); + printf ("error: reading from subprocess %s: %m\n", what); pfd->events = 0; pfd->revents = 0; } @@ -102,6 +105,129 @@ support_capture_subprogram (const char *file, char *const argv[]) return result; } +/* Copies the executable into a restricted directory, so that we can + safely make it SGID with the TARGET group ID. Then runs the + executable. 
*/ +static int +copy_and_spawn_sgid (char *child_id, gid_t gid) +{ + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", + test_dir, (intmax_t) getpid ()); + char *execname = xasprintf ("%s/bin", dirname); + int infd = -1; + int outfd = -1; + int ret = 1, status = 1; + + TEST_VERIFY (mkdir (dirname, 0700) == 0); + if (support_record_failure_is_failed ()) + goto err; + + infd = open ("/proc/self/exe", O_RDONLY); + if (infd < 0) + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); + + outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); + TEST_VERIFY (outfd >= 0); + if (support_record_failure_is_failed ()) + goto err; + + char buf[4096]; + for (;;) + { + ssize_t rdcount = read (infd, buf, sizeof (buf)); + TEST_VERIFY (rdcount >= 0); + if (support_record_failure_is_failed ()) + goto err; + if (rdcount == 0) + break; + char *p = buf; + char *end = buf + rdcount; + while (p != end) + { + ssize_t wrcount = write (outfd, buf, end - p); + if (wrcount == 0) + errno = ENOSPC; + TEST_VERIFY (wrcount > 0); + if (support_record_failure_is_failed ()) + goto err; + p += wrcount; + } + } + TEST_VERIFY (fchown (outfd, getuid (), gid) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (fchmod (outfd, 02750) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (outfd) == 0); + if (support_record_failure_is_failed ()) + goto err; + TEST_VERIFY (close (infd) == 0); + if (support_record_failure_is_failed ()) + goto err; + + /* We have the binary, now spawn the subprocess. Avoid using + support_subprogram because we only want the program exit status, not the + contents. */ + ret = 0; + + char * const args[] = {execname, child_id, NULL}; + + status = support_subprogram_wait (args[0], args); + +err: + if (outfd >= 0) + close (outfd); + if (infd >= 0) + close (infd); + if (execname != NULL) + { + unlink (execname); + free (execname); + } + if (dirname != NULL) + { + rmdir (dirname); + free (dirname); + } + + if (ret != 0) + FAIL_EXIT1("Failed to make sgid executable for test\n"); + + return status; +} + +int +support_capture_subprogram_self_sgid (char *child_id) +{ + gid_t target = 0; + const int count = 64; + gid_t groups[count]; + + /* Get a GID which is not our current GID, but is present in the + supplementary group list. 
*/ + int ret = getgroups (count, groups); + if (ret < 0) + FAIL_UNSUPPORTED("Could not get group list for user %jd\n", + (intmax_t) getuid ()); + + gid_t current = getgid (); + for (int i = 0; i < ret; ++i) + { + if (groups[i] != current) + { + target = groups[i]; + break; + } + } + + if (target == 0) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + + return copy_and_spawn_sgid (child_id, target); +} + void support_capture_subprocess_free (struct support_capture_subprocess *p) { diff --git a/support/support_subprocess.c b/support/support_subprocess.c index 0c8cc6a..97e481e 100644 --- a/support/support_subprocess.c +++ b/support/support_subprocess.c @@ -27,7 +27,7 @@ #include <support/subprocess.h> static struct support_subprocess -support_suprocess_init (void) +support_subprocess_init (void) { struct support_subprocess result; @@ -48,7 +48,7 @@ support_suprocess_init (void) struct support_subprocess support_subprocess (void (*callback) (void *), void *closure) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); result.pid = xfork (); if (result.pid == 0) @@ -71,7 +71,7 @@ support_subprocess (void (*callback) (void *), void *closure) struct support_subprocess support_subprogram (const char *file, char *const argv[]) { - struct support_subprocess result = support_suprocess_init (); + struct support_subprocess result = support_subprocess_init (); posix_spawn_file_actions_t fa; /* posix_spawn_file_actions_init does not fail. */ @@ -84,7 +84,7 @@ support_subprogram (const char *file, char *const argv[]) xposix_spawn_file_actions_addclose (&fa, result.stdout_pipe[1]); xposix_spawn_file_actions_addclose (&fa, result.stderr_pipe[1]); - result.pid = xposix_spawn (file, &fa, NULL, argv, NULL); + result.pid = xposix_spawn (file, &fa, NULL, argv, environ); xclose (result.stdout_pipe[1]); xclose (result.stderr_pipe[1]); @@ -93,6 +93,19 @@ support_subprogram (const char *file, char *const argv[]) } int +support_subprogram_wait (const char *file, char *const argv[]) +{ + posix_spawn_file_actions_t fa; + + posix_spawn_file_actions_init (&fa); + struct support_subprocess res = support_subprocess_init (); + + res.pid = xposix_spawn (file, &fa, NULL, argv, environ); + + return support_process_wait (&res); +} + +int support_process_wait (struct support_subprocess *proc) { xclose (proc->stdout_pipe[0]); diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure index fa46e9e..e7f5763 100644 --- a/sysdeps/s390/configure +++ b/sysdeps/s390/configure @@ -123,7 +123,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode \"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c @@ -271,7 +273,9 @@ else void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac index 3ed5a8e..5c3479e 100644 --- a/sysdeps/s390/configure.ac +++ b/sysdeps/s390/configure.ac @@ -88,7 +88,9 @@ void testinsn (char *buf) __asm__ (".machine \"arch13\" \n\t" ".machinemode 
\"zarch_nohighgprs\" \n\t" "lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 instructions @@ -195,7 +197,9 @@ cat > conftest.c <<\EOF void testinsn (char *buf) { __asm__ ("lghi %%r0,16 \n\t" - "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0"); + "mvcrl 0(%0),32(%0) \n\t" + "vstrs %%v20,%%v20,%%v20,%%v20,0,2" + : : "a" (buf) : "memory", "r0"); } EOF dnl test, if assembler supports S390 arch13 zarch instructions as default diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c index fb6b69a..dc27b1d 100644 --- a/sysdeps/s390/memmove.c +++ b/sysdeps/s390/memmove.c @@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden; s390_libc_ifunc_expr (__redirect_memmove, memmove, ({ s390_libc_ifunc_expr_stfle_init (); - (HAVE_MEMMOVE_ARCH13 + (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2) && S390_IS_ARCH13_MIE3 (stfle_bits)) ? MEMMOVE_ARCH13 : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX)) diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c index 1948436..6d05652 100644 --- a/sysdeps/s390/multiarch/ifunc-impl-list.c +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c @@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memmove, # if HAVE_MEMMOVE_ARCH13 IFUNC_IMPL_ADD (array, i, memmove, - S390_IS_ARCH13_MIE3 (stfle_bits), + ((dl_hwcap & HWCAP_S390_VXRS_EXT2) + && S390_IS_ARCH13_MIE3 (stfle_bits)), MEMMOVE_ARCH13) # endif # if HAVE_MEMMOVE_Z13 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index a5112ef..a93139b 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -20,6 +20,33 @@ endif endif endif +ifeq ($(subdir),string) +sysdep_routines += cacheinfo + +tests += \ + tst-memchr-rtm \ + tst-memcmp-rtm \ + tst-memmove-rtm \ + tst-memrchr-rtm \ + tst-memset-rtm \ + tst-strchr-rtm \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ + tst-strrchr-rtm + +CFLAGS-tst-memchr-rtm.c += -mrtm +CFLAGS-tst-memcmp-rtm.c += -mrtm +CFLAGS-tst-memmove-rtm.c += -mrtm +CFLAGS-tst-memrchr-rtm.c += -mrtm +CFLAGS-tst-memset-rtm.c += -mrtm +CFLAGS-tst-strchr-rtm.c += -mrtm +CFLAGS-tst-strcpy-rtm.c += -mrtm +CFLAGS-tst-strlen-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strrchr-rtm.c += -mrtm +endif + ifeq ($(enable-cet),yes) ifeq ($(subdir),elf) sysdep-dl-routines += dl-cet diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 4bab154..a4d1eac 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -424,8 +424,24 @@ init_cpu_features (struct cpu_features *cpu_features) cpu_features->feature[index_arch_Prefer_No_VZEROUPPER] |= bit_arch_Prefer_No_VZEROUPPER; else - cpu_features->feature[index_arch_Prefer_No_AVX512] - |= bit_arch_Prefer_No_AVX512; + { + cpu_features->feature[index_arch_Prefer_No_AVX512] + |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + cpu_features->feature[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + + /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp + requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp + requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, + AVX2 strcmp is faster than EVEX strcmp. 
*/ + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + cpu_features->feature[index_arch_Prefer_AVX2_STRCMP] + |= bit_arch_Prefer_AVX2_STRCMP; + } } /* This spells out "AuthenticAMD" or "HygonGenuine". */ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index 03a9b2a..ca2924b 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -897,6 +897,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_arch_Prefer_FSRM (1u << 13) #define bit_arch_Prefer_No_AVX512 (1u << 14) #define bit_arch_MathVec_Prefer_No_AVX512 (1u << 15) +#define bit_arch_Prefer_AVX2_STRCMP (1u << 16) #define index_arch_Fast_Rep_String FEATURE_INDEX_2 #define index_arch_Fast_Copy_Backward FEATURE_INDEX_2 @@ -914,6 +915,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_arch_Prefer_No_AVX512 FEATURE_INDEX_2 #define index_arch_MathVec_Prefer_No_AVX512 FEATURE_INDEX_2 #define index_arch_Prefer_FSRM FEATURE_INDEX_2 +#define index_arch_Prefer_AVX2_STRCMP FEATURE_INDEX_2 /* XCR0 Feature flags. */ #define bit_XMM_state (1u << 1) diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index 2cb3151..d4d5e45 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -282,6 +282,9 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) CHECK_GLIBC_IFUNC_ARCH_BOTH (n, cpu_features, Fast_Copy_Backward, disable, 18); + CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH + (n, cpu_features, Prefer_AVX2_STRCMP, AVX2_Usable, + disable, 18); } break; case 19: diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c new file mode 100644 index 0000000..e474940 --- /dev/null +++ b/sysdeps/x86/tst-memchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memchr (string1, 'c', STRING_SIZE); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c new file mode 100644 index 0000000..e4c8a62 --- /dev/null +++ b/sysdeps/x86/tst-memcmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for memcmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + memset (string2, 'a', STRING_SIZE); + if (memcmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memcmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memcmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c new file mode 100644 index 0000000..4bf97ef --- /dev/null +++ b/sysdeps/x86/tst-memmove-rtm.c @@ -0,0 +1,53 @@ +/* Test case for memmove inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (memmove (string2, string1, STRING_SIZE) == string2 + && memcmp (string2, string1, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memmove", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c new file mode 100644 index 0000000..a57a5a8 --- /dev/null +++ b/sysdeps/x86/tst-memrchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for memrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = memrchr (string1, 'c', STRING_SIZE); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("memrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c new file mode 100644 index 0000000..bf343a4 --- /dev/null +++ b/sysdeps/x86/tst-memset-rtm.c @@ -0,0 +1,45 @@ +/* Test case for memset inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE); + return EXIT_SUCCESS; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + memset (string1, 'a', STRING_SIZE); + return 0; +} + +static int +do_test (void) +{ + return do_test_1 ("memset", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c new file mode 100644 index 0000000..a82e29c --- /dev/null +++ b/sysdeps/x86/tst-strchr-rtm.c @@ -0,0 +1,54 @@ +/* Test case for strchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[100] = 'c'; + string1[STRING_SIZE - 100] = 'c'; + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strchr (string1, 'c'); + if (p == &string1[100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c new file mode 100644 index 0000000..2b2a583 --- /dev/null +++ b/sysdeps/x86/tst-strcpy-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strcpy inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strcpy (string2, string1) == string2 + && strcmp (string2, string1) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strcpy", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h new file mode 100644 index 0000000..6ed9eca --- /dev/null +++ b/sysdeps/x86/tst-string-rtm.h @@ -0,0 +1,72 @@ +/* Test string function in a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <string.h> +#include <x86intrin.h> +#include <cpu-features.h> +#include <support/check.h> +#include <support/test-driver.h> + +static int +do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), + int (*function) (void)) +{ + if (!CPU_FEATURE_USABLE (RTM)) + return EXIT_UNSUPPORTED; + + int status = prepare (); + if (status != EXIT_SUCCESS) + return status; + + unsigned int i; + unsigned int naborts = 0; + unsigned int failed = 0; + for (i = 0; i < loop; i++) + { + failed |= function (); + if (_xbegin() == _XBEGIN_STARTED) + { + failed |= function (); + _xend(); + } + else + { + failed |= function (); + ++naborts; + } + } + + if (failed) + FAIL_EXIT1 ("%s() failed", name); + + if (naborts) + { + /* NB: Low single digit (<= 5%) noise-level aborts are normal for + TSX. */ + double rate = 100 * ((double) naborts) / ((double) loop); + if (rate > 5) + FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", + rate, naborts, loop); + } + + return EXIT_SUCCESS; +} + +static int do_test (void); + +#include <support/test-driver.c> diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c new file mode 100644 index 0000000..0dcf14db --- /dev/null +++ b/sysdeps/x86/tst-strlen-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strlen inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = '\0'; + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + size_t len = strlen (string1); + if (len == STRING_SIZE - 100) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strlen", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c new file mode 100644 index 0000000..236ad95 --- /dev/null +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -0,0 +1,52 @@ +/* Test case for strncmp inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; +char string2[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + memset (string2, 'a', STRING_SIZE - 1); + if (strncmp (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + if (strncmp (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strncmp", LOOP, prepare, function); +} diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c new file mode 100644 index 0000000..e32bfaf --- /dev/null +++ b/sysdeps/x86/tst-strrchr-rtm.c @@ -0,0 +1,53 @@ +/* Test case for strrchr inside a transactionally executing RTM region. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <tst-string-rtm.h> + +#define LOOP 3000 +#define STRING_SIZE 1024 +char string1[STRING_SIZE]; + +__attribute__ ((noinline, noclone)) +static int +prepare (void) +{ + memset (string1, 'a', STRING_SIZE - 1); + string1[STRING_SIZE - 100] = 'c'; + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +} + +__attribute__ ((noinline, noclone)) +static int +function (void) +{ + char *p = strrchr (string1, 'c'); + if (p == &string1[STRING_SIZE - 100]) + return 0; + else + return 1; +} + +static int +do_test (void) +{ + return do_test_1 ("strrchr", LOOP, prepare, function); +} diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index ec96365..f2217b2 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -21,9 +21,11 @@ #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 #else # define MEMCHR memchr # define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ @@ -33,15 +35,14 @@ ENTRY(MEMCHR) movd %esi, %xmm1 mov %edi, %ecx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif #ifdef USE_AS_WMEMCHR test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %RDX_LP #else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif punpcklbw %xmm1, %xmm1 test %RDX_LP, %RDX_LP jz L(return_null) @@ -60,13 +61,16 @@ ENTRY(MEMCHR) test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $CHAR_PER_VEC, %rdx jbe L(return_null) add $16, %rdi and $15, %ecx and $-16, %rdi +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) jmp L(loop_prolog) @@ -77,16 +81,21 @@ L(crosscache): movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 -/* Check if there is a match. */ + /* Check if there is a match. */ pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ + /* Remove the leading bytes. */ sar %cl, %eax test %eax, %eax je L(unaligned_no_match) -/* Check which byte is a match. */ + /* Check which byte is a match. */ bsf %eax, %eax - +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax add %rcx, %rax @@ -94,15 +103,18 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void possible addition overflow. 
*/ neg %rcx add $16, %rcx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif sub %rcx, %rdx jbe L(return_null) add $16, %rdi - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) .p2align 4 @@ -135,7 +147,7 @@ L(loop_prolog): test $0x3f, %rdi jz L(align64_loop) - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 @@ -167,11 +179,14 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi and $63, %ecx +#ifdef USE_AS_WMEMCHR + shr $2, %ecx +#endif add %rcx, %rdx .p2align 4 L(align64_loop): - sub $64, %rdx + sub $(CHAR_PER_VEC * 4), %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 @@ -218,7 +233,7 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 @@ -238,7 +253,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jle L(return_null) PCMPEQ 48(%rdi), %xmm1 @@ -250,13 +265,13 @@ L(exit_loop): .p2align 4 L(exit_loop_32): - add $32, %edx + add $(CHAR_PER_VEC * 2), %edx movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %edx + sub $CHAR_PER_VEC, %edx jbe L(return_null) PCMPEQ 16(%rdi), %xmm1 @@ -293,7 +308,13 @@ L(matches32): .p2align 4 L(matches_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) add %rdi, %rax ret @@ -301,7 +322,13 @@ L(matches_1): .p2align 4 L(matches16_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 16(%rdi, %rax), %rax ret @@ -309,7 +336,13 @@ L(matches16_1): .p2align 4 L(matches32_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 32(%rdi, %rax), %rax ret @@ -317,7 +350,13 @@ L(matches32_1): .p2align 4 L(matches48_1): bsf %eax, %eax +#ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +#else sub %rax, %rdx +#endif jbe L(return_null) lea 48(%rdi, %rax), %rax ret diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 395e432..da1446d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -43,7 +43,45 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memmove-avx512-unaligned-erms \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ - memset-avx512-unaligned-erms + memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + stpcpy-avx2-rtm \ + stpncpy-avx2-rtm \ + strcat-avx2-rtm \ + strcpy-avx2-rtm \ + strlen-avx2-rtm \ + strncat-avx2-rtm \ + strncmp-avx2-rtm \ + strncpy-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ + strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ + strcpy-evex \ + strlen-evex \ + strncat-evex \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ + strrchr-evex CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 @@ -59,8 +97,24 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcscpy-ssse3 wcscpy-c \ wcschr-sse2 
wcschr-avx2 \ wcsrchr-sse2 wcsrchr-avx2 \ - wcsnlen-sse4_1 wcsnlen-c \ - wcslen-sse2 wcslen-avx2 wcsnlen-avx2 + wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ + wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ + wmemcmp-evex-movbe endif ifeq ($(subdir),debug) diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index a70b62d..12cff84 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,16 +21,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 39a45d0..e57fb42 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -43,6 +43,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memchr, HAS_ARCH_FEATURE (AVX2_Usable), __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __memchr_evex) IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. 
*/ @@ -51,6 +60,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), @@ -64,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memmove_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -76,6 +95,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, @@ -98,13 +131,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memmove_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memmove, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memmove, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3_back) @@ -121,6 +168,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memrchr, HAS_ARCH_FEATURE (AVX2_Usable), __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memrchr_evex) + IFUNC_IMPL_ADD (array, i, memrchr, 1, 
__memrchr_sse2)) #ifdef SHARED @@ -139,10 +195,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __memset_chk_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), @@ -164,10 +238,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __memset_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), __memset_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX512F_Usable), @@ -179,20 +271,51 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __rawmemchr_evex) IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strlen_evex) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ IFUNC_IMPL (i, name, strnlen, IFUNC_IMPL_ADD (array, i, strnlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strnlen_evex) IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ @@ -201,6 +324,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable), __stpncpy_avx2) + IFUNC_IMPL_ADD (array, i, stpncpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __stpncpy_evex) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) @@ -211,6 +342,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable), __stpcpy_avx2) + IFUNC_IMPL_ADD (array, i, stpcpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __stpcpy_evex) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) @@ -245,6 +384,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable), __strcat_avx2) + IFUNC_IMPL_ADD (array, i, strcat, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strcat_evex) IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) @@ -255,6 +402,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchr, HAS_ARCH_FEATURE (AVX2_Usable), __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strchr_evex) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) @@ -263,6 +419,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchrnul, HAS_ARCH_FEATURE (AVX2_Usable), __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strchrnul_evex) IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) /* Support sysdeps/x86_64/multiarch/strrchr.c. 
*/ @@ -270,6 +435,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strrchr, HAS_ARCH_FEATURE (AVX2_Usable), __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strrchr_evex) IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) /* Support sysdeps/x86_64/multiarch/strcmp.c. */ @@ -277,6 +450,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, HAS_ARCH_FEATURE (AVX2_Usable), __strcmp_avx2) + IFUNC_IMPL_ADD (array, i, strcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), @@ -288,6 +470,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable), __strcpy_avx2) + IFUNC_IMPL_ADD (array, i, strcpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strcpy_evex) IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) @@ -331,6 +521,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable), __strncat_avx2) + IFUNC_IMPL_ADD (array, i, strncat, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncat_evex) IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, @@ -341,6 +539,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable), __strncpy_avx2) + IFUNC_IMPL_ADD (array, i, strncpy, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncpy_evex) IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, @@ -370,6 +576,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcschr, HAS_ARCH_FEATURE (AVX2_Usable), __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcschr_evex) IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) /* Support sysdeps/x86_64/multiarch/wcsrchr.c. 
*/ @@ -377,6 +592,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_ARCH_FEATURE (AVX2_Usable), __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsrchr_evex) IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ @@ -384,6 +608,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcscmp_evex) IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ @@ -391,6 +624,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsncmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsncmp_evex) IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ @@ -402,16 +644,41 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcslen.c. */ IFUNC_IMPL (i, name, wcslen, IFUNC_IMPL_ADD (array, i, wcslen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ IFUNC_IMPL (i, name, wcsnlen, IFUNC_IMPL_ADD (array, i, wcsnlen, - HAS_ARCH_FEATURE (AVX2_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2)), __wcsnlen_avx2) IFUNC_IMPL_ADD (array, i, wcsnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (BMI2) + && HAS_CPU_FEATURE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, HAS_CPU_FEATURE (SSE4_1), __wcsnlen_sse4_1) IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) @@ -421,6 +688,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (BMI2)), + __wmemchr_evex) IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ @@ -429,6 +705,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), @@ -443,7 +729,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __wmemset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, wmemset, - HAS_ARCH_FEATURE (AVX512F_Usable), + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512VL_Usable), __wmemset_avx512_unaligned)) #ifdef SHARED @@ -453,10 +746,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -465,6 +758,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + 
__memcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, @@ -486,6 +793,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), @@ -494,10 +815,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, 1, @@ -511,10 +832,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), @@ -523,6 +844,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, @@ -542,10 +877,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, mempcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, - HAS_ARCH_FEATURE (AVX512F_Usable), + HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), @@ -553,6 +888,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned_erms) + 
IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), @@ -568,6 +917,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, HAS_ARCH_FEATURE (AVX2_Usable), __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (HAS_ARCH_FEATURE (AVX512VL_Usable) + && HAS_ARCH_FEATURE (AVX512BW_Usable)), + __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), __strncmp_sse42) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), @@ -583,6 +940,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_ARCH_FEATURE (AVX2_Usable), __wmemset_chk_avx2_unaligned) IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512VL_Usable), + __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __wmemset_chk_avx512_unaligned)) #endif diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 945529f..963d327 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,17 +23,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_CPU_P (cpu_features, MOVBE) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2_movbe); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex_movbe); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 9d63ee5..01fc6b9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof 
(REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -48,21 +56,42 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx_unaligned_erms); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms); - return OPTIMIZE (avx_unaligned); + return OPTIMIZE (avx_unaligned); + } } if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index c98d757..198c8c6 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) @@ -45,21 +53,44 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { - if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) - return OPTIMIZE (avx512_no_vzeroupper); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx512_unaligned_erms); + return OPTIMIZE (avx512_unaligned); + } - return OPTIMIZE (avx512_unaligned); + return OPTIMIZE (avx512_no_vzeroupper); } if (CPU_FEATURES_ARCH_P 
(cpu_features, AVX2_Usable)) { - if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) - return OPTIMIZE (avx2_unaligned_erms); - else - return OPTIMIZE (avx2_unaligned); + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } } if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index bc6a70f..b05d119 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h new file mode 100644 index 0000000..564cc8c --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h @@ -0,0 +1,52 @@ +/* Common definition for ifunc selections for wcslen and wcsnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index f480b82..2913a36 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,9 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; static inline void * @@ -27,14 +30,21 @@ IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) - && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) - return OPTIMIZE (avx512_unaligned); - else + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) + { + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + + return OPTIMIZE (evex_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000..87b076c --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index cfec165..9ddeab2 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -26,319 +26,407 @@ # ifdef USE_AS_WMEMCHR # define VPCMPEQ vpcmpeqd +# define VPBROADCAST vpbroadcastd +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb +# define VPBROADCAST vpbroadcastb +# define CHAR_SIZE 
1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define ERAW_PTR_REG ecx +# define RRAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define ERAW_PTR_REG edi +# define RRAW_PTR_REG rdi +# define ALGN_PTR_REG rcx # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else test %RDX_LP, %RDX_LP +# endif jz L(null) # endif - movl %edi, %ecx - /* Broadcast CHAR to YMM0. */ + /* Broadcast CHAR to YMMMATCH. */ vmovd %esi, %xmm0 -# ifdef USE_AS_WMEMCHR - shl $2, %RDX_LP - vpbroadcastd %xmm0, %ymm0 -# else -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif - vpbroadcastb %xmm0, %ymm0 -# endif + VPBROADCAST %xmm0, %ymm0 /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 + VPCMPEQ (%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - # ifndef USE_AS_RAWMEMCHR - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rdx - jbe L(zero) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + addq %rdi, %rax + VZEROUPPER_RETURN # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ - addq %rcx, %rdx + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax + cmovle %rcx, %rax + VZEROUPPER_RETURN - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) +L(null): + xorl %eax, %eax + ret # endif - jmp L(more_4x_vec) - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr + and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) - tzcntl %eax, %eax + sarxl %ERAW_PTR_REG, %eax, %eax # ifndef USE_AS_RAWMEMCHR /* Check the end of data. 
*/ - cmpq %rax, %rdx - jbe L(zero) + cmpq %rsi, %rdx + jbe L(first_vec_x0) # endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax + addq %RRAW_PTR_REG, %rax +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + incq %rdi addq %rdi, %rax - addq %rcx, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): -# ifndef USE_AS_RAWMEMCHR - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition - overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx +L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - /* Check the end of data. */ - subq %rcx, %rdx - jbe L(zero) -# endif + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 4 +L(aligned_more): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +# ifndef USE_AS_RAWMEMCHR +L(cross_page_continue): + /* Align data to VEC_SIZE - 1. */ + xorl %ecx, %ecx + subl %edi, %ecx + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_RAWMEMCHR - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) # ifndef USE_AS_RAWMEMCHR - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. */ + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif addq %rcx, %rdx +# else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. 
*/ - VPCMPEQ (%rdi), %ymm0, %ymm1 - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 - + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 vpor %ymm1, %ymm2, %ymm5 vpor %ymm3, %ymm4, %ymm6 vpor %ymm5, %ymm6, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - + vpmovmskb %ymm5, %ecx # ifdef USE_AS_RAWMEMCHR - jmp L(loop_4x_vec) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) # else - subq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec) + testl %ecx, %ecx + jnz L(loop_4x_vec_end) -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %edx - jle L(last_2x_vec) + subq $-(VEC_SIZE * 4), %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + /* Fall through into less than 4 remaining vectors of length + case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax + .p2align 4 +L(last_4x_vec_or_less): +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + /* Check if first VEC contained match. */ testl %eax, %eax - jnz L(first_vec_x1) + jnz L(first_vec_x1_check) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax + /* If remaining length > VEC_SIZE * 2. */ + addl $(VEC_SIZE * 2), %edx + jg L(last_4x_vec) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %edx - jle L(zero) +L(last_2x_vec): + /* If remaining length < VEC_SIZE. */ + addl $VEC_SIZE, %edx + jle L(zero_end) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + /* Check VEC2 and compare any match with remaining length. */ + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 vpmovmskb %ymm1, %eax - testl %eax, %eax - - jnz L(first_vec_x3_check) - xorl %eax, %eax - VZEROUPPER - ret + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + addq $(VEC_SIZE + 1), %rdi + addq %rdi, %rax +L(zero_end): + VZEROUPPER_RETURN .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %edx - VPCMPEQ (%rdi), %ymm0, %ymm1 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + vpmovmskb %ymm1, %eax testl %eax, %eax + jnz L(last_vec_x1_return) - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %edx - jle L(zero) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm2, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - xorl %eax, %eax - VZEROUPPER - ret + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x0_check): - tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) + vpmovmskb %ymm3, %eax + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 2 - 1), %rdi +# else + subq $-(VEC_SIZE * 2 + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 L(first_vec_x1_check): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $VEC_SIZE, %rax + /* Adjust length. */ + subl $-(VEC_SIZE * 4), %edx + /* Check if match within remaining length. 
*/ + cmpl %eax, %edx + jbe L(set_zero_end) + incq %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN + .p2align 4 +L(set_zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x2_check): +L(last_vec_x1_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 2), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 4 - 1), %rdi +# else + incq %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x2_return): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rdx - jbe L(zero) - addq $(VEC_SIZE * 3), %rax +# ifdef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 3 - 1), %rdi +# else + subq $-(VEC_SIZE + 1), %rdi +# endif addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# ifndef USE_AS_RAWMEMCHR .p2align 4 -L(zero): - VZEROUPPER -L(null): - xorl %eax, %eax - ret -# endif +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx +# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + /* If remaining length <= CHAR_PER_VEC * 2. */ + addl $(VEC_SIZE * 2), %edx + jle L(last_2x_vec) .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax - addq %rdi, %rax - VZEROUPPER - ret +L(last_4x_vec): + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - addq $VEC_SIZE, %rax - addq %rdi, %rax - VZEROUPPER - ret + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax - .p2align 4 -L(first_vec_x2): + /* Create mask for possible matches within remaining length. */ + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx + + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= VEC_SIZE * 3 (Note this is after + remaining length was found to be > VEC_SIZE * 2. */ + subl $VEC_SIZE, %edx + jbe L(zero_end2) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Shift remaining length mask for last VEC. */ + shrq $32, %rcx + andl %ecx, %eax + jz L(zero_end2) tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + addq $(VEC_SIZE * 3 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret +L(zero_end2): + VZEROUPPER_RETURN .p2align 4 -L(4x_vec_end): - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(first_vec_x1) - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(first_vec_x2) - vpmovmskb %ymm4, %eax - testl %eax, %eax -L(first_vec_x3): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax + subq $-(VEC_SIZE * 2 + 1), %rdi addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S new file mode 100644 index 0000000..f3fdad4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -0,0 +1,478 @@ +/* memchr/wmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCHR +# define MEMCHR __memchr_evex +# endif + +# ifdef USE_AS_WMEMCHR +# define VPBROADCAST vpbroadcastd +# define VPMINU vpminud +# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 +# else +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 +# endif + +# ifdef USE_AS_RAWMEMCHR +# define RAW_PTR_REG rcx +# define ALGN_PTR_REG rdi +# else +# define RAW_PTR_REG rdi +# define ALGN_PTR_REG rcx +# endif + +# define XMMZERO xmm23 +# define YMMZERO ymm23 +# define XMMMATCH xmm16 +# define YMMMATCH ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (MEMCHR) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# endif + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ + VPCMP $0, (%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif + ret + +# ifndef USE_AS_RAWMEMCHR +L(zero): + xorl %eax, %eax + ret + + .p2align 5 +L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax, CHAR_SIZE), %rax + cmovle %rcx, %rax + ret +# else + /* NB: first_vec_x0 is 17 bytes which will leave + cross_page_boundary (which is relatively cold) close enough + to ideal alignment. So only realign L(cross_page_boundary) if + rawmemchr. */ + .p2align 4 +# endif +L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or + adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi + for rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 + kmovd %k0, %r8d +# ifdef USE_AS_WMEMCHR + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. 
*/ + sarl $2, %eax +# endif +# ifndef USE_AS_RAWMEMCHR + movl $(PAGE_SIZE / CHAR_SIZE), %esi + subl %eax, %esi +# endif +# ifdef USE_AS_WMEMCHR + andl $(CHAR_PER_VEC - 1), %eax +# endif + /* Remove the leading bytes. */ + sarxl %eax, %r8d, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + tzcntl %eax, %eax +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax +# else + addq %RAW_PTR_REG, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 5 +L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + +# ifndef USE_AS_RAWMEMCHR + /* Align data to VEC_SIZE. */ +L(cross_page_continue): + xorl %ecx, %ecx + subl %edi, %ecx + andq $-VEC_SIZE, %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %esi +# endif +# else + andq $-VEC_SIZE, %rdi +L(cross_page_continue): +# endif + /* Load first VEC regardless. */ + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. If near end handle specially. */ + subq %rsi, %rdx + jbe L(last_4x_vec_or_less) +# endif + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + +# ifndef USE_AS_RAWMEMCHR + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + addq $VEC_SIZE, %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ +# ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +# else + addq %rdi, %rdx + andq $-(4 * VEC_SIZE), %rdi + subq %rdi, %rdx +# endif +# else + addq $VEC_SIZE, %rdi + andq $-(4 * VEC_SIZE), %rdi +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 +L(loop_4x_vec): + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 + /* xor will set bytes match esi to zero. */ + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. 
*/ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 +# ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi + kortestd %k2, %k3 + jz L(loop_4x_vec) +# else + kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi + + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + + /* Fall through into less than 4 remaining vectors of length case. + */ + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + addq $(VEC_SIZE * 3), %rdi + .p2align 4 +L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length > CHAR_PER_VEC * 2. */ + addl $(CHAR_PER_VEC * 2), %edx + jg L(last_4x_vec) + +L(last_2x_vec): + /* If remaining length < CHAR_PER_VEC. */ + addl $CHAR_PER_VEC, %edx + jle L(zero_end) + + /* Check VEC2 and compare any match with remaining length. */ + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax + cmpl %eax, %edx + jbe L(set_zero_end) + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end): + ret + + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax + /* Adjust length. */ + subl $-(CHAR_PER_VEC * 4), %edx + /* Check if match within remaining length. */ + cmpl %eax, %edx + jbe L(set_zero_end) + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret +L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +L(loop_4x_vec_end): +# endif + /* rawmemchr will fall through into this if match was found in + loop. */ + + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +# else + incl %eax +# endif + jnz L(last_vec_x1_return) + + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2_return) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +# else + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x1_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR +# ifdef USE_AS_WMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x2_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + .p2align 4 +L(last_vec_x3_return): + tzcntl %eax, %eax +# ifdef USE_AS_RAWMEMCHR + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +# endif + ret + + +# ifndef USE_AS_RAWMEMCHR +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax + jnz L(first_vec_x1_check) + + /* If remaining length <= CHAR_PER_VEC * 2. 
*/ + addl $(CHAR_PER_VEC * 2), %edx + jle L(last_2x_vec) + + .p2align 4 +L(last_4x_vec): + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Create mask for possible matches within remaining length. */ +# ifdef USE_AS_WMEMCHR + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx + bzhil %edx, %ecx, %ecx +# else + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx +# endif + /* Test matches in data against length match. */ + andl %ecx, %eax + jnz L(last_vec_x3) + + /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after + remaining length was found to be > CHAR_PER_VEC * 2. */ + subl $CHAR_PER_VEC, %edx + jbe L(zero_end2) + + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax + /* Shift remaining length mask for last VEC. */ +# ifdef USE_AS_WMEMCHR + shrl $CHAR_PER_VEC, %ecx +# else + shrq $CHAR_PER_VEC, %rcx +# endif + andl %ecx, %eax + jz L(zero_end2) + tzcntl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax +L(zero_end2): + ret + +L(last_vec_x2): + tzcntl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(last_vec_x3): + tzcntl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret +# endif + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..cf4eff5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index d779639..0a9eab7 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. 
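    (This is because wchar_t is a signed 32-bit type on x86-64, so wmemcmp
     must order elements as signed integers, whereas memcmp is defined to
     compare the buffers as arrays of unsigned char.)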
*/ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S new file mode 100644 index 0000000..9c09397 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -0,0 +1,440 @@ +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_evex_movbe +# endif + +# define VMOVU vmovdqu64 + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# define XMM1 xmm17 +# define XMM2 xmm18 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 +# ifdef USE_AS_WMEMCMP +# define VEC_MASK 0xff +# define XMM_MASK 0xf +# else +# define VEC_MASK 0xffffffff +# define XMM_MASK 0xffff +# endif + +/* Warning! 
+ wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.evex,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k1 + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_vec) + + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + + kandd %k1, %k2, %k5 + kandd %k3, %k4, %k6 + kandd %k5, %k6, %k6 + + kmovd %k6, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k1, %k2, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + xorl %eax, %eax + ret + + .p2align 4 +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. */ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx, 4), %edx + cmpl (%rsi, %rcx, 4), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movb -1(%rdi, %rdx), %al + movb -1(%rsi, %rdx), %cl + /* Subtraction is okay because the upper 8 bits are zero. 
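+	   (At this point each register holds the big-endian composition
+	        s[0] << 16 | s[1] << 8 | s[n-1]
+	    of its buffer, a value that fits in 24 bits, so the 32-bit
+	    subtraction cannot overflow and its sign orders the buffers the
+	    same way an unsigned byte-wise comparison would.)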
*/ + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 + VPCMPEQ %XMM1, %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + VMOVU (%rsi), %XMM2 + VPCMPEQ (%rdi), %XMM2, %k2 + kmovw %k2, %eax + subl $XMM_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVU (%rsi), %YMM1 + VPCMPEQ (%rdi), %YMM1, %k1 + + VMOVU VEC_SIZE(%rsi), %YMM2 + VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 + kandd %k2, %k1, %k5 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 + kandd %k3, %k5, %k5 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 + kandd %k4, %k5, %k5 + + kmovd %k5, %eax + cmpl $VEC_MASK, %eax + jne L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. 
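+	   (The remaining data is addressed relative to the end of the
+	    buffers instead of branching on the exact length; bytes that
+	    were already compared may be compared a second time, which
+	    cannot change the result once the earlier vectors were equal.)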
*/ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + VMOVU (%rsi), %YMM2 + VPCMPEQ (%rdi), %YMM2, %k2 + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(4x_vec_end): + kmovd %k1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + kmovd %k2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + kmovd %k3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + kmovd %k4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx, 4), %edx + cmpl VEC_SIZE(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000..1ec1962 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S index aac1515..7dad1ad 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -1,11 +1,25 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 zmm16 +# define VEC1 zmm17 +# define VEC2 zmm18 +# define VEC3 zmm19 +# define VEC4 zmm20 +# define VEC5 zmm21 +# define VEC6 zmm22 +# define VEC7 zmm23 +# define VEC8 zmm24 +# define VEC(i) VEC##i # define VMOVNT vmovntdq # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s # include "memmove-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S new file mode 100644 index 0000000..b879007 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S @@ -0,0 +1,26 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define XMM1 xmm17 +# define YMM0 ymm16 +# define YMM1 ymm17 +# define VEC0 
ymm16 +# define VEC1 ymm17 +# define VEC2 ymm18 +# define VEC3 ymm19 +# define VEC4 ymm20 +# define VEC5 ymm21 +# define VEC6 ymm22 +# define VEC7 ymm23 +# define VEC8 ymm24 +# define VEC(i) VEC##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define SECTION(p) p##.evex +# define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 95e9bb0..6f599ef 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -48,6 +48,14 @@ # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -150,11 +158,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -247,8 +256,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP @@ -312,21 +324,20 @@ L(less_vec): #if VEC_SIZE > 32 L(between_32_63): /* From 32 to 63. No branch when size == 32. */ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) - VZEROUPPER - ret + VMOVU (%rsi), %YMM0 + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) - ret + VMOVU (%rsi), %XMM0 + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -379,8 +390,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. */ VMOVU (%rsi), %VEC(0) @@ -391,8 +401,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -448,8 +457,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -500,8 +508,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -536,8 +543,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. 
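 	   (VEC(4) presumably still holds the first VEC_SIZE bytes of the
 	    source, loaded before the destination was aligned, and %r11 the
 	    original destination pointer, so this last regular store fills
 	    in the possibly-unaligned head of the copy.)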
*/ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -571,8 +577,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000..cea2d2a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index e19a732..a9c33e4 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include <sysdep.h> +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S new file mode 100644 index 0000000..16bf8e0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -0,0 +1,337 @@ +/* memrchr optimized with 256-bit EVEX instructions. 
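+   (memrchr is the GNU extension
+        void *memrchr (const void *s, int c, size_t n);
+    it returns a pointer to the last byte equal to (unsigned char) c
+    within the first n bytes of s, or NULL if there is none.)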
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define VMOVA vmovdqa64 + +# define YMMMATCH ymm16 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (__memrchr_evex) + /* Broadcast CHAR to YMMMATCH. */ + vpbroadcastb %esi, %YMMMATCH + + sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + + add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + subq $(VEC_SIZE * 4), %rdi + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(aligned_more) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rdx + andq $-VEC_SIZE, %rdi + subq %rcx, %rdx + + .p2align 4 +L(aligned_more): + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + /* Align data to 4 * VEC_SIZE for loop with fewer branches. + There are some overlaps with above if data isn't aligned + to 4 * VEC_SIZE. */ + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + jz L(loop_4x_vec) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rdi + subq %rcx, %rdx + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + subq $(VEC_SIZE * 4), %rdi + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 + kord %k1, %k2, %k5 + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 + + kord %k3, %k4, %k6 + kortestd %k5, %k6 + jz L(loop_4x_vec) + + /* There is a match. 
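+	   (The masks are inspected from K4 down to K1, i.e. from the
+	    highest addresses of this 4-VEC block towards the lowest,
+	    because memrchr must report the occurrence closest to the end;
+	    within a vector, bsrl then picks the highest set bit for the
+	    same reason.)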
*/ + kmovd %k4, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + kmovd %k1, %eax + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_4x_vec_or_less): + addl $(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 + kmovd %k3, %eax + testl %eax, %eax + jnz L(last_vec_x1_check) + cmpl $(VEC_SIZE * 3), %edx + jbe L(zero) + + vpcmpb $0, (%rdi), %YMMMATCH, %k4 + kmovd %k4, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax + ret + + .p2align 4 +L(last_2x_vec): + vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx + jbe L(zero) + + vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 2), %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3): + bsrl %eax, %eax + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1_check): + bsrl %eax, %eax + subq $(VEC_SIZE * 3), %rdx + addq %rax, %rdx + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x3_check): + bsrl %eax, %eax + subq $VEC_SIZE, %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(last_vec_or_less_aligned): + movl %edx, %ecx + + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl $1, %edx + /* Support rdx << 32. */ + salq %cl, %rdx + subq $1, %rdx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_or_less): + addl $VEC_SIZE, %edx + + /* Check for zero length. */ + testl %edx, %edx + jz L(zero) + + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(last_vec_or_less_aligned) + + movl %ecx, %esi + movl %ecx, %r8d + addl %edx, %esi + andq $-VEC_SIZE, %rdi + + subl $VEC_SIZE, %esi + ja L(last_vec_2x_aligned) + + /* Check the last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + kmovd %k1, %eax + + /* Remove the leading and trailing bytes. */ + sarl %cl, %eax + movl %edx, %ecx + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret + + .p2align 4 +L(last_vec_2x_aligned): + movl %esi, %ecx + + /* Check the last VEC. */ + vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + kmovd %k1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + + testl %eax, %eax + jnz L(last_vec_x1) + + /* Check the second last VEC. */ + vpcmpb $0, (%rdi), %YMMMATCH, %k1 + + movl %r8d, %ecx + + kmovd %k1, %eax + + /* Remove the leading bytes. 
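+	   (Bits below the original misalignment saved in %r8d correspond to
+	    memory before the start of the buffer and have to be discarded.)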
Must use unsigned right shift for + bsrl below. */ + shrl %cl, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + ret +END (__memrchr_evex) +#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000..8ac3e47 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89..ae0860f 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index 0783979..22e7b18 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -1,22 +1,22 @@ #if IS_IN (libc) # define VEC_SIZE 64 -# define VEC(i) zmm##i +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 zmm16 +# define VEC(i) VEC##i # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 +# define VZEROUPPER # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastb %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastb d, %VEC0 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ movq r, %rax; \ - vpbroadcastd %xmm0, %xmm0; \ - vpbroadcastq %xmm0, %zmm0 + vpbroadcastd d, %VEC0 -# define SECTION(p) p##.avx512 +# define SECTION(p) p##.evex512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s # define WMEMSET_SYMBOL(p,s) p##_avx512_##s diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S new file mode 100644 index 0000000..ae0a4d6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define XMM0 xmm16 +# define YMM0 ymm16 +# define VEC0 ymm16 +# define VEC(i) VEC##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 +# define VZEROUPPER + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastb d, %VEC0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movq r, %rax; \ + vpbroadcastd d, %VEC0 + +# define SECTION(p) p##.evex +# define MEMSET_SYMBOL(p,s) p##_evex_##s +# define WMEMSET_SYMBOL(p,s) p##_evex_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 5e0d307..375844f 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ 
b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -34,20 +34,25 @@ # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) #endif +#ifndef XMM0 +# define XMM0 xmm0 +#endif + +#ifndef YMM0 +# define YMM0 ymm0 +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -77,7 +82,7 @@ ENTRY (__bzero) mov %RDI_LP, %RAX_LP /* Set return value. */ mov %RSI_LP, %RDX_LP /* Set n. */ - pxor %xmm0, %xmm0 + pxor %XMM0, %XMM0 jmp L(entry_from_bzero) END (__bzero) weak_alias (__bzero, bzero) @@ -119,8 +124,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -143,14 +147,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -177,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmpq $REP_STOSB_THRESHOLD, %rdx @@ -192,8 +193,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -219,7 +223,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -233,7 +236,7 @@ L(less_vec): cmpb $16, %dl jae L(between_16_31) # endif - MOVQ %xmm0, %rcx + MOVQ %XMM0, %rcx cmpb $8, %dl jae L(between_8_15) cmpb $4, %dl @@ -243,40 +246,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - vmovdqu %ymm0, -32(%rdi,%rdx) - vmovdqu %ymm0, (%rdi) - VZEROUPPER - ret + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu %xmm0, -16(%rdi,%rdx) - vmovdqu %xmm0, (%rdi) - VZEROUPPER - ret + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. */ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. 
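 	   (A 2-byte store at the start plus a 2-byte store ending at the
 	    last byte cover every length in [2, 3]; for a length of 2 the
 	    two stores simply coincide.)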
*/ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000..acc5f6e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S new file mode 100644 index 0000000..ec942b7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_evex +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S new file mode 100644 index 0000000..2b9c07a --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S new file mode 100644 index 0000000..7c6f26c --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000..60a2ccf --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S new file mode 100644 index 0000000..1570014 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000..637fb55 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index b062356..aa48c05 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -30,7 +30,11 @@ /* Number of bytes in a vector register */ # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + + .section SECTION(.text),"ax",@progbits ENTRY (STRCAT) mov %rdi, %r9 # ifdef USE_AS_STRNCAT diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S new file mode 100644 index 0000000..97c3d85 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -0,0 +1,283 @@ +/* strcat with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 + +# define USE_AS_STRCAT + +/* Number of bytes in a vector register */ +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + xor %eax, %eax + mov %edi, %ecx + and $((VEC_SIZE * 4) - 1), %ecx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + cmp $(VEC_SIZE * 3), %ecx + ja L(fourth_vector_boundary) + vpcmpb $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_first_vector) + mov %rdi, %rax + and $-VEC_SIZE, %rax + jmp L(align_vec_size_start) +L(fourth_vector_boundary): + mov %rdi, %rax + and $-VEC_SIZE, %rax + vpcmpb $0, (%rax), %YMMZERO, %k0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + kmovd %k0, %edx + and %r10d, %edx + jnz L(exit) + +L(align_vec_size_start): + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + kmovd %k4, %edx + add $(VEC_SIZE * 4), %rax + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 4), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 
3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 + add $(VEC_SIZE * 5), %rax + kmovd %k4, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 + add $VEC_SIZE, %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 + add $VEC_SIZE, %rax + kmovd %k1, %edx + test %edx, %edx + jnz L(exit) + + add $VEC_SIZE, %rax + + .p2align 4 +L(align_four_vec_loop): + VMOVA (%rax), %YMM0 + VMOVA (VEC_SIZE * 2)(%rax), %YMM1 + vpminub VEC_SIZE(%rax), %YMM0, %YMM0 + vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 + vpminub %YMM0, %YMM1, %YMM0 + /* If K0 != 0, there is a null byte. */ + vpcmpb $0, %YMM0, %YMMZERO, %k0 + add $(VEC_SIZE * 4), %rax + ktestd %k0, %k0 + jz L(align_four_vec_loop) + + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 + sub $(VEC_SIZE * 5), %rax + kmovd %k0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 + kmovd %k2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 + kmovd %k3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_null_on_first_vector): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_second_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $VEC_SIZE, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_third_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 2), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fourth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 3), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fifth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-evex.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000..81f20d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 71b1a1d..352074e 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ 
b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -38,9 +38,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx /* Broadcast CHAR to YMM0. */ @@ -93,8 +97,8 @@ L(cros_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -190,8 +194,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -205,8 +208,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -220,8 +222,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -247,8 +248,7 @@ L(first_vec_x3): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S new file mode 100644 index 0000000..ddc86a7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -0,0 +1,335 @@ +/* strchr/strchrnul optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCHR +# define STRCHR __strchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define VPMINU vpminud +# define CHAR_REG esi +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define VPMINU vpminub +# define CHAR_REG sil +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 +# define YMM8 ymm25 + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits +ENTRY (STRCHR) + movl %edi, %ecx +# ifndef USE_AS_STRCHRNUL + xorl %edx, %edx +# endif + + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we cross page boundary with one vector load. */ + andl $(PAGE_SIZE - 1), %ecx + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ + VMOVU (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. 
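+	   (Per element x with search character c, the two instructions
+	    below compute
+	        t = x ^ c;                0 iff x == c
+	        t = min_unsigned (t, x);  0 iff x == c or x == 0
+	    so one compare of t against zero finds both CHAR and the
+	    terminating null.)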
*/ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(more_vecs) + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(more_vecs): + /* Align data for aligned loads in the loop. */ + andq $-VEC_SIZE, %rdi +L(aligned_more): + + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + VMOVA VEC_SIZE(%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VMOVA VEC_SIZE(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + ktestd %k0, %k0 + jz L(prep_loop_4x) + + kmovd %k0, %eax + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq VEC_SIZE(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + /* Found CHAR or the null byte. */ +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +L(prep_loop_4x): + /* Align data to 4 * VEC_SIZE. */ + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. 
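+	   (Each of the four vectors gets the same xor/min treatment as
+	    above, and the per-vector results are folded with three more
+	    unsigned mins, so a single compare and branch decide whether the
+	    whole 4-VEC block contains CHAR or a null byte.)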
*/ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM5 + vpxorq %YMM2, %YMM0, %YMM6 + vpxorq %YMM3, %YMM0, %YMM7 + vpxorq %YMM4, %YMM0, %YMM8 + + VPMINU %YMM5, %YMM1, %YMM5 + VPMINU %YMM6, %YMM2, %YMM6 + VPMINU %YMM7, %YMM3, %YMM7 + VPMINU %YMM8, %YMM4, %YMM8 + + VPMINU %YMM5, %YMM6, %YMM1 + VPMINU %YMM7, %YMM8, %YMM2 + + VPMINU %YMM1, %YMM2, %YMM1 + + /* Each bit in K0 represents a CHAR or a null byte. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + + addq $(VEC_SIZE * 4), %rdi + + ktestd %k0, %k0 + jz L(loop_4x_vec) + + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM5, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ + VPCMP $0, %YMMZERO, %YMM6, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ + VPCMP $0, %YMMZERO, %YMM7, %k2 + /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ + VPCMP $0, %YMMZERO, %YMM8, %k3 + +# ifdef USE_AS_WCSCHR + /* NB: Each bit in K2/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k1 +# else + kshiftlq $32, %k3, %k1 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rax + + tzcntq %rax, %rax +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +# else + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 +L(cross_page_boundary): + andq $-VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + + VMOVA (%rdi), %YMM1 + + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + +# ifdef USE_AS_WCSCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax + + jz L(aligned_more) + tzcntl %eax, %eax + addq %rcx, %rdi +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + leaq (%rdi, %rax, 4), %rax +# else + addq %rdi, %rax +# endif +# ifndef USE_AS_STRCHRNUL + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + ret + +END (STRCHR) +# endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index 5bc6eb3..bcecec9 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,16 +29,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) return OPTIMIZE (sse2_no_bsf); diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000..cdcf818 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S new file mode 100644 index 0000000..064fe7c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_evex +#define USE_AS_STRCHRNUL 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000..aecd30d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index c9f2464..f199c91 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. 
*/ @@ -83,6 +87,16 @@ ENTRY (STRCMP) je L(char0) jb L(zero) # ifdef USE_AS_WCSCMP +# ifndef __ILP32__ + movq %rdx, %rcx + /* Check if length could overflow when multiplied by + sizeof(wchar_t). Checking top 8 bits will cover all potential + overflow cases as well as redirect cases where its impossible to + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx + jnz __wcscmp_avx2 +# endif /* Convert units: from wide to byte char. */ shl $2, %RDX_LP # endif @@ -127,8 +141,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -161,8 +175,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -195,8 +208,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -229,8 +241,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -356,8 +367,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -400,8 +410,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -444,8 +453,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -486,8 +494,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -556,8 +563,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -631,8 +637,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -674,8 +679,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -685,16 +689,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -708,8 +710,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -734,8 +735,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -856,7 +856,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S new file mode 100644 index 0000000..459eeed --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -0,0 +1,1043 @@ +/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCMP +# define STRCMP __strcmp_evex +# endif + +# define PAGE_SIZE 4096 + +/* VEC_SIZE = Number of bytes in a ymm register */ +# define VEC_SIZE 32 + +/* Shift for dividing by (VEC_SIZE * 4). */ +# define DIVIDE_BY_VEC_4_SHIFT 7 +# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSCMP +/* Compare packed dwords. */ +# define VPCMP vpcmpd +# define SHIFT_REG32 r8d +# define SHIFT_REG64 r8 +/* 1 dword char == 4 bytes. */ +# define SIZE_OF_CHAR 4 +# else +/* Compare packed bytes. */ +# define VPCMP vpcmpb +# define SHIFT_REG32 ecx +# define SHIFT_REG64 rcx +/* 1 byte char == 1 byte. */ +# define SIZE_OF_CHAR 1 +# endif + +# define XMMZERO xmm16 +# define XMM0 xmm17 +# define XMM1 xmm18 + +# define YMMZERO ymm16 +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 + +/* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +*/ + +/* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on + USE_AS_WCSCMP. In order to check the null char, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd + instructions. Main loop (away from from page boundary) compares 4 + vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128 + bytes) on each loop. + + The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic + is the same as strcmp, except that an a maximum offset is tracked. If + the maximum offset is reached before a difference is found, zero is + returned. */ + + .section .text.evex,"ax",@progbits +ENTRY (STRCMP) +# ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ + cmp $1, %RDX_LP + je L(char0) + jb L(zero) +# ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ + shl $2, %RDX_LP +# endif + /* Register %r11 tracks the maximum offset. */ + mov %RDX_LP, %R11_LP +# endif + movl %edi, %eax + xorl %edx, %edx + /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 + VMOVU (%rsi), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + + /* Check for NULL in YMM0. */ + VPCMP $0, %YMMZERO, %YMM0, %k1 + /* Check for NULL in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + + /* Each bit in K1 represents: + 1. 
A mismatch in YMM0 and YMM1. Or + 2. A NULL in YMM0 or YMM1. + */ + kord %k0, %k1, %k1 + + ktestd %k1, %k1 + je L(next_3_vectors) + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx) is after the maximum + offset (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + je L(return) +L(wcscmp_return): + setl %al + negl %eax + orl $1, %eax +L(return): +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + .p2align 4 +L(return_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after + the maximum offset (%r11). */ + addq $VEC_SIZE, %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rdx), %ecx + cmpl VEC_SIZE(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rdi, %rdx), %eax + movzbl VEC_SIZE(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_2_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 2), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(return_3_vec_size): + kmovd %k1, %ecx + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 3), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 + VMOVU VEC_SIZE(%rsi), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. 
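
Per 32-byte block, the three VPCMPs plus two KORDs above answer a single question: what is the first position that mismatches or holds a null in either operand? A scalar sketch of that reduction (illustrative helper, not the library routine):

#include <stddef.h>

/* Index of the first mismatch-or-null in a block of N elements, or N if
   the block is clean; the asm obtains the same index with tzcnt on the
   combined k-mask.  Testing s1[i] == '\0' alone is enough: if only s2[i]
   were null, the mismatch test already fires.  */
static size_t
first_event (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (s1[i] != s2[i] || s1[i] == '\0')
      return i;
  return n;
}
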
*/ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 + VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 + VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 + + /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ + VPCMP $4, %YMM2, %YMM4, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_2_vec_size) + + /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ + VPCMP $4, %YMM3, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM3, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + ktestd %k1, %k1 + jne L(return_3_vec_size) +L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx + movl $PAGE_SIZE, %ecx + /* Align load via RAX. */ + andq $-(VEC_SIZE * 4), %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax +# ifdef USE_AS_STRNCMP + /* Starting from this point, the maximum offset, or simply the + 'offset', DECREASES by the same amount when base pointers are + moved forward. Return 0 when: + 1) On match: offset <= the matched vector index. + 2) On mistmach, offset is before the mistmatched index. + */ + subq %rdx, %r11 + jbe L(zero) +# endif + addq %rsi, %rdx + movq %rdx, %rsi + andl $(PAGE_SIZE - 1), %esi + /* Number of bytes before page crossing. */ + subq %rsi, %rcx + /* Number of VEC_SIZE * 4 blocks before page crossing. */ + shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx + /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ + movl %ecx, %esi + jmp L(loop_start) + + .p2align 4 +L(loop): +# ifdef USE_AS_STRNCMP + /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease + the maximum offset (%r11) by the same amount. */ + subq $(VEC_SIZE * 4), %r11 + jbe L(zero) +# endif + addq $(VEC_SIZE * 4), %rax + addq $(VEC_SIZE * 4), %rdx +L(loop_start): + testl %esi, %esi + leal -1(%esi), %esi + je L(loop_cross_page) +L(back_to_loop): + /* Main loop, comparing 4 vectors are a time. */ + VMOVA (%rax), %YMM0 + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 + VMOVU (%rdx), %YMM1 + VMOVU VEC_SIZE(%rdx), %YMM3 + VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 + VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 + + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + kord %k1, %k2, %k1 + /* Each bit in K4 represents a NULL or a mismatch in YMM0 and + YMM1. */ + kord %k0, %k1, %k4 + + VPCMP $4, %YMM2, %YMM3, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM3, %k2 + kord %k1, %k2, %k1 + /* Each bit in K5 represents a NULL or a mismatch in YMM2 and + YMM3. */ + kord %k0, %k1, %k5 + + VPCMP $4, %YMM4, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM4, %k1 + VPCMP $0, %YMMZERO, %YMM5, %k2 + kord %k1, %k2, %k1 + /* Each bit in K6 represents a NULL or a mismatch in YMM4 and + YMM5. */ + kord %k0, %k1, %k6 + + VPCMP $4, %YMM6, %YMM7, %k0 + VPCMP $0, %YMMZERO, %YMM6, %k1 + VPCMP $0, %YMMZERO, %YMM7, %k2 + kord %k1, %k2, %k1 + /* Each bit in K7 represents a NULL or a mismatch in YMM6 and + YMM7. 
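
The main_loop_header above bounds how many 4*VEC_SIZE iterations can run before the unaligned operand would cross a page, so the page-cross handling executes at most once per page. A sketch of the count it computes (helper name invented; constants as used in this file, with DIVIDE_BY_VEC_4_SHIFT == 7 doing the division):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Whole 4*VEC_SIZE (128-byte) blocks between S2 and the next page
   boundary.  */
static inline unsigned int
blocks_before_page_cross (uintptr_t s2)
{
  return (unsigned int) ((PAGE_SIZE - (s2 & (PAGE_SIZE - 1))) >> 7);
}
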
*/ + kord %k0, %k1, %k7 + + kord %k4, %k5, %k0 + kord %k6, %k7, %k1 + + /* Test each mask (32 bits) individually because for VEC_SIZE + == 32 is not possible to OR the four masks and keep all bits + in a 64-bit integer register, differing from SSE2 strcmp + where ORing is possible. */ + kortestd %k0, %k1 + je L(loop) + ktestd %k4, %k4 + je L(test_vec) + kmovd %k4, %edi + tzcntl %edi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_vec): +# ifdef USE_AS_STRNCMP + /* The first vector matched. Return 0 if the maximum offset + (%r11) <= VEC_SIZE. */ + cmpq $VEC_SIZE, %r11 + jbe L(zero) +# endif + ktestd %k5, %k5 + je L(test_2_vec) + kmovd %k5, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $VEC_SIZE, %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl VEC_SIZE(%rsi, %rdi), %ecx + cmpl VEC_SIZE(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rax, %rdi), %eax + movzbl VEC_SIZE(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_2_vec): +# ifdef USE_AS_STRNCMP + /* The first 2 vectors matched. Return 0 if the maximum offset + (%r11) <= 2 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) +# endif + ktestd %k6, %k6 + je L(test_3_vec) + kmovd %k6, %ecx + tzcntl %ecx, %edi +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %edi +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx + cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(test_3_vec): +# ifdef USE_AS_STRNCMP + /* The first 3 vectors matched. Return 0 if the maximum offset + (%r11) <= 3 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) +# endif + kmovd %k7, %esi + tzcntl %esi, %ecx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 3), %rcx + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %esi + cmpl (%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rsi, %rcx), %esi + cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page): + xorl %r10d, %r10d + movq %rdx, %rcx + /* Align load via RDX. We load the extra ECX bytes which should + be ignored. */ + andl $((VEC_SIZE * 4) - 1), %ecx + /* R10 is -RCX. */ + subq %rcx, %r10 + + /* This works only if VEC_SIZE * 2 == 64. */ +# if (VEC_SIZE * 2) != 64 +# error (VEC_SIZE * 2) != 64 +# endif + + /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ + cmpl $(VEC_SIZE * 2), %ecx + jge L(loop_cross_page_2_vec) + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 + VMOVU (%rdx, %r10), %YMM4 + VMOVU VEC_SIZE(%rdx, %r10), %YMM5 + + VPCMP $4, %YMM4, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM2, %k1 + VPCMP $0, %YMMZERO, %YMM4, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM2 and + YMM4. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM5, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM3, %k4 + VPCMP $0, %YMMZERO, %YMM5, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM3 and + YMM5. */ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi + testq %rdi, %rdi + je L(loop_cross_page_2_vec) + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + + .p2align 4 +L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 + VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 + VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 + + VPCMP $4, %YMM0, %YMM2, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM2, %k2 + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch in YMM0 and + YMM2. */ + kord %k0, %k1, %k1 + + VPCMP $4, %YMM1, %YMM3, %k3 + VPCMP $0, %YMMZERO, %YMM1, %k4 + VPCMP $0, %YMMZERO, %YMM3, %k5 + kord %k4, %k5, %k4 + /* Each bit in K3 represents a NULL or a mismatch in YMM1 and + YMM3. 
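
For byte elements the masks of two adjacent vectors are merged by shifting the second one left by 32 (kshiftlq) and OR-ing (korq), so one tzcnt locates the first event across 64 bytes; the wide-char variant shifts by 8 instead because each mask bit covers a dword. A sketch of the byte case (illustrative helper only):

#include <stdint.h>

/* kshiftlq $32, %k3, %k2; korq %k1, %k2, %k1; kmovq; tzcntq.  Callers
   only reach this once at least one bit is known to be set.  */
static inline unsigned int
first_event_in_two_vecs (uint32_t mask_lo, uint32_t mask_hi)
{
  uint64_t m = ((uint64_t) mask_hi << 32) | mask_lo;
  return (unsigned int) __builtin_ctzll (m);
}
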
*/ + kord %k3, %k4, %k3 + +# ifdef USE_AS_WCSCMP + /* NB: Each bit in K1/K3 represents 4-byte element. */ + kshiftlw $8, %k3, %k2 +# else + kshiftlq $32, %k3, %k2 +# endif + + /* Each bit in K1 represents a NULL or a mismatch. */ + korq %k1, %k2, %k1 + kmovq %k1, %rdi + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ + subl $(VEC_SIZE * 2), %ecx + jle 1f + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d +# ifdef USE_AS_WCSCMP + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + sarl $2, %ecx +# endif + /* Skip ECX bytes. */ + shrq %cl, %rdi +1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ + movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + + testq %rdi, %rdi +# ifdef USE_AS_STRNCMP + /* At this point, if %rdi value is 0, it already tested + VEC_SIZE*4+%r10 byte starting from %rax. This label + checks whether strncmp maximum offset reached or not. */ + je L(string_nbyte_offset_check) +# else + je L(back_to_loop) +# endif + tzcntq %rdi, %rcx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +# endif + addq %r10, %rcx + /* Adjust for number of bytes skipped. */ + addq %r8, %rcx +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rcx + subq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rcx), %edi + cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + ret + +# ifdef USE_AS_STRNCMP +L(string_nbyte_offset_check): + leaq (VEC_SIZE * 4)(%r10), %r10 + cmpq %r10, %r11 + jbe L(zero) + jmp L(back_to_loop) +# endif + + .p2align 4 +L(cross_page_loop): + /* Check one byte/dword at a time. */ +# ifdef USE_AS_WCSCMP + cmpl %ecx, %eax +# else + subl %ecx, %eax +# endif + jne L(different) + addl $SIZE_OF_CHAR, %edx + cmpl $(VEC_SIZE * 4), %edx + je L(main_loop_header) +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + /* Check null char. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED + comparisons. */ + subl %ecx, %eax +# ifndef USE_AS_WCSCMP +L(different): +# endif + ret + +# ifdef USE_AS_WCSCMP + .p2align 4 +L(different): + /* Use movl to avoid modifying EFLAGS. */ + movl $0, %eax + setl %al + negl %eax + orl $1, %eax + ret +# endif + +# ifdef USE_AS_STRNCMP + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(char0): +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi), %ecx + cmpl (%rsi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax +# endif + ret +# endif + + .p2align 4 +L(last_vector): + addq %rdx, %rdi + addq %rdx, %rsi +# ifdef USE_AS_STRNCMP + subq %rdx, %r11 +# endif + tzcntl %ecx, %edx +# ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
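
The wide-char L(different) path above only has to produce the sign of a signed dword comparison, not the actual difference; movl $0 (rather than xor) keeps the flags from the earlier cmpl alive for setl, and setl/negl/orl then map "less" to -1 and the remaining (greater) case to +1. Scalar equivalent for illustration (helper name invented):

#include <stdint.h>

static inline int
wcscmp_sign (int32_t a, int32_t b)
{
  int r = (a < b);   /* setl %al                          */
  r = -r;            /* negl: 0 or -1                     */
  r |= 1;            /* orl $1: -1 stays -1, 0 becomes +1 */
  return r;
}
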
*/ + sall $2, %edx +# endif +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + ret + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider + ymm vector if possible, if not, with xmm. If fetching 16 bytes + (xmm) still passes the boundary, byte comparison must be done. + */ + .p2align 4 +L(cross_page): + /* Try one ymm vector at a time. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_1_vector) +L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $VEC_SIZE, %edx + + addl $VEC_SIZE, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jle L(loop_1_vector) +L(cross_page_1_vector): + /* Less than 32 bytes to check, try one xmm vector. */ + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 + VMOVU (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + korw %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + korw %k0, %k1, %k1 + kmovw %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $16, %edx +# ifndef USE_AS_WCSCMP + addl $16, %eax +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_1_xmm): +# ifndef USE_AS_WCSCMP + /* Less than 16 bytes to check, try 8 byte vector. NB: No need + for wcscmp nor wcsncmp since wide char is 4 bytes. */ + cmpl $(PAGE_SIZE - 8), %eax + jg L(cross_page_8bytes) + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only last 2 bits are valid. */ + andl $0x3, %ecx +# else + /* Only last 8 bits are valid. */ + andl $0xff, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx + addl $8, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_8bytes): + /* Less than 8 bytes to check, try 4 byte vector. */ + cmpl $(PAGE_SIZE - 4), %eax + jg L(cross_page_4bytes) + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + + /* Each bit in K0 represents a mismatch in XMM0 and XMM1. 
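
The cross_page block above steps down through progressively narrower accesses so no load reads past the current page: a full ymm while 32 bytes remain in the page, then xmm, then 8- and 4-byte loads, and finally the per-element loop. A C sketch of the size choice (helper invented; the asm actually uses the OR of both operands' page offsets as its conservative bound):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

static inline size_t
widest_safe_access (uintptr_t combined_page_offset)
{
  size_t left = PAGE_SIZE - (combined_page_offset & (PAGE_SIZE - 1));
  if (left >= 32) return 32;   /* one ymm vector      */
  if (left >= 16) return 16;   /* one xmm vector      */
  if (left >= 8)  return 8;
  if (left >= 4)  return 4;
  return 1;                    /* byte/dword stepping */
}
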
*/ + VPCMP $4, %XMM0, %XMM1, %k0 + VPCMP $0, %XMMZERO, %XMM0, %k1 + VPCMP $0, %XMMZERO, %XMM1, %k2 + /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + +# ifdef USE_AS_WCSCMP + /* Only the last bit is valid. */ + andl $0x1, %ecx +# else + /* Only last 4 bits are valid. */ + andl $0xf, %ecx +# endif + + testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_4bytes): +# endif + /* Less than 4 bytes to check, try one byte/dword at a time. */ +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax + ret +END (STRCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index ed31970..41d670b 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000..c2c581e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index e72a0ca..19f4d5f 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -37,6 +37,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* zero register */ #define xmmZ xmm0 #define ymmZ ymm0 @@ -46,7 +50,7 @@ # ifndef USE_AS_STRCAT - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCPY) # 
ifdef USE_AS_STRNCPY mov %RDX_LP, %R8_LP @@ -369,8 +373,8 @@ L(CopyVecSizeExit): lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(CopyTwoVecSize1): @@ -553,8 +557,7 @@ L(Exit1): lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit2): @@ -569,8 +572,7 @@ L(Exit2): lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit3): @@ -584,8 +586,7 @@ L(Exit3): lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit4_7): @@ -602,8 +603,7 @@ L(Exit4_7): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit8_15): @@ -620,8 +620,7 @@ L(Exit8_15): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit16_31): @@ -638,8 +637,7 @@ L(Exit16_31): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit32_63): @@ -656,8 +654,7 @@ L(Exit32_63): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCPY @@ -671,8 +668,7 @@ L(StrncpyExit1): # ifdef USE_AS_STRCAT movb $0, 1(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit2): @@ -684,8 +680,7 @@ L(StrncpyExit2): # ifdef USE_AS_STRCAT movb $0, 2(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit3_4): @@ -699,8 +694,7 @@ L(StrncpyExit3_4): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit5_8): @@ -714,8 +708,7 @@ L(StrncpyExit5_8): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit9_16): @@ -729,8 +722,7 @@ L(StrncpyExit9_16): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit17_32): @@ -744,8 +736,7 @@ L(StrncpyExit17_32): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit33_64): @@ -760,8 +751,7 @@ L(StrncpyExit33_64): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit65): @@ -778,50 +768,43 @@ L(StrncpyExit65): # ifdef USE_AS_STRCAT movb $0, 65(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifndef USE_AS_STRCAT .p2align 4 L(Fill1): mov %dl, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill2): mov %dx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill3_4): mov %dx, (%rdi) mov %dx, -2(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill5_8): mov %edx, (%rdi) mov %edx, -4(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill9_16): mov %rdx, (%rdi) mov %rdx, -8(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill17_32): vmovdqu %xmmZ, (%rdi) vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(CopyVecSizeUnalignedVec2): @@ -898,8 +881,7 @@ L(Fill): cmp $1, %r8d ja L(Fill2) je L(Fill1) - VZEROUPPER - ret + VZEROUPPER_RETURN /* end of ifndef USE_AS_STRCAT */ # endif @@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): # ifdef USE_AS_STRCAT movb $0, (VEC_SIZE * 4)(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(UnalignedFourVecSizeLeaveCase2): @@ 
-1001,16 +982,14 @@ L(StrncpyExit): # ifdef USE_AS_STRCAT movb $0, (%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(ExitZero): # ifndef USE_AS_STRCAT mov %rdi, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S new file mode 100644 index 0000000..a343a1a --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -0,0 +1,1003 @@ +/* strcpy with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_evex +# endif + +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +/* Number of bytes in a vector register */ +# ifndef VEC_SIZE +# define VEC_SIZE 32 +# endif + +# define XMM2 xmm18 +# define XMM3 xmm19 + +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 +# define YMM7 ymm23 + +# ifndef USE_AS_STRCAT + +/* zero register */ +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 + + .section .text.evex,"ax",@progbits +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# endif + + and $((VEC_SIZE * 4) - 1), %ecx + cmp $(VEC_SIZE * 2), %ecx + jbe L(SourceStringAlignmentLessTwoVecSize) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + + vpcmpb $0, (%rsi), %YMMZERO, %k0 + kmovd %k0, %edx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $VEC_SIZE, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $(VEC_SIZE + 1), %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyVecSizeTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail) + + vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 + kmovd %k1, %edx + +# ifdef USE_AS_STRNCPY + add $VEC_SIZE, %r10 + cmp %r10, %r8 + jbe L(CopyTwoVecSizeCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize) + + VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ + VMOVU %YMM2, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(UnalignVecSizeBoth): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $VEC_SIZE, %rcx + VMOVA (%rsi, %rcx), %YMM2 + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 3), %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# 
else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 + vpcmpb $0, %YMM4, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM4, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM2, (%rdi, %rcx) + VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 + VMOVU %YMM2, (%rdi, %rcx) + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + VMOVU %YMM3, (%rdi, %rcx) + mov %rsi, %rdx + lea VEC_SIZE(%rsi, %rcx), %rsi + and $-(VEC_SIZE * 4), %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea (VEC_SIZE * 8)(%r8, %rdx), %r8 +# endif +L(UnalignedFourVecSizeLoop): + VMOVA (%rsi), %YMM4 + VMOVA VEC_SIZE(%rsi), %YMM5 + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM5, %YMM4, %YMM2 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. */ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(UnalignedFourVecSizeLeave) + +L(UnalignedFourVecSizeLoop_start): + add $(VEC_SIZE * 4), %rdi + add $(VEC_SIZE * 4), %rsi + VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) + VMOVA (%rsi), %YMM4 + VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) + VMOVA VEC_SIZE(%rsi), %YMM5 + vpminub %YMM5, %YMM4, %YMM2 + VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) + VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 + VMOVU %YMM7, -VEC_SIZE(%rdi) + VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 + vpminub %YMM7, %YMM6, %YMM3 + vpminub %YMM2, %YMM3, %YMM2 + /* If K7 != 0, there is a null byte. 
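
The 4-vector copy loop above tests for a terminating null anywhere in 4*VEC_SIZE bytes with a chain of vpminub reductions followed by a single compare against the zero vector: the unsigned minimum of all bytes is zero iff some byte is zero. Scalar sketch (illustrative helper only):

#include <stdint.h>
#include <stddef.h>

static inline int
has_zero_byte (const uint8_t *p, size_t len)   /* len == 4 * VEC_SIZE */
{
  uint8_t m = 0xff;
  for (size_t i = 0; i < len; i++)
    m = p[i] < m ? p[i] : m;   /* the vpminub reduction tree */
  return m == 0;               /* vpcmpb $0 against YMMZERO  */
}
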
*/ + vpcmpb $0, %YMM2, %YMMZERO, %k7 + kmovd %k7, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(UnalignedFourVecSizeLoop_start) + +L(UnalignedFourVecSizeLeave): + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_0) + + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %ecx + test %ecx, %ecx + jnz L(CopyVecSizeUnaligned_16) + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_32) + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %ecx + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 3)(%rdi, %rdx), %rax +# endif + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 3), %rsi + add $(VEC_SIZE * 3), %rdi + jmp L(CopyVecSizeExit) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLessTwoVecSize): + VMOVU (%rsi), %YMM3 + VMOVU VEC_SIZE(%rsi), %YMM2 + vpcmpb $0, %YMM3, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $VEC_SIZE, %r8 +# else + cmp $(VEC_SIZE + 1), %r8 +# endif + jbe L(CopyVecSizeTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail1) + + VMOVU %YMM3, (%rdi) + vpcmpb $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $(VEC_SIZE * 2), %r8 +# else + cmp $((VEC_SIZE * 2) + 1), %r8 +# endif + jbe L(CopyTwoVecSize1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize1) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + jmp L(UnalignVecSizeBoth) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyVecSize): + add %rcx, %rdi +# endif +L(CopyVecSizeTail): + add %rcx, %rsi +L(CopyVecSizeTail1): + bsf %edx, %edx +L(CopyVecSizeExit): + cmp $32, %edx + jae L(Exit32_63) + cmp $16, %edx + jae L(Exit16_31) + cmp $8, %edx + jae L(Exit8_15) + cmp $4, %edx + jae L(Exit4_7) + cmp $3, %edx + je L(Exit3) + cmp $1, %edx + ja L(Exit2) + je L(Exit1) + movb $0, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(CopyTwoVecSize1): + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $VEC_SIZE, %r8 +# endif + jmp L(CopyVecSizeTail1) + + .p2align 4 +L(CopyTwoVecSize): + bsf %edx, %edx + add %rcx, %rsi + add $VEC_SIZE, %edx + sub %ecx, %edx + jmp L(CopyVecSizeExit) + + .p2align 4 +L(CopyVecSizeUnaligned_0): + bsf %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + VMOVU %YMM4, (%rdi) + add $((VEC_SIZE * 4) - 1), %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_16): + bsf %ecx, %edx + VMOVU %YMM4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea VEC_SIZE(%rdi, %rdx), %rax +# endif + VMOVU %YMM5, VEC_SIZE(%rdi) + 
add $((VEC_SIZE * 3) - 1), %r8 + sub %rdx, %r8 + lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_32): + bsf %edx, %edx + VMOVU %YMM4, (%rdi) + VMOVU %YMM5, VEC_SIZE(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 2)(%rdi, %rdx), %rax +# endif + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + add $((VEC_SIZE * 2) - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 2), %rsi + add $(VEC_SIZE * 2), %rdi + jmp L(CopyVecSizeExit) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyVecSizeUnalignedVec6): + VMOVU %YMM6, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec5): + VMOVU %YMM5, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec4): + VMOVU %YMM4, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec3): + VMOVU %YMM3, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) +# endif + +/* Case2 */ + + .p2align 4 +L(CopyVecSizeCase2): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2): + add %rcx, %rsi + bsf %edx, %edx + add $VEC_SIZE, %edx + sub %ecx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTailCase2): + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTail1Case2): + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeCase2) +L(CopyVecSizeCase3): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyTwoVecSizeCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyVecSizeTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTailCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSize1Case2OrCase3): + add $VEC_SIZE, %rdi + add $VEC_SIZE, %rsi + sub $VEC_SIZE, %r8 +L(CopyVecSizeTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTail1Case2) + jmp L(StrncpyExit) +# endif + +/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ + + .p2align 4 +L(Exit1): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + movzwl (%rsi), %ecx + mov %cx, (%rdi) + movb $0, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4_7): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov -3(%rsi, %rdx), %ecx + mov %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8_15): + mov (%rsi), %rcx + mov -7(%rsi, %rdx), %r9 + mov %rcx, (%rdi) + mov %r9, -7(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16_31): + VMOVU (%rsi), %XMM2 + VMOVU -15(%rsi, %rdx), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -15(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32_63): + VMOVU (%rsi), %YMM2 + VMOVU -31(%rsi, %rdx), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -31(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movzbl (%rsi), %edx + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3_4): + movzwl (%rsi), %ecx + movzwl -2(%rsi, %r8), %edx + mov %cx, (%rdi) + mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit5_8): + mov (%rsi), %ecx + mov -4(%rsi, %r8), %edx + mov %ecx, (%rdi) + mov %edx, -4(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit9_16): + mov (%rsi), %rcx + mov -8(%rsi, %r8), %rdx + mov %rcx, (%rdi) + mov %rdx, -8(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit17_32): + VMOVU (%rsi), %XMM2 + VMOVU -16(%rsi, %r8), %XMM3 + VMOVU %XMM2, (%rdi) + VMOVU %XMM3, -16(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit33_64): + /* 0/32, 31/16 */ + VMOVU (%rsi), %YMM2 + VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + ret + + .p2align 4 +L(StrncpyExit65): + /* 0/32, 32/32, 64/1 */ + VMOVU (%rsi), %YMM2 + VMOVU 32(%rsi), %YMM3 + mov 64(%rsi), %cl + VMOVU %YMM2, (%rdi) + VMOVU %YMM3, 32(%rdi) + mov %cl, 64(%rdi) +# ifdef USE_AS_STPCPY + lea 65(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 65(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) + ret + + .p2align 4 +L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) + ret + + .p2align 4 +L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) + ret + + .p2align 4 +L(Fill17_32): + 
VMOVU %XMMZERO, (%rdi) + VMOVU %XMMZERO, -16(%rdi, %r8) + ret + + .p2align 4 +L(CopyVecSizeUnalignedVec2): + VMOVU %YMM2, (%rdi, %rcx) + + .p2align 4 +L(CopyVecSizeVecExit): + bsf %edx, %edx + add $(VEC_SIZE - 1), %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + xor %edx, %edx + sub $VEC_SIZE, %r8 + jbe L(StrncpyFillExit) + + VMOVU %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + + mov %rdi, %rsi + and $(VEC_SIZE - 1), %esi + sub %rsi, %rdi + add %rsi, %r8 + sub $(VEC_SIZE * 4), %r8 + jb L(StrncpyFillLessFourVecSize) + +L(StrncpyFillLoopVmovdqa): + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE * 4), %rdi + sub $(VEC_SIZE * 4), %r8 + jae L(StrncpyFillLoopVmovdqa) + +L(StrncpyFillLessFourVecSize): + add $(VEC_SIZE * 2), %r8 + jl L(StrncpyFillLessTwoVecSize) + VMOVA %YMMZERO, (%rdi) + VMOVA %YMMZERO, VEC_SIZE(%rdi) + add $(VEC_SIZE * 2), %rdi + sub $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillLessTwoVecSize): + add $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + VMOVA %YMMZERO, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillExit): + add $VEC_SIZE, %r8 +L(Fill): + cmp $17, %r8d + jae L(Fill17_32) + cmp $9, %r8d + jae L(Fill9_16) + cmp $5, %r8d + jae L(Fill5_8) + cmp $3, %r8d + jae L(Fill3_4) + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) + ret + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(UnalignedFourVecSizeLeaveCase2) +L(UnalignedFourVecSizeLeaveCase3): + lea (VEC_SIZE * 4)(%r8), %rcx + and $-VEC_SIZE, %rcx + add $(VEC_SIZE * 3), %r8 + jl L(CopyVecSizeCase3) + VMOVU %YMM4, (%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM5, VEC_SIZE(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 4)(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) +# endif + ret + + .p2align 4 +L(UnalignedFourVecSizeLeaveCase2): + xor %ecx, %ecx + vpcmpb $0, %YMM4, %YMMZERO, %k1 + kmovd %k1, %edx + add $(VEC_SIZE * 3), %r8 + jle L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + vpcmpb $0, %YMM5, %YMMZERO, %k2 + kmovd %k2, %edx + VMOVU %YMM4, (%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec5) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM6, %YMMZERO, %k3 + kmovd %k3, %edx + VMOVU %YMM5, VEC_SIZE(%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec6) +# else + jnz L(CopyVecSize) +# endif + + vpcmpb $0, %YMM7, %YMMZERO, %k4 + kmovd %k4, %edx + VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) + lea VEC_SIZE(%rdi, %rcx), %rdi + lea VEC_SIZE(%rsi, %rcx), %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) +L(StrncpyExit): + cmp $65, %r8d + je L(StrncpyExit65) + cmp $33, %r8d + jae L(StrncpyExit33_64) + cmp $17, %r8d + jae L(StrncpyExit17_32) + cmp $9, %r8d + jae L(StrncpyExit9_16) + cmp $5, %r8d + jae L(StrncpyExit5_8) + cmp $3, %r8d + jae 
L(StrncpyExit3_4) + cmp $1, %r8d + ja L(StrncpyExit2) + je L(StrncpyExit1) +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi) +# endif + ret + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000..75b4b76 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 3e7f14a..f5d058a 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -27,370 +27,475 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd # define VPMINU vpminud +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb # define VPMINU vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP # ifdef USE_AS_WCSLEN shl $2, %RSI_LP # elif defined __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif - mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx + movl %edi, %eax movq %rdi, %rdx vpxor %xmm0, %xmm0, %xmm0 - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < VEC_SIZE handle special. */ + cmpq $VEC_SIZE, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + /* If empty continue to aligned_more. Otherwise return bit + position of first match. */ + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. 
*/ + btsq %rsi, %rax + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) +L(first_vec_x1): tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + incl %edi + addl %edi, %eax # endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN - shrq $2, %rax + shrl $2, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax # endif + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 5 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq with -VEC_SIZE but saves 4 bytes of + code on the x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax +# ifdef USE_AS_STRNLEN + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because + it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. 
*/ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Before adjusting length check if at last VEC_SIZE * 4. */ + cmpq $(VEC_SIZE * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* Readjust length. */ addq %rcx, %rsi +# else + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif - + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 - - VPCMPEQ %ymm5, %ymm0, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 + vpmovmskb %ymm5, %ecx -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + subq %rdx, %rdi testl %eax, %eax - jnz L(first_vec_x1) + jnz L(last_vec_return_x0) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. 
*/ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER_RETURN - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax +# ifdef USE_AS_STRNLEN + .p2align 4 +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ + subq $-(VEC_SIZE * 4), %rdi +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +L(last_4x_vec_or_less): - jnz L(first_vec_x3_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER - ret + vpmovmskb %ymm1, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) - .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + /* length may have been negative or positive by an offset of + VEC_SIZE * 4 depending on where this was called from. This fixes + that. */ + andl $(VEC_SIZE * 4 - 1), %esi testl %eax, %eax + jnz L(last_vec_x1_check) - jnz L(first_vec_x0_check) subl $VEC_SIZE, %esi - jle L(max) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER - ret + jb L(max) - .p2align 4 -L(first_vec_x0_check): + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x1_check): +L(last_vec_return_x0): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax + subq $(VEC_SIZE * 4 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2_check): +L(last_vec_return_x1): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax + subq $(VEC_SIZE * 3 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN +# ifdef USE_AS_STRNLEN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x1_check): + tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 L(max): movq %r8, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + subl $(VEC_SIZE * 3), %esi + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. 
*/ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN - .p2align 4 -L(zero): - xorl %eax, %eax - ret -# endif .p2align 4 -L(first_vec_x0): +L(last_vec_x1): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x1): +L(last_vec_x2): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax - addq $VEC_SIZE, %rax + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif - VZEROUPPER - ret +# endif + VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + subl $(VEC_SIZE * 2), %esi + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi + addl $(VEC_SIZE * 2 + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax +# endif + VZEROUPPER_RETURN +L(max_end): + movq %r8, %rax + VZEROUPPER_RETURN # endif - VZEROUPPER - ret + /* Cold case for crossing page with first load. */ .p2align 4 -L(4x_vec_end): - VPCMPEQ %ymm1, %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ %ymm2, %ymm0, %ymm2 - vpmovmskb %ymm2, %eax +L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +# ifdef USE_AS_STRNLEN testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ %ymm3, %ymm0, %ymm3 - vpmovmskb %ymm3, %eax + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx + /* Check length. */ + cmpq %rsi, %rcx + jb L(cross_page_continue) + movq %r8, %rax +# else testl %eax, %eax - jnz L(first_vec_x2) - VPCMPEQ %ymm4, %ymm0, %ymm4 - vpmovmskb %ymm4, %eax -L(first_vec_x3): + jz L(cross_page_continue) tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif +# endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +# ifdef USE_AS_STRNLEN + .p2align 4 +L(cross_page_less_vec): + tzcntl %eax, %eax + cmpq %rax, %rsi + cmovb %esi, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - VZEROUPPER - ret END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S new file mode 100644 index 0000000..0583819 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -0,0 +1,436 @@ +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRLEN +# define STRLEN __strlen_evex +# endif + +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSLEN +# define VPCMP vpcmpd +# define VPMINU vpminud +# define SHIFT_REG r9d +# else +# define VPCMP vpcmpb +# define VPMINU vpminub +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMM1 ymm17 +# define YMM2 ymm18 +# define YMM3 ymm19 +# define YMM4 ymm20 +# define YMM5 ymm21 +# define YMM6 ymm22 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRLEN) +# ifdef USE_AS_STRNLEN + /* Check for zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +# ifdef USE_AS_WCSLEN + shl $2, %RSI_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + mov %RSI_LP, %R8_LP +# endif + movl %edi, %ecx + movq %rdi, %rdx + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + +# ifdef USE_AS_STRNLEN + jnz L(first_vec_x0_check) + /* Adjust length and check the end of data. */ + subq $VEC_SIZE, %rsi + jbe L(max) +# else + jnz L(first_vec_x0) +# endif + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi + + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + jmp L(more_4x_vec) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_WCSLEN + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + + /* Remove the leading bytes. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif +# ifdef USE_AS_STRNLEN + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +# endif + addq %rdi, %rax + addq %rcx, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(aligned_more): +# ifdef USE_AS_STRNLEN + /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" + with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" + to void possible addition overflow. */ + negq %rcx + addq $VEC_SIZE, %rcx + + /* Check the end of data. */ + subq %rcx, %rsi + jbe L(max) +# endif + + addq $VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + +L(more_4x_vec): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + addq $(VEC_SIZE * 4), %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + + /* Align data to 4 * VEC_SIZE. */ + movq %rdi, %rcx + andl $(4 * VEC_SIZE - 1), %ecx + andq $-(4 * VEC_SIZE), %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi +# endif + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VMOVA (%rdi), %YMM1 + VMOVA VEC_SIZE(%rdi), %YMM2 + VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 + + VPMINU %YMM1, %YMM2, %YMM5 + VPMINU %YMM3, %YMM4, %YMM6 + + VPMINU %YMM5, %YMM6, %YMM5 + VPCMP $0, %YMM5, %YMMZERO, %k0 + ktestd %k0, %k0 + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + +# ifndef USE_AS_STRNLEN + jmp L(loop_4x_vec) +# else + subq $(VEC_SIZE * 4), %rsi + ja L(loop_4x_vec) + +L(last_4x_vec_or_less): + /* Less than 4 * VEC and aligned to VEC_SIZE. */ + addl $(VEC_SIZE * 2), %esi + jle L(last_2x_vec) + + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(last_2x_vec): + addl $(VEC_SIZE * 2), %esi + + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x0_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x3_check): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + /* Check the end of data. 
*/ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(max): + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + + .p2align 4 +L(4x_vec_end): + VPCMP $0, %YMM1, %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x0) + VPCMP $0, %YMM2, %YMMZERO, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + VPCMP $0, %YMM3, %YMMZERO, %k2 + kmovd %k2, %eax + testl %eax, %eax + jnz L(first_vec_x2) + VPCMP $0, %YMM4, %YMMZERO, %k3 + kmovd %k3, %eax +L(first_vec_x3): + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +# endif + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + ret + +END (STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S index 6f5bfc7..8d748aa 100644 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -20,4 +20,4 @@ # define strlen __strlen_sse2 #endif -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S new file mode 100644 index 0000000..8f660bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-vec.S @@ -0,0 +1,257 @@ +/* SSE2 version of strlen and SSE4.1 version of wcslen. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text +ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN + shl $2, %RSI_LP +# endif + +/* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. 
*/ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +#endif + +END(strlen) diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000..0dcea18 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S new file mode 100644 index 0000000..8884f02 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_evex +#include "strcat-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000..37d1224 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S new file mode 100644 index 0000000..a1d53e8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-evex.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_evex +#define USE_AS_STRNCMP 1 +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index e7b29de..52f208c 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable) + && CPU_FEATURES_CPU_P (cpu_features, BMI2) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P 
(cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000..79e7083 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S new file mode 100644 index 0000000..40e391f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_evex +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000..04f1626 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S new file mode 100644 index 0000000..722022f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_evex +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000..5def14e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index c66df12..9116472 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S new file mode 100644 index 0000000..f920b5a --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -0,0 +1,265 @@ +/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRRCHR +# define STRRCHR __strrchr_evex +# endif + +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# ifdef USE_AS_WCSRCHR +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define SHIFT_REG r8d +# else +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define SHIFT_REG ecx +# endif + +# define XMMZERO xmm16 +# define YMMZERO ymm16 +# define YMMMATCH ymm17 +# define YMM1 ymm18 + +# define VEC_SIZE 32 + + .section .text.evex,"ax",@progbits +ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + VMOVU (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + + addq $VEC_SIZE, %rdi + + testl %eax, %eax + jnz L(first_vec) + + testl %ecx, %ecx + jnz L(return_null) + + andq $-VEC_SIZE, %rdi + xorl %edx, %edx + jmp L(aligned_loop) + + .p2align 4 +L(first_vec): + /* Check if there is a null byte. */ + testl %ecx, %ecx + jnz L(char_and_nul_in_first_vec) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + jmp L(aligned_loop) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_WCSRCHR + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG + sarl $2, %SHIFT_REG +# endif + + VMOVA (%rdi), %YMM1 + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %edx + kmovd %k1, %eax + + shrxl %SHIFT_REG, %edx, %edx + shrxl %SHIFT_REG, %eax, %eax + addq $VEC_SIZE, %rdi + + /* Check if there is a CHAR. */ + testl %eax, %eax + jnz L(found_char) + + testl %edx, %edx + jnz L(return_null) + + jmp L(aligned_loop) + + .p2align 4 +L(found_char): + testl %edx, %edx + jnz L(char_and_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + leaq (%rdi, %rcx), %rsi + + .p2align 4 +L(aligned_loop): + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + add $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. 
*/ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + VMOVA (%rdi), %YMM1 + addq $VEC_SIZE, %rdi + + /* Each bit in K0 represents a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM1, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k0, %ecx + kmovd %k1, %eax + orl %eax, %ecx + jz L(aligned_loop) + + .p2align 4 +L(char_nor_null): + /* Find a CHAR or a null byte in a loop. */ + testl %eax, %eax + jnz L(match) +L(return_value): + testl %edx, %edx + jz L(return_null) + movl %edx, %eax + movq %rsi, %rdi + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(match): + /* Find a CHAR. Check if there is a null byte. */ + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(find_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + jmp L(aligned_loop) + + .p2align 4 +L(find_nul): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* If there is no CHAR here, return the remembered one. */ + jz L(return_value) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(char_and_nul): + /* Find both a CHAR and a null byte. */ + addq %rcx, %rdi + movl %edx, %ecx +L(char_and_nul_in_first_vec): + /* Mask out any matching bits after the null byte. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* Return null pointer if the null byte comes first. */ + jz L(return_null) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + leaq -VEC_SIZE(%rdi, %rax, 4), %rax +# else + leaq -VEC_SIZE(%rdi, %rax), %rax +# endif + ret + + .p2align 4 +L(return_null): + xorl %eax, %eax + ret + +END (STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000..d49dbbf --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S new file mode 100644 index 0000000..7cb8f1e --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-evex.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_evex +#define USE_AS_WCSCHR 1 +#include "strchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000..d6ca2b8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S new file mode 100644 index 0000000..42e73e5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_evex +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000..35658d7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S new file mode 100644 index 0000000..bdafa83 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-evex.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_evex +#define USE_AS_WCSLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S new file mode 100644 index 0000000..7e62621 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -0,0 +1,4 @@ +#define AS_WCSLEN +#define strlen __wcslen_sse4_1 + +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c index c23ce45..13070fd 100644 --- a/sysdeps/x86_64/multiarch/wcslen.c +++ b/sysdeps/x86_64/multiarch/wcslen.c @@ -24,7 +24,7 @@ # undef __wcslen # define SYMBOL_NAME wcslen -# include "ifunc-avx2.h" +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); weak_alias (__wcslen, wcslen); diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000..4e88c70 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S new file mode 100644 index 0000000..8a8e310 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_evex +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000..7437ebe --- /dev/null +++ 
b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S new file mode 100644 index 0000000..24773bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_evex +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-evex.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index a8cab0c..5fa51fe 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -2,4 +2,4 @@ #define AS_STRNLEN #define strlen __wcsnlen_sse4_1 -#include "../strlen.S" +#include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index 3da1197..f1b6bc8 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -24,27 +24,7 @@ # undef __wcsnlen # define SYMBOL_NAME wcsnlen -# include <init-arch.h> - -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; - -static inline void * -IFUNC_SELECTOR (void) -{ - const struct cpu_features* cpu_features = __get_cpu_features (); - - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) - return OPTIMIZE (avx2); - - if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) - return OPTIMIZE (sse4_1); - - return OPTIMIZE (sse2); -} +# include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); weak_alias (__wcsnlen, wcsnlen); diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000..9bf7608 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S new file mode 100644 index 0000000..c64602f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_evex +#define USE_AS_WCSRCHR 1 +#include "strrchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000..58ed21d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S new file mode 100644 index 0000000..06cd0f9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_evex +#define USE_AS_WMEMCHR 1 + +#include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..31104d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git 
a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S new file mode 100644 index 0000000..4726d74 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_evex_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-evex-movbe.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 9ab357f..ad047d8 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,5 @@ -/* SSE2 version of strlen/wcslen. - Copyright (C) 2012-2019 Free Software Foundation, Inc. +/* SSE2 version of strlen. + Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,243 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#include "multiarch/strlen-vec.S" -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - -.text -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN - shl $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes. */ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. 
*/ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. */ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index 7b64be9..7f5cd1b 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -95,6 +95,28 @@ lose: \ #define R14_LP r14 #define R15_LP r15 +/* Zero upper vector registers and return with xtest. NB: Use VZEROALL + to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ +#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + ret; \ +1: \ + vzeroupper; \ + ret + +/* Zero upper vector registers and return. */ +#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + VZEROUPPER; \ + ret +#endif + +#ifndef VZEROUPPER_RETURN +# define VZEROUPPER_RETURN VZEROUPPER; ret +#endif + #else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ |
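
Editor's note (not part of the merged patch): the sysdep.h hunk above introduces ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST, which the new *-rtm variants (e.g. strlen-avx2-rtm.S earlier in this diff) substitute for the plain VZEROUPPER return. Per the comment in that hunk, executing VZEROUPPER while a hardware transaction is active triggers an RTM abort, so the return path first checks XTEST and uses VZEROALL when transactionally executing. The C sketch below only illustrates that decision logic; it is not glibc code, the function name is invented, and it assumes a compiler providing <immintrin.h> with AVX and RTM support enabled (e.g. -mavx -mrtm).

    #include <immintrin.h>

    /* Sketch of the branch that ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST emits
       in assembly: pick how to clear AVX upper state depending on whether an
       RTM transaction is currently active.  */
    static inline void
    clear_avx_upper_state_rtm_safe (void)
    {
      if (_xtest ())
        /* Transactionally executing: VZEROUPPER would abort the transaction
           (per the comment in the sysdep.h hunk), so zero all vector
           registers with VZEROALL instead.  */
        _mm256_zeroall ();
      else
        /* Normal path: VZEROUPPER clears only the upper halves, avoiding
           AVX/SSE transition penalties after returning to SSE code.  */
        _mm256_zeroupper ();
    }

In the assembly macro the same test is spelled "xtest; jz 1f; vzeroall; ret; 1: vzeroupper; ret", while non-RTM builds keep the ordinary VZEROUPPER return through the default ZERO_UPPER_VEC_REGISTERS_RETURN and VZEROUPPER_RETURN definitions.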