-rw-r--r--  NEWS                                                   |   3
-rw-r--r--  elf/Makefile                                           |   2
-rw-r--r--  elf/ifuncmain6pie.c                                    |  14
-rw-r--r--  elf/ifuncmod6.c                                        |   8
-rw-r--r--  iconvdata/Makefile                                     |   4
-rw-r--r--  iconvdata/bug-iconv14.c                                | 127
-rw-r--r--  iconvdata/iso-2022-jp-3.c                              |  67
-rw-r--r--  sysdeps/aarch64/dl-machine.h                           |  12
-rw-r--r--  sysdeps/aarch64/memcpy.S                               | 211
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile                     |   2
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c            |   2
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c                     |   8
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_advsimd.S             | 248
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c                    |   6
-rw-r--r--  sysdeps/aarch64/sysdep.h                               |   2
-rw-r--r--  sysdeps/i386/dl-machine.h                              |  16
-rw-r--r--  sysdeps/unix/sysv/linux/aarch64/cpu-features.h         |   8
-rw-r--r--  sysdeps/x86/Makefile                                   |   6
-rw-r--r--  sysdeps/x86/dl-cet.c                                   |   6
-rw-r--r--  sysdeps/x86/tst-setjmp-cet.c                           |   1
-rw-r--r--  sysdeps/x86_64/dl-machine.h                            |  16
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S  |  21
22 files changed, 632 insertions(+), 158 deletions(-)
diff --git a/NEWS b/NEWS
index 1884f74..1668896 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,7 @@ The following bugs are resolved with this release:
[18035] Fix pldd hang
[19444] build failures with -O1 due to -Wmaybe-uninitialized
[20018] getaddrinfo should reject IP addresses with trailing characters
+ [20019] NULL pointer dereference in libc.so.6 IFUNC due to uninitialized GOT
[20209] localedata: Spelling mistake for Sunday in Greenlandic kl_GL
[20568] Fix crash in _IO_wfile_sync
[22927] libanl: properly cleanup if first helper thread creation failed
@@ -76,6 +77,8 @@ The following bugs are resolved with this release:
[25414] 'glob' use-after-free bug (CVE-2020-1752)
[25423] Array overflow in backtrace on powerpc
[25933] Off by one error in __strncmp_avx2
+ [27130] "rep movsb" performance issue
+ [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work
Security related changes:
diff --git a/elf/Makefile b/elf/Makefile
index 6027926..564b814 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -1267,6 +1267,8 @@ CFLAGS-ifuncmain5pie.c += $(pie-ccflag)
CFLAGS-ifuncmain6pie.c += $(pie-ccflag)
CFLAGS-ifuncmain7pie.c += $(pie-ccflag)
+LDFLAGS-ifuncmain6pie = -Wl,-z,lazy
+
$(objpfx)ifuncmain1pie: $(objpfx)ifuncmod1.so
$(objpfx)ifuncmain1staticpie: $(objpfx)ifuncdep1pic.o
$(objpfx)ifuncmain1vispie: $(objpfx)ifuncmod1.so
diff --git a/elf/ifuncmain6pie.c b/elf/ifuncmain6pie.c
index 04faeb8..4a01906 100644
--- a/elf/ifuncmain6pie.c
+++ b/elf/ifuncmain6pie.c
@@ -9,7 +9,6 @@
#include "ifunc-sel.h"
typedef int (*foo_p) (void);
-extern foo_p foo_ptr;
static int
one (void)
@@ -28,20 +27,17 @@ foo_ifunc (void)
}
extern int foo (void);
-extern foo_p get_foo (void);
+extern int call_foo (void);
extern foo_p get_foo_p (void);
-foo_p my_foo_ptr = foo;
+foo_p foo_ptr = foo;
int
main (void)
{
foo_p p;
- p = get_foo ();
- if (p != foo)
- abort ();
- if ((*p) () != -30)
+ if (call_foo () != -30)
abort ();
p = get_foo_p ();
@@ -52,12 +48,8 @@ main (void)
if (foo_ptr != foo)
abort ();
- if (my_foo_ptr != foo)
- abort ();
if ((*foo_ptr) () != -30)
abort ();
- if ((*my_foo_ptr) () != -30)
- abort ();
if (foo () != -30)
abort ();
diff --git a/elf/ifuncmod6.c b/elf/ifuncmod6.c
index 2e16c1d..2f6d071 100644
--- a/elf/ifuncmod6.c
+++ b/elf/ifuncmod6.c
@@ -4,7 +4,7 @@ extern int foo (void);
typedef int (*foo_p) (void);
-foo_p foo_ptr = foo;
+extern foo_p foo_ptr;
foo_p
get_foo_p (void)
@@ -12,8 +12,8 @@ get_foo_p (void)
return foo_ptr;
}
-foo_p
-get_foo (void)
+int
+call_foo (void)
{
- return foo;
+ return foo ();
}
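
Both test changes above revolve around GNU indirect functions (IFUNCs): foo is bound at relocation time by a resolver, and the test now calls through call_foo instead of comparing a resolver-produced pointer obtained before relocation. A minimal stand-alone sketch of the mechanism (illustrative names, assuming GCC's ifunc attribute on an ELF target):

    /* ifunc-example.c: two candidate implementations and a resolver.  */
    static int one (void) { return -30; }
    static int two (void) { return 30; }

    /* The resolver runs while the dynamic linker relocates this module,
       before main, so it must not rely on data that is itself set up by
       relocations.  A real resolver would inspect CPU features here.  */
    static __typeof (one) *
    foo_resolver (void)
    {
      return 1 ? one : two;
    }

    int foo (void) __attribute__ ((ifunc ("foo_resolver")));

    int
    main (void)
    {
      return foo () == -30 ? 0 : 1;
    }

The diff moves the foo_ptr initializer into the executable so the shared object no longer needs the IFUNC's address before the executable is relocated, which is the failure mode of bug 20019 in the NEWS hunk above.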
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index 06e161d..36dd5d1 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -73,7 +73,7 @@ modules.so := $(addsuffix .so, $(modules))
ifeq (yes,$(build-shared))
tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
- bug-iconv10 bug-iconv11 bug-iconv12
+ bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv14
ifeq ($(have-thread-library),yes)
tests += bug-iconv3
endif
@@ -316,6 +316,8 @@ $(objpfx)bug-iconv10.out: $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so))
$(objpfx)bug-iconv12.out: $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so))
+$(objpfx)bug-iconv14.out: $(objpfx)gconv-modules \
+ $(addprefix $(objpfx),$(modules.so))
$(objpfx)iconv-test.out: run-iconv-test.sh $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so)) \
diff --git a/iconvdata/bug-iconv14.c b/iconvdata/bug-iconv14.c
new file mode 100644
index 0000000..902f140
--- /dev/null
+++ b/iconvdata/bug-iconv14.c
@@ -0,0 +1,127 @@
+/* Assertion failure in ISO-2022-JP-3 due to two-character sequence (bug 27256).
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <iconv.h>
+#include <string.h>
+#include <errno.h>
+#include <support/check.h>
+
+/* Use an escape sequence to return to the initial state. */
+static void
+with_escape_sequence (void)
+{
+ iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3");
+ TEST_VERIFY_EXIT (c != (iconv_t) -1);
+
+ char in[] = "\e$(O+D\e(B";
+ char *inbuf = in;
+ size_t inleft = strlen (in);
+ char out[3]; /* Space for one output character. */
+ char *outbuf;
+ size_t outleft;
+
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+ TEST_COMPARE (inleft, 3);
+ TEST_COMPARE (inbuf - in, strlen (in) - 3);
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xc3);
+ TEST_COMPARE (out[1] & 0xff, 0xa6);
+
+ /* Return to the initial shift state, producing the pending
+ character. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), 0);
+ TEST_COMPARE (inleft, 0);
+ TEST_COMPARE (inbuf - in, strlen (in));
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ /* Nothing should be flushed the second time. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out));
+ TEST_COMPARE (outbuf - out, 0);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ TEST_COMPARE (iconv_close (c), 0);
+}
+
+/* Use an explicit flush to return to the initial state. */
+static void
+with_flush (void)
+{
+ iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3");
+ TEST_VERIFY_EXIT (c != (iconv_t) -1);
+
+ char in[] = "\e$(O+D";
+ char *inbuf = in;
+ size_t inleft = strlen (in);
+ char out[3]; /* Space for one output character. */
+ char *outbuf;
+ size_t outleft;
+
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+ TEST_COMPARE (inleft, 0);
+ TEST_COMPARE (inbuf - in, strlen (in));
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xc3);
+ TEST_COMPARE (out[1] & 0xff, 0xa6);
+
+ /* Flush the pending character. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ /* Nothing should be flushed the second time. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out));
+ TEST_COMPARE (outbuf - out, 0);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ TEST_COMPARE (iconv_close (c), 0);
+}
+
+static int
+do_test (void)
+{
+ with_escape_sequence ();
+ with_flush ();
+ return 0;
+}
+
+#include <support/test-driver.c>
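
The two test cases drive the converter into a state where a combining character is still queued, then drain it either with the "\e(B" escape back to ASCII (first test) or with an explicit flush via a NULL input buffer (second test). A small sketch of the flush contract the test relies on (hypothetical helper, using only documented iconv behavior):

    #include <iconv.h>
    #include <errno.h>
    #include <stddef.h>

    /* Hypothetical helper: drain converter-internal state.  Returns 0
       when fully drained, 1 when the caller must provide more output
       space and call again, -1 on any other error.  */
    static int
    flush_converter (iconv_t cd, char **outbuf, size_t *outleft)
    {
      /* A NULL input buffer asks iconv to write out pending output
         (such as the queued combining character tested above) and to
         return to the initial shift state.  */
      if (iconv (cd, NULL, NULL, outbuf, outleft) == (size_t) -1)
        return errno == E2BIG ? 1 : -1;
      return 0;
    }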
diff --git a/iconvdata/iso-2022-jp-3.c b/iconvdata/iso-2022-jp-3.c
index de25958..047fab8 100644
--- a/iconvdata/iso-2022-jp-3.c
+++ b/iconvdata/iso-2022-jp-3.c
@@ -67,23 +67,34 @@ enum
CURRENT_SEL_MASK = 7 << 3
};
-/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the state
- also contains the last two bytes to be output, shifted by 6 bits, and a
- one-bit indicator whether they must be preceded by the shift sequence,
- in bit 22. */
+/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the
+ state also contains the last two bytes to be output, shifted by 6
+ bits, and a one-bit indicator whether they must be preceded by the
+ shift sequence, in bit 22. During ISO-2022-JP-3 to UCS-4
+ conversion, COUNT may also contain a non-zero pending wide
+ character, shifted by six bits. This happens for certain inputs in
+ JISX0213_1_2004_set and JISX0213_2_set if the second wide character
+ in a combining sequence cannot be written because the buffer is
+ full. */
/* Since this is a stateful encoding we have to provide code which resets
the output state to the initial state. This has to be done during the
flushing. */
#define EMIT_SHIFT_TO_INIT \
- if ((data->__statep->__count & ~7) != ASCII_set) \
+ if (data->__statep->__count != ASCII_set) \
{ \
if (FROM_DIRECTION) \
{ \
- /* It's easy, we don't have to emit anything, we just reset the \
- state for the input. */ \
- data->__statep->__count &= 7; \
- data->__statep->__count |= ASCII_set; \
+ if (__glibc_likely (outbuf + 4 <= outend)) \
+ { \
+ /* Write out the last character. */ \
+ *((uint32_t *) outbuf) = data->__statep->__count >> 6; \
+ outbuf += sizeof (uint32_t); \
+ data->__statep->__count = ASCII_set; \
+ } \
+ else \
+ /* We don't have enough room in the output buffer. */ \
+ status = __GCONV_FULL_OUTPUT; \
} \
else \
{ \
@@ -151,7 +162,21 @@ enum
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
- uint32_t ch = *inptr; \
+ uint32_t ch; \
+ \
+ /* Output any pending character. */ \
+ ch = set >> 6; \
+ if (__glibc_unlikely (ch != 0)) \
+ { \
+ put32 (outptr, ch); \
+ outptr += 4; \
+ /* Remove the pending character, but preserve state bits. */ \
+ set &= (1 << 6) - 1; \
+ continue; \
+ } \
+ \
+ /* Otherwise read the next input byte. */ \
+ ch = *inptr; \
\
/* Recognize escape sequences. */ \
if (__glibc_unlikely (ch == ESC)) \
@@ -297,21 +322,25 @@ enum
uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0]; \
uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1]; \
\
+ inptr += 2; \
+ \
+ put32 (outptr, u1); \
+ outptr += 4; \
+ \
/* See whether we have room for two characters. */ \
- if (outptr + 8 <= outend) \
+ if (outptr + 4 <= outend) \
{ \
- inptr += 2; \
- put32 (outptr, u1); \
- outptr += 4; \
put32 (outptr, u2); \
outptr += 4; \
continue; \
} \
- else \
- { \
- result = __GCONV_FULL_OUTPUT; \
- break; \
- } \
+ \
+ /* Otherwise store only the first character now, and \
+ put the second one into the queue. */ \
+ set |= u2 << 6; \
+ /* Tell the caller why we terminate the loop. */ \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
} \
\
inptr += 2; \
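
The hunks above fix bug 27256 by queuing the second wide character of a combining sequence in the conversion state instead of asserting: COUNT keeps its shift-state flags in the low six bits and the pending UCS-4 character above them. A schematic C model of that packing (these helpers are illustrative, not the module's real macros):

    #include <stdint.h>

    /* Illustrative model of the __count packing: shift-state flags live
       in the low 6 bits, one pending UCS-4 character (0 = none) lives
       above them.  */
    #define PENDING_SHIFT 6
    #define STATE_MASK    ((1u << PENDING_SHIFT) - 1)

    static uint32_t
    queue_pending (uint32_t set, uint32_t wch)
    {
      return (set & STATE_MASK) | (wch << PENDING_SHIFT);
    }

    static uint32_t
    pending_char (uint32_t set)     /* mirrors "ch = set >> 6" in BODY */
    {
      return set >> PENDING_SHIFT;
    }

Checking the pending character at the top of BODY drains the queue before any new input byte is consumed, and EMIT_SHIFT_TO_INIT writes it out during flushing.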
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 9617cb7..c7ae423 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -388,13 +388,6 @@ elf_machine_lazy_rel (struct link_map *map,
/* Check for unexpected PLT reloc type. */
if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
{
- if (map->l_mach.plt == 0)
- {
- /* Prelinking. */
- *reloc_addr += l_addr;
- return;
- }
-
if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check. */
{
/* Check the symbol table for variant PCS symbols. */
@@ -418,7 +411,10 @@ elf_machine_lazy_rel (struct link_map *map,
}
}
- *reloc_addr = map->l_mach.plt;
+ if (map->l_mach.plt == 0)
+ *reloc_addr += l_addr;
+ else
+ *reloc_addr = map->l_mach.plt;
}
else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1))
{
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 7e1163e..919b28d 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,32 +33,24 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
#define G_l count
#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- In order to share code with memmove, small and medium copies read all
- data before writing, allowing any kind of overlap. So small, medium
- and large backwards memmoves are handled by falling through into memcpy.
- Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -66,108 +58,115 @@
# define MEMCPY memcpy
#endif
-ENTRY_ALIGN (MEMMOVE, 6)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
- /* Common case falls through into memcpy. */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-ENTRY (MEMCPY)
+ Large copies use a software pipelined loop processing 64 bytes per
+ iteration. The destination pointer is 16-byte aligned to minimize
+ unaligned accesses. The loop tail is handled by always copying 64 bytes
+ from the end.
+*/
+ENTRY_ALIGN (MEMCPY, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
- prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 128
b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
.p2align 4
+ /* Copy more than 128 bytes. */
L(copy_long):
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
- ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
@@ -176,7 +175,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
+ b.ls L(copy64_from_end)
+
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
@@ -189,10 +189,8 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
@@ -207,20 +205,42 @@ L(last64):
stp C_l, C_h, [dstend, -16]
ret
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
add srcend, src, count
add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ .p2align 4
+L(move_long):
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.hs L(copy_long)
- and tmp1, dstend, 15
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
@@ -230,10 +250,9 @@ L(move_long):
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
- b.ls 2f
+ b.ls L(copy64_from_start)
- nop
-1:
+L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
@@ -243,12 +262,10 @@ L(move_long):
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
- b.hi 1b
+ b.hi L(loop64_backwards)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
@@ -261,7 +278,7 @@ L(move_long):
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
-3: ret
+ ret
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
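
The rewritten routine leans on one idea throughout: read all the data for a block before storing any of it, letting the first and last chunks overlap in the middle. That makes small and medium copies safe for any overlap (so memmove shares them) and lets the large-copy tail always store exactly 64 bytes from the end with no remainder branches. A C rendering of the 16..32-byte case (a model of the idea, not the assembly):

    #include <string.h>

    /* C model of the 16..32-byte case: load 16 bytes from each end
       before storing anything.  The two stores may overlap in the
       middle, which is harmless because every load already happened;
       the same property makes the sequence valid for memmove.  */
    static void
    copy16_32 (unsigned char *dst, const unsigned char *src, size_t n)
    {
      unsigned char head[16], tail[16];   /* assumes 16 <= n <= 32 */
      memcpy (head, src, 16);
      memcpy (tail, src + n - 16, 16);
      memcpy (dst, head, 16);
      memcpy (dst + n - 16, tail, 16);
    }

L(copy64_from_end) applies the same trick at 64-byte scale: after the pipelined loop, the final 64 bytes are always stored from srcend - 64, so a remainder of 1..64 bytes needs no extra branching.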
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf7..134fe46 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor memset_generic memset_falkor
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80..0ccd141 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,10 +42,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 8f5d4e7..e69a1ae 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,14 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
libc_ifunc (__libc_memcpy,
(IS_THUNDERX (midr)
? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr)
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memcpy_falkor
: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
? __memcpy_thunderx2
- : __memcpy_generic))));
+ : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+ || IS_NEOVERSE_V1 (midr)
+ ? __memcpy_simd
+ : __memcpy_generic)))));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000..48bb6d7
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,248 @@
+/* Generic optimized memcpy using SIMD.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per
+ iteration. The destination pointer is 16-byte aligned to minimize
+ unaligned accesses. The loop tail is handled by always copying 64 bytes
+ from the end. */
+
+ENTRY (__memcpy_simd)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Align loop64 below to 16 bytes. */
+ nop
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small moves: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+L(move_long):
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(move0)
+ cmp tmp1, count
+ b.hs L(copy_long)
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(move0):
+ ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d816..b426dad 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@
extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -37,7 +38,10 @@ libc_ifunc (__libc_memmove,
? __memmove_thunderx
: (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memmove_falkor
- : __memmove_generic)));
+ : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+ || IS_NEOVERSE_V1 (midr)
+ ? __memmove_simd
+ : __memmove_generic))));
# undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 5b30709..509e3e1 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -45,7 +45,7 @@
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),%function; \
- .align 4; \
+ .p2align 6; \
C_LABEL(name) \
cfi_startproc; \
CALL_MCOUNT
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index f6cfb90..1e3ef25 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
{
# ifndef RTLD_BOOTSTRAP
if (sym_map != map
- && sym_map->l_type != lt_executable
&& !sym_map->l_relocated)
{
const char *strtab
= (const char *) D_PTR (map, l_info[DT_STRTAB]);
- _dl_error_printf ("\
+ if (sym_map->l_type == lt_executable)
+ _dl_fatal_printf ("\
+%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+and creates an unsatisfiable circular dependency.\n",
+ RTLD_PROGNAME, strtab + refsym->st_name,
+ map->l_name);
+ else
+ _dl_error_printf ("\
%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
- RTLD_PROGNAME, map->l_name,
- sym_map->l_name,
- strtab + refsym->st_name);
+ RTLD_PROGNAME, map->l_name,
+ sym_map->l_name,
+ strtab + refsym->st_name);
}
# endif
value = ((Elf32_Addr (*) (void)) value) ();
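
The new fatal error covers relocations in a shared object that need the value of an IFUNC symbol defined in the executable: shared objects are relocated before the executable, so the resolver would run too early. This is the fix for bug 20019 in the NEWS hunk, paired with the ifuncmain6pie test changes above. A hypothetical two-file sketch of the failing shape (illustrative, not a glibc test):

    /* main.c: the executable defines an IFUNC and exports it.  */
    static int impl (void) { return 42; }
    static __typeof (impl) *resolve_foo (void) { return impl; }
    int foo (void) __attribute__ ((ifunc ("resolve_foo")));

    /* mod.c: a shared object the executable links against.  The data
       relocation initializing foo_ptr needs foo's resolved address, so
       ld.so must run the executable's resolver while relocating mod.so,
       before the executable itself is relocated -- the unsatisfiable
       circular dependency the new message reports.  */
    extern int foo (void);
    int (*foo_ptr) (void) = foo;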
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 153d258..fbe1148 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -51,8 +51,12 @@
#define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h' \
&& MIDR_PARTNUM(midr) == 0x000)
-#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
- && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd49)
+#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd40)
struct cpu_features
{
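
The IS_* macros classify cores by two MIDR_EL1 fields: the implementer code in bits [31:24] and the part number in bits [15:4]. A small self-contained example of the decoding (the sample MIDR value is hypothetical; the macros are equivalent to the ones used above):

    #include <stdio.h>
    #include <stdint.h>

    /* MIDR_EL1 layout (ARM ARM): implementer in bits [31:24], part
       number in bits [15:4].  */
    #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
    #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

    int
    main (void)
    {
      uint32_t midr = 0x413fd0c1;   /* hypothetical Neoverse N1, r3p1 */
      printf ("implementer=%c partnum=0x%x\n",
              (int) MIDR_IMPLEMENTOR (midr), MIDR_PARTNUM (midr));
      /* Prints: implementer=A partnum=0xd0c -> IS_NEOVERSE_N1.  */
      return 0;
    }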
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 43ad4a7..891a4fd 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -12,6 +12,12 @@ endif
ifeq ($(subdir),setjmp)
gen-as-const-headers += jmp_buf-ssp.sym
sysdep_routines += __longjmp_cancel
+ifneq ($(enable-cet),no)
+ifneq ($(have-tunables),no)
+tests += tst-setjmp-cet
+tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
+endif
+endif
endif
ifeq ($(enable-cet),yes)
diff --git a/sysdeps/x86/dl-cet.c b/sysdeps/x86/dl-cet.c
index b82ba14..56c37bc 100644
--- a/sysdeps/x86/dl-cet.c
+++ b/sysdeps/x86/dl-cet.c
@@ -105,7 +105,11 @@ dl_cet_check (struct link_map *m, const char *program)
/* No legacy object check if both IBT and SHSTK are always on. */
if (enable_ibt_type == CET_ALWAYS_ON
&& enable_shstk_type == CET_ALWAYS_ON)
- return;
+ {
+ THREAD_SETMEM (THREAD_SELF, header.feature_1,
+ GL(dl_x86_feature_1)[0]);
+ return;
+ }
/* Check if IBT is enabled by kernel. */
bool ibt_enabled
diff --git a/sysdeps/x86/tst-setjmp-cet.c b/sysdeps/x86/tst-setjmp-cet.c
new file mode 100644
index 0000000..42c795d
--- /dev/null
+++ b/sysdeps/x86/tst-setjmp-cet.c
@@ -0,0 +1 @@
+#include <setjmp/tst-setjmp.c>
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 1942ed5..c5fdcd3 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -315,16 +315,22 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
{
# ifndef RTLD_BOOTSTRAP
if (sym_map != map
- && sym_map->l_type != lt_executable
&& !sym_map->l_relocated)
{
const char *strtab
= (const char *) D_PTR (map, l_info[DT_STRTAB]);
- _dl_error_printf ("\
+ if (sym_map->l_type == lt_executable)
+ _dl_fatal_printf ("\
+%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+and creates an unsatisfiable circular dependency.\n",
+ RTLD_PROGNAME, strtab + refsym->st_name,
+ map->l_name);
+ else
+ _dl_error_printf ("\
%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
- RTLD_PROGNAME, map->l_name,
- sym_map->l_name,
- strtab + refsym->st_name);
+ RTLD_PROGNAME, map->l_name,
+ sym_map->l_name,
+ strtab + refsym->st_name);
}
# endif
value = ((ElfW(Addr) (*) (void)) value) ();
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9bab114..5aaadc2 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -67,6 +67,13 @@
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
+/* Avoid short distance rep movsb only with non-SSE vector. */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -257,7 +264,21 @@ L(movsb):
# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
jb L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ jmp 2f
+# endif
1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rsi, %rcx
+ subq %rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+ is N*4GB + [1..63] with N >= 0. */
+ cmpl $63, %ecx
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
+# endif
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
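
The guard operates on ECX, the low 32 bits of the distance between destination and source, so any distance of the form N*4GiB + [1..63] shows up as a value of at most 63 and the copy falls back to the vector loop instead of "rep movsb". A C model of the check (an approximation of one of the two direction paths, not the exact control flow):

    #include <stdint.h>

    /* Low 32 bits of the distance in the direction the copy runs
       (dst - src here); N*4GiB + [1..63] all compare <= 63, matching
       "cmpl $63, %ecx; jbe" above.  */
    static int
    avoid_short_distance_rep_movsb (uintptr_t dst, uintptr_t src)
    {
      uint32_t ecx = (uint32_t) (dst - src);
      return ecx <= 63;   /* if true, use the vector loop instead */
    }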