From 7ebba91361badf7531d4e75050627a88d424872f Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 5 Mar 2021 07:26:42 -0800 Subject: x86-64: Add AVX optimized string/memory functions for RTM Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX optimized string/memory functions with xtest jz 1f vzeroall ret 1: vzeroupper ret at function exit on processors with usable RTM, but without 256-bit EVEX instructions to avoid VZEROUPPER inside a transactionally executing RTM region. --- sysdeps/x86_64/multiarch/Makefile | 27 ++++ sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 170 +++++++++++++++++++++ sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++--- sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++-- .../multiarch/memmove-avx-unaligned-erms-rtm.S | 17 +++ .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 33 ++-- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 +++---- .../multiarch/memset-avx2-unaligned-erms-rtm.S | 10 ++ .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 12 +- .../x86_64/multiarch/memset-vec-unaligned-erms.S | 41 ++--- sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcat-avx2.S | 6 +- sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strchr-avx2.S | 28 ++-- sysdeps/x86_64/multiarch/strchr.c | 4 + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ 
sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 +++---- sysdeps/x86_64/multiarch/strcmp.c | 4 + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcpy-avx2.S | 85 ++++------- sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strlen-avx2.S | 43 +++--- sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp.c | 4 + sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +-- sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + sysdeps/x86_64/sysdep.h | 22 +++ 52 files changed, 670 insertions(+), 248 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S create mode 100644 
sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 9d79b13..491c769 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + stpcpy-avx2-rtm \ + stpncpy-avx2-rtm \ + strcat-avx2-rtm \ + strcpy-avx2-rtm \ + strlen-avx2-rtm \ + strncat-avx2-rtm \ + strncmp-avx2-rtm \ + strncpy-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ memchr-evex \ memcmp-evex-movbe \ memmove-evex-unaligned-erms \ @@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcsrchr-sse2 wcsrchr-avx2 \ wcsnlen-sse4_1 wcsnlen-c \ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + 
wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ wcschr-evex \ wcscmp-evex \ wcslen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index 634c3c3..e3ec62c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,6 +21,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 96344a7..0249130 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -44,6 +44,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memchr_avx2) IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -57,6 +61,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (MOVBE)), __memcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE 
(MOVBE)), @@ -86,6 +95,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), __memmove_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, @@ -114,6 +131,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), __memmove_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_evex_unaligned) IFUNC_IMPL_ADD (array, i, memmove, @@ -144,6 +169,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memrchr_avx2) IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __memrchr_evex) @@ -166,6 +195,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memset_chk_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, (CPU_FEATURE_USABLE 
(AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __memset_chk_evex_unaligned) @@ -199,6 +236,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __memset_avx2_unaligned_erms) IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __memset_evex_unaligned) @@ -223,6 +268,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __rawmemchr_avx2) IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -235,6 +284,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strlen_avx2) IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strlen_evex) @@ -246,6 +299,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strnlen_avx2) IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strnlen_evex) @@ -258,6 +315,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX2) + && 
CPU_FEATURE_USABLE (RTM)), + __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __stpncpy_evex) @@ -272,6 +333,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __stpcpy_evex) @@ -310,6 +375,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), __strcat_avx2) IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcat_evex) @@ -324,6 +393,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strchr_avx2) IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -337,6 +410,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strchrnul_avx2) IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -349,6 +426,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strrchr_avx2) IFUNC_IMPL_ADD (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + 
__strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strrchr_evex) @@ -360,6 +441,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strcmp_avx2) IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -376,6 +461,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), __strcpy_avx2) IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) @@ -423,6 +512,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), __strncat_avx2) IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncat_evex) @@ -437,6 +530,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), __strncpy_avx2) IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) @@ -470,6 +567,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcschr_avx2) IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) + 
IFUNC_IMPL_ADD (array, i, wcschr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -482,6 +583,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcsrchr_avx2) IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -494,6 +599,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcscmp_avx2) IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -506,6 +615,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcsncmp_avx2) IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -524,6 +637,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcslen_avx2) IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -536,6 +653,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wcsnlen_avx2) IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, (CPU_FEATURE_USABLE (AVX512VL) && 
CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -551,6 +672,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wmemchr_avx2) IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), @@ -564,6 +689,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (MOVBE)), @@ -582,6 +712,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __wmemset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, wmemset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, CPU_FEATURE_USABLE (AVX512VL), __wmemset_evex_unaligned) IFUNC_IMPL_ADD (array, i, wmemset, @@ -607,6 +741,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), __memcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, @@ -635,6 +777,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), 
__memcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, @@ -677,6 +827,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), __mempcpy_chk_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, @@ -714,6 +872,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX), __mempcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, @@ -735,6 +901,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX2), __strncmp_avx2) IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncmp_evex) diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h 
index 5ac41a1..8bee1af 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * @@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex_movbe); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_movbe); } diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 517b332..4eba926 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if 
(CPU_FEATURE_USABLE_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index 0246818..43655fb 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -69,6 +73,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index f31f436..39568f4 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -39,6 +40,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if 
(!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index 7e947c5..8d952ef 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,8 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; @@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) return OPTIMIZE (evex_unaligned); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000..87b076c --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 77a9523..1fcb1c3 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -34,9 +34,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero 
length. */ @@ -107,8 +111,8 @@ L(cros_page_boundary): # endif addq %rdi, %rax addq %rcx, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -224,8 +228,7 @@ L(last_4x_vec_or_less): jnz L(first_vec_x3_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -243,8 +246,7 @@ L(last_2x_vec): testl %eax, %eax jnz L(first_vec_x1_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -253,8 +255,7 @@ L(first_vec_x0_check): cmpq %rax, %rdx jbe L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -264,8 +265,7 @@ L(first_vec_x1_check): jbe L(zero) addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -275,8 +275,7 @@ L(first_vec_x2_check): jbe L(zero) addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -286,12 +285,14 @@ L(first_vec_x3_check): jbe L(zero) addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + jmp L(return_vzeroupper) + + .p2align 4 L(null): xorl %eax, %eax ret @@ -301,24 +302,21 @@ L(null): L(first_vec_x0): tzcntl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): tzcntl %eax, %eax addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): tzcntl %eax, %eax addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -337,8 +335,7 @@ L(first_vec_x3): tzcntl %eax, %eax addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..cf4eff5 --- /dev/null 
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index cf9c9b8..ad0fa96 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): 
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000..1ec1962 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index d713d7d..897a3d9 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -150,11 +150,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -247,8 +248,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP @@ -313,8 +317,7 @@ L(between_32_63): VMOVU -32(%rsi,%rdx), %YMM1 VMOVU %YMM0, (%rdi) VMOVU %YMM1, -32(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. 
*/ @@ -323,7 +326,7 @@ L(between_16_31): VMOVU -16(%rsi,%rdx), %XMM1 VMOVU %XMM0, (%rdi) VMOVU %XMM1, -16(%rdi,%rdx) - ret + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -376,8 +379,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. */ VMOVU (%rsi), %VEC(0) @@ -388,8 +390,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -445,8 +446,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -497,8 +497,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -533,8 +532,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -568,8 +566,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. 
*/ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000..cea2d2a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index eddede4..ac7370c 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. 
*/ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000..8ac3e47 --- /dev/null +++ 
b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89..ae0860f 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 358ee4b..584747f 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -45,17 +45,14 @@ #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -117,8 +114,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -141,14 +137,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmp __x86_rep_stosb_threshold(%rip), %RDX_LP @@ -190,8 +183,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -217,7 +213,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -241,40 +236,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): VMOVU %YMM0, -32(%rdi,%rdx) VMOVU %YMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): VMOVU %XMM0, -16(%rdi,%rdx) VMOVU %XMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. 
*/ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. */ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000..acc5f6e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S new file mode 100644 index 0000000..2b9c07a --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000..60a2ccf --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000..637fb55 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index 41de8b2..4356fa7 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -30,7 +30,11 @@ /* Number of bytes in a 
vector register */ # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + + .section SECTION(.text),"ax",@progbits ENTRY (STRCAT) mov %rdi, %r9 # ifdef USE_AS_STRNCAT diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000..81f20d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 476c20c..25bec38 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -40,10 +40,14 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define PAGE_SIZE 4096 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx # ifndef USE_AS_STRCHRNUL @@ -76,8 +80,8 @@ ENTRY (STRCHR) cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(more_vecs): @@ -126,8 +130,7 @@ L(aligned_more): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0): @@ -138,8 +141,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -149,8 +151,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -161,8 +162,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN L(prep_loop_4x): 
/* Align data to 4 * VEC_SIZE. */ @@ -221,8 +221,7 @@ L(loop_4x_vec): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Cold case for crossing page with first load. */ .p2align 4 @@ -246,8 +245,7 @@ L(cross_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) # endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index 2c0a3e7..691770f 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000..cdcf818 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000..aecd30d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define 
SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index 53cb7a6..4033301 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. */ @@ -127,8 +131,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -161,8 +165,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -195,8 +198,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -229,8 +231,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -356,8 +357,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -400,8 +400,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -444,8 +443,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -486,8 +484,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -556,8 +553,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif 
- VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -631,8 +627,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -674,8 +669,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -685,16 +679,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -708,8 +700,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -734,8 +725,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -856,7 +846,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 1df7569..62b7abe 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + 
return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000..c2c581e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index b7629ea..5b6506d 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -37,6 +37,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* zero register */ #define xmmZ xmm0 #define ymmZ ymm0 @@ -46,7 +50,7 @@ # ifndef USE_AS_STRCAT - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCPY) # ifdef USE_AS_STRNCPY mov %RDX_LP, %R8_LP @@ -369,8 +373,8 @@ L(CopyVecSizeExit): lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(CopyTwoVecSize1): @@ -553,8 +557,7 @@ L(Exit1): lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit2): @@ -569,8 +572,7 @@ L(Exit2): lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit3): @@ -584,8 +586,7 @@ L(Exit3): lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit4_7): @@ -602,8 +603,7 @@ L(Exit4_7): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit8_15): @@ -620,8 +620,7 @@ 
L(Exit8_15): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit16_31): @@ -638,8 +637,7 @@ L(Exit16_31): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit32_63): @@ -656,8 +654,7 @@ L(Exit32_63): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCPY @@ -671,8 +668,7 @@ L(StrncpyExit1): # ifdef USE_AS_STRCAT movb $0, 1(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit2): @@ -684,8 +680,7 @@ L(StrncpyExit2): # ifdef USE_AS_STRCAT movb $0, 2(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit3_4): @@ -699,8 +694,7 @@ L(StrncpyExit3_4): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit5_8): @@ -714,8 +708,7 @@ L(StrncpyExit5_8): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit9_16): @@ -729,8 +722,7 @@ L(StrncpyExit9_16): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit17_32): @@ -744,8 +736,7 @@ L(StrncpyExit17_32): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit33_64): @@ -760,8 +751,7 @@ L(StrncpyExit33_64): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit65): @@ -778,50 +768,43 @@ L(StrncpyExit65): # ifdef USE_AS_STRCAT movb $0, 65(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifndef USE_AS_STRCAT .p2align 4 L(Fill1): mov %dl, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill2): mov %dx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill3_4): mov %dx, (%rdi) mov %dx, -2(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill5_8): mov 
%edx, (%rdi) mov %edx, -4(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill9_16): mov %rdx, (%rdi) mov %rdx, -8(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill17_32): vmovdqu %xmmZ, (%rdi) vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(CopyVecSizeUnalignedVec2): @@ -898,8 +881,7 @@ L(Fill): cmp $1, %r8d ja L(Fill2) je L(Fill1) - VZEROUPPER - ret + VZEROUPPER_RETURN /* end of ifndef USE_AS_STRCAT */ # endif @@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): # ifdef USE_AS_STRCAT movb $0, (VEC_SIZE * 4)(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(UnalignedFourVecSizeLeaveCase2): @@ -1001,16 +982,14 @@ L(StrncpyExit): # ifdef USE_AS_STRCAT movb $0, (%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(ExitZero): # ifndef USE_AS_STRCAT mov %rdi, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000..75b4b76 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index caa6159..1caae9e 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check for zero length. 
*/ @@ -111,8 +115,8 @@ L(cros_page_boundary): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -231,8 +235,7 @@ L(last_4x_vec_or_less): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -253,8 +256,7 @@ L(last_2x_vec): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -267,8 +269,7 @@ L(first_vec_x0_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -282,8 +283,7 @@ L(first_vec_x1_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -297,8 +297,7 @@ L(first_vec_x2_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -312,8 +311,7 @@ L(first_vec_x3_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(max): @@ -321,8 +319,7 @@ L(max): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): @@ -338,8 +335,7 @@ L(first_vec_x0): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -350,8 +346,7 @@ L(first_vec_x1): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -362,8 +357,7 @@ L(first_vec_x2): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -389,8 +383,7 @@ L(first_vec_x3): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000..0dcea18 --- /dev/null 
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000..37d1224 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index 617c4e1..60ba0fe 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000..79e7083 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000..04f1626 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define 
USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000..5def14e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 53ea445..0deba97 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000..d49dbbf --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git 
a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000..d6ca2b8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000..35658d7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000..4e88c70 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000..7437ebe --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index 19bc6fd..4983f1b 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if 
(CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000..9bf7608 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000..58ed21d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000..31104d1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index d07b8f0..7bebdeb 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -95,6 +95,28 @@ lose: \ #define R14_LP r14 #define R15_LP r15 +/* Zero upper vector registers and return with xtest. NB: Use VZEROALL + to avoid the RTM abort that VZEROUPPER triggers inside a transaction. */ +#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + ret; \ +1: \ + vzeroupper; \ + ret + +/* Zero upper vector registers and return. */ +#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + VZEROUPPER; \ + ret +#endif + +#ifndef VZEROUPPER_RETURN +# define VZEROUPPER_RETURN VZEROUPPER; ret +#endif + #else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ -- cgit v1.1