aboutsummaryrefslogtreecommitdiff
path: root/libiberty
diff options
context:
space:
mode:
author Jakub Jelinek <jakub@redhat.com> 2023-11-28 13:29:58 +0100
committer Jakub Jelinek <jakub@redhat.com> 2023-11-28 13:29:58 +0100
commit 4a50820ee8f153265ec8ffd068618607d4be3a26 (patch)
tree 680392257eafcb9160a66e62070f802374d2eda6 /libiberty
parent e5f1ee1832ff9e970833fa5773f46c3e0b93bc04 (diff)
downloadfsf-binutils-gdb-4a50820ee8f153265ec8ffd068618607d4be3a26.zip
fsf-binutils-gdb-4a50820ee8f153265ec8ffd068618607d4be3a26.tar.gz
fsf-binutils-gdb-4a50820ee8f153265ec8ffd068618607d4be3a26.tar.bz2
libiberty, ld: Use x86 HW optimized sha1
The following patch attempts to use the x86 SHA ISA, if available, to speed up sha1 build-id processing (by about 2.5x in my testing on an AMD Ryzen 5 3600) while producing the same result. I believe AArch64 has similar HW acceleration for SHA1, so perhaps it could be added similarly. Note, it seems lld uses BLAKE3 rather than md5/sha1. I think it would be a bad idea to lie to users: if they choose --build-id=sha1, we should be using SHA1, not some other checksum, but perhaps we could add some other --build-id= styles and perhaps make one of the new ones the default. Tested on x86_64-linux, both on Intel i9-7960X (which doesn't have sha_ni ISA support) without/with the patch and on AMD Ryzen 5 3600 (which does have it) without/with the patch. 2023-11-28 Jakub Jelinek <jakub@redhat.com> include/ * sha1.h (sha1_process_bytes_fn): New typedef. (sha1_choose_process_bytes): Declare. libiberty/ * configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check. * sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h and cpuid.h. (sha1_hw_process_bytes, sha1_hw_process_block, sha1_choose_process_bytes): New functions. * config.in: Regenerated. * configure: Regenerated. ld/ * ldbuildid.c (generate_build_id): Use sha1_choose_process_bytes () instead of &sha1_process_bytes.
Diffstat (limited to 'libiberty')
-rw-r--r--libiberty/ChangeLog10
-rw-r--r--libiberty/config.in3
-rwxr-xr-xlibiberty/configure58
-rw-r--r--libiberty/configure.ac40
-rw-r--r--libiberty/sha1.c305
5 files changed, 416 insertions, 0 deletions
diff --git a/libiberty/ChangeLog b/libiberty/ChangeLog
index 3424fef..a8fdc1b 100644
--- a/libiberty/ChangeLog
+++ b/libiberty/ChangeLog
@@ -1,3 +1,13 @@
+2023-11-28 Jakub Jelinek <jakub@redhat.com>
+
+ * configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
+ * sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
+ and cpuid.h.
+ (sha1_hw_process_bytes, sha1_hw_process_block,
+ sha1_choose_process_bytes): New functions.
+ * config.in: Regenerated.
+ * configure: Regenerated.
+
2023-06-15 Marek Polacek <polacek@redhat.com>
* configure.ac: Also set shared when enable_host_pie.
diff --git a/libiberty/config.in b/libiberty/config.in
index f7052b5..6c4a259 100644
--- a/libiberty/config.in
+++ b/libiberty/config.in
@@ -432,6 +432,9 @@
/* Define to 1 if `vfork' works. */
#undef HAVE_WORKING_VFORK
+/* Define if you have x86 SHA1 HW acceleration support. */
+#undef HAVE_X86_SHA1_HW_SUPPORT
+
/* Define to 1 if you have the `_doprnt' function. */
#undef HAVE__DOPRNT
diff --git a/libiberty/configure b/libiberty/configure
index dd89627..9cdf802 100755
--- a/libiberty/configure
+++ b/libiberty/configure
@@ -7544,6 +7544,64 @@ case "${host}" in
esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SHA1 HW acceleration support" >&5
+$as_echo_n "checking for SHA1 HW acceleration support... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+#include <x86intrin.h>
+#include <cpuid.h>
+
+__attribute__((__target__ ("sse4.1,sha")))
+void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
+{
+ __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
+ __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
+ abcd = _mm_shuffle_epi32 (abcd, 0x1b);
+ const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
+ abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
+ e0 = _mm_sha1nexte_epu32 (e0, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
+ msg0 = _mm_xor_si128 (msg0, msg1);
+ e0 = _mm_add_epi32 (e0, msg0);
+ e0 = abcd;
+ _mm_storeu_si128 (buf, abcd);
+ e = _mm_extract_epi32 (e0, 3);
+}
+
+int bar (void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
+ && (ebx & bit_SHA) != 0
+ && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
+ && (ecx & bit_SSE4_1) != 0)
+ return 1;
+ return 0;
+}
+
+int
+main ()
+{
+bar ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: x86 SHA1" >&5
+$as_echo "x86 SHA1" >&6; }
+
+$as_echo "#define HAVE_X86_SHA1_HW_SUPPORT 1" >>confdefs.h
+
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
diff --git a/libiberty/configure.ac b/libiberty/configure.ac
index 0748c59..e07cbb8 100644
--- a/libiberty/configure.ac
+++ b/libiberty/configure.ac
@@ -740,6 +740,46 @@ case "${host}" in
esac
AC_SUBST(pexecute)
+AC_MSG_CHECKING([for SHA1 HW acceleration support])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#include <x86intrin.h>
+#include <cpuid.h>
+
+__attribute__((__target__ ("sse4.1,sha")))
+void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
+{
+ __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
+ __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
+ abcd = _mm_shuffle_epi32 (abcd, 0x1b);
+ const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
+ abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
+ e0 = _mm_sha1nexte_epu32 (e0, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
+ msg0 = _mm_xor_si128 (msg0, msg1);
+ e0 = _mm_add_epi32 (e0, msg0);
+ e0 = abcd;
+ _mm_storeu_si128 (buf, abcd);
+ e = _mm_extract_epi32 (e0, 3);
+}
+
+int bar (void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
+ && (ebx & bit_SHA) != 0
+ && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
+ && (ecx & bit_SSE4_1) != 0)
+ return 1;
+ return 0;
+}
+]], [[bar ();]])],
+ [AC_MSG_RESULT([x86 SHA1])
+ AC_DEFINE(HAVE_X86_SHA1_HW_SUPPORT, 1,
+ [Define if you have x86 SHA1 HW acceleration support.])],
+ [AC_MSG_RESULT([no])])
+
libiberty_AC_FUNC_STRNCMP
# Install a library built with a cross compiler in $(tooldir) rather
diff --git a/libiberty/sha1.c b/libiberty/sha1.c
index 6c71e3e..bb47268 100644
--- a/libiberty/sha1.c
+++ b/libiberty/sha1.c
@@ -29,6 +29,11 @@
#include <stddef.h>
#include <string.h>
+#ifdef HAVE_X86_SHA1_HW_SUPPORT
+# include <x86intrin.h>
+# include <cpuid.h>
+#endif
+
#if USE_UNLOCKED_IO
# include "unlocked-io.h"
#endif
@@ -412,3 +417,303 @@ sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
e = ctx->E += e;
}
}
+
+#if defined(HAVE_X86_SHA1_HW_SUPPORT)
+/* HW specific version of sha1_process_bytes: feed LEN bytes starting at
+ BUFFER into CTX. Bytes that do not yet form a complete 64-byte block
+ are staged in ctx->buffer (with ctx->buflen tracking how many are
+ pending); complete blocks are handed to sha1_hw_process_block. */
+
+static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);
+
+static void
+sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+ /* When we already have some bits in our internal buffer concatenate
+ both inputs first: append as much new input as fits below the
+ 128-byte mark and, once more than 64 bytes are staged, process the
+ whole 64-byte multiples and keep the remainder. */
+ if (ctx->buflen != 0)
+ {
+ size_t left_over = ctx->buflen;
+ size_t add = 128 - left_over > len ? len : 128 - left_over;
+
+ memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
+ ctx->buflen += add;
+
+ if (ctx->buflen > 64)
+ {
+ sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
+
+ ctx->buflen &= 63;
+ /* The regions in the following copy operation cannot overlap:
+ the source starts at byte offset 64 or beyond, while at most
+ 63 bytes are copied to offset 0. */
+ memcpy (ctx->buffer,
+ &((char *) ctx->buffer)[(left_over + add) & ~63],
+ ctx->buflen);
+ }
+
+ buffer = (const char *) buffer + add;
+ len -= add;
+ }
+
+ /* Process available complete blocks directly from the caller's
+ memory, or via ctx->buffer when BUFFER is not suitably aligned for
+ sha1_uint32 and _STRING_ARCH_unaligned is not set. */
+ if (len >= 64)
+ {
+#if !_STRING_ARCH_unaligned
+# define alignof(type) offsetof (struct { char c; type x; }, x)
+# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
+ if (UNALIGNED_P (buffer))
+ while (len > 64) /* Strict '>': a final 64-byte block is left for the tail code below. */
+ {
+ sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
+ buffer = (const char *) buffer + 64;
+ len -= 64;
+ }
+ else
+#endif
+ {
+ sha1_hw_process_block (buffer, len & ~63, ctx);
+ buffer = (const char *) buffer + (len & ~63);
+ len &= 63;
+ }
+ }
+
+ /* Move remaining bytes in internal buffer. If that completes a
+ 64-byte block, process it and slide the excess to the front. */
+ if (len > 0)
+ {
+ size_t left_over = ctx->buflen;
+
+ memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
+ left_over += len;
+ if (left_over >= 64)
+ {
+ sha1_hw_process_block (ctx->buffer, 64, ctx);
+ left_over -= 64;
+ memmove (ctx->buffer, &ctx->buffer[16], left_over); /* presumably [16] is byte offset 64 (4-byte elements) -- confirm in sha1.h */
+ }
+ ctx->buflen = left_over;
+ }
+}
+
+/* Process LEN bytes of BUFFER, accumulating context into CTX,
+ using CPU specific intrinsics. The main loop consumes 64 bytes per
+ iteration, so LEN is expected to be a multiple of 64 (the calls in
+ sha1_hw_process_bytes pass either 64 or a value masked with ~63).
+ When HAVE_X86_SHA1_HW_SUPPORT is defined the function is compiled
+ with the "sse4.1,sha" target attribute; otherwise its body is empty. */
+
+#ifdef HAVE_X86_SHA1_HW_SUPPORT
+__attribute__((__target__ ("sse4.1,sha")))
+#endif
+static void
+sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+#ifdef HAVE_X86_SHA1_HW_SUPPORT
+ /* Implemented from
+ https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html */
+ const __m128i *words = (const __m128i *) buffer;
+ const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
+ __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
+ const __m128i shuf_mask
+ = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); /* reverses all 16 bytes of each loaded vector */
+ char check[((offsetof (struct sha1_ctx, B)
+ == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
+ && (offsetof (struct sha1_ctx, C)
+ == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
+ && (offsetof (struct sha1_ctx, D)
+ == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
+ ? 1 : -1]; /* negative size = compile error unless A..D are contiguous, as the 128-bit load/store at &ctx->A needs */
+
+ /* First increment the byte count. SHA-1 (RFC 3174, FIPS 180) limits
+ the message length to less than 2^64 bits. Here we only compute the
+ number of bytes. Do a double word increment. */
+ ctx->total[0] += len;
+ ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len); /* two shifts: a single >> 32 would be undefined for a 32-bit size_t */
+
+ (void) &check[0]; /* presumably only to mark CHECK as used and avoid a warning */
+ abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
+ e0 = _mm_set_epi32 (ctx->E, 0, 0, 0); /* E in lane 3, matching the extract at the end */
+ abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
+
+ while (words < endp)
+ {
+ abcd_save = abcd;
+ e0_save = e0;
+
+ /* 0..3 (round numbers; the last argument of _mm_sha1rnds4_epu32
+ below is 0 for rounds 0..19, 1 for 20..39, 2 for 40..59 and
+ 3 for 60..79) */
+ msg0 = _mm_loadu_si128 (words);
+ msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
+ e0 = _mm_add_epi32 (e0, msg0);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
+
+ /* 4..7 */
+ msg1 = _mm_loadu_si128 (words + 1);
+ msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
+ e1 = _mm_sha1nexte_epu32 (e1, msg1);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+
+ /* 8..11 */
+ msg2 = _mm_loadu_si128 (words + 2);
+ msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
+ e0 = _mm_sha1nexte_epu32 (e0, msg2);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
+ msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
+ msg0 = _mm_xor_si128 (msg0, msg2);
+
+ /* 12..15 (last of the four 16-byte loads from the input block) */
+ msg3 = _mm_loadu_si128 (words + 3);
+ msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
+ e1 = _mm_sha1nexte_epu32 (e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
+ msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
+ msg1 = _mm_xor_si128 (msg1, msg3);
+
+ /* 16..19 (from here on msg0..msg3 are derived from earlier ones
+ rather than loaded) */
+ e0 = _mm_sha1nexte_epu32 (e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
+ msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
+ msg2 = _mm_xor_si128 (msg2, msg0);
+
+ /* 20..23 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+ msg3 = _mm_xor_si128 (msg3, msg1);
+
+ /* 24..27 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
+ msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
+ msg0 = _mm_xor_si128 (msg0, msg2);
+
+ /* 28..31 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
+ msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
+ msg1 = _mm_xor_si128 (msg1, msg3);
+
+ /* 32..35 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
+ msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
+ msg2 = _mm_xor_si128 (msg2, msg0);
+
+ /* 36..39 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+ msg3 = _mm_xor_si128 (msg3, msg1);
+
+ /* 40..43 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
+ msg0 = _mm_xor_si128 (msg0, msg2);
+
+ /* 44..47 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
+ msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
+ msg1 = _mm_xor_si128 (msg1, msg3);
+
+ /* 48..51 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
+ msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
+ msg2 = _mm_xor_si128 (msg2, msg0);
+
+ /* 52..55 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
+ msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
+ msg3 = _mm_xor_si128 (msg3, msg1);
+
+ /* 56..59 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
+ msg0 = _mm_xor_si128 (msg0, msg2);
+
+ /* 60..63 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
+ msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
+ msg1 = _mm_xor_si128 (msg1, msg3);
+
+ /* 64..67 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
+ msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
+ msg2 = _mm_xor_si128 (msg2, msg0);
+
+ /* 68..71 (no further _mm_sha1msg1_epu32 steps from here on) */
+ e1 = _mm_sha1nexte_epu32 (e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
+ msg3 = _mm_xor_si128 (msg3, msg1);
+
+ /* 72..75 */
+ e0 = _mm_sha1nexte_epu32 (e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
+
+ /* 76..79 */
+ e1 = _mm_sha1nexte_epu32 (e1, msg3);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
+
+ /* Finalize: combine the result of this block with abcd_save and
+ e0_save captured at the top of the iteration. */
+ e0 = _mm_sha1nexte_epu32 (e0, e0_save);
+ abcd = _mm_add_epi32 (abcd, abcd_save);
+
+ words = words + 4;
+ }
+
+ abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
+ _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
+ ctx->E = _mm_extract_epi32 (e0, 3);
+#endif
+}
+#endif
+
+/* Return sha1_process_bytes or some hardware optimized version thereof
+ depending on current CPU. When built with HAVE_X86_SHA1_HW_SUPPORT,
+ sha1_hw_process_bytes is returned if CPUID reports both the SHA bit
+ (leaf 7, subleaf 0, EBX) and the SSE4.1 bit (leaf 1, ECX); in every
+ other case the generic sha1_process_bytes is returned. */
+
+sha1_process_bytes_fn
+sha1_choose_process_bytes (void)
+{
+#ifdef HAVE_X86_SHA1_HW_SUPPORT
+ unsigned int eax, ebx, ecx, edx;
+ if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
+ && (ebx & bit_SHA) != 0
+ && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
+ && (ecx & bit_SSE4_1) != 0)
+ return sha1_hw_process_bytes;
+#endif
+ return sha1_process_bytes;
+}