author     Guillaume Chatelet <gchatelet@google.com>   2023-07-05 11:07:21 +0000
committer  Guillaume Chatelet <gchatelet@google.com>   2023-07-07 10:37:32 +0000
commit     cb1468d3cbb7774332647dee3475d4e4f85c21e2 (patch)
tree       e60244949b3727c79a0da3edbcd3e7e303f8964d /libc
parent     4d847bf4d06589ff90648f3857ae73a12950d16c (diff)
[libc] Adding a version of memcpy w/ software prefetching
For machines with many cores, hardware prefetchers can saturate the memory bus when utilization is high. In this case it is desirable to turn off the hardware prefetcher completely. This has a big impact on the performance of memory functions such as `memcpy` that rely on the next cache line being readily available.

This patch adds the `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` compile-time option, which generates a version of `memcpy` with software prefetching. While not fully restoring the original performance, it mitigates the impact to an acceptable level.

Reviewed By: rtenneti

Differential Revision: https://reviews.llvm.org/D154494
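As background for the change, the core idea can be sketched in a few lines: issue software prefetches for cache lines a fixed distance ahead of the copy cursor so the data is already in flight by the time the loads need it. The sketch below is only an illustrative model, not the libc implementation; the names `copy_with_sw_prefetch`, `kCacheline`, and `kPrefetchDistance` are invented for the example, and the prefetch distance is an assumption. In the actual patch the option is enabled at build time by defining `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING`, as shown in the CMakeLists.txt hunk below.

#include <cstddef>
#include <cstring>

// Illustrative only: copy 'count' bytes while software-prefetching the source
// a couple of cache lines ahead of the copy cursor. The prefetch distance is
// an assumption for this sketch, not a value taken from the patch.
constexpr std::size_t kCacheline = 64;
constexpr std::size_t kPrefetchDistance = 2 * kCacheline;

void copy_with_sw_prefetch(char *dst, const char *src, std::size_t count) {
  std::size_t offset = 0;
  // Main loop: prefetch ahead, then copy one cache line. Prefetching past the
  // end of 'src' is harmless, prefetch instructions do not fault.
  for (; offset + kCacheline <= count; offset += kCacheline) {
    __builtin_prefetch(src + offset + kPrefetchDistance, /*rw=*/0, /*locality=*/3);
    std::memcpy(dst + offset, src + offset, kCacheline);
  }
  // Copy the remaining tail bytes.
  std::memcpy(dst + offset, src + offset, count - offset);
}

The real implementation added below additionally aligns `dst` to a 32B boundary and prefetches two or three cache lines per iteration depending on the copy size.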
Diffstat (limited to 'libc')
-rw-r--r--  libc/src/string/CMakeLists.txt                                 |    2
-rw-r--r--  libc/src/string/memory_utils/op_builtin.h                      |   26
-rw-r--r--  libc/src/string/memory_utils/x86_64/memcpy_implementations.h   |  190
3 files changed, 156 insertions, 62 deletions
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 330c50e..b010190 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -572,6 +572,8 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memcpy(memcpy_x86_64_opt_sw_prefetch_sse4 COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=nehalem REQUIRE SSE4_2)
+ add_memcpy(memcpy_x86_64_opt_sw_prefetch_avx COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=sandybridge REQUIRE AVX)
add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memcpy(memcpy)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
index cf90c02..210ba60 100644
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -23,19 +23,24 @@ namespace __llvm_libc::builtin {
// Memcpy
template <size_t Size> struct Memcpy {
static constexpr size_t SIZE = Size;
- LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+ LIBC_INLINE static void block_offset(Ptr __restrict dst, CPtr __restrict src,
+ size_t offset) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
- return __builtin_memcpy_inline(dst, src, SIZE);
+ return __builtin_memcpy_inline(dst + offset, src + offset, SIZE);
#else
// The codegen may be suboptimal.
for (size_t i = 0; i < Size; ++i)
- dst[i] = src[i];
+ dst[i + offset] = src[i + offset];
#endif
}
+ LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+ block_offset(dst, src, 0);
+ }
+
LIBC_INLINE static void tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- block(dst + count - SIZE, src + count - SIZE);
+ block_offset(dst, src, count - SIZE);
}
LIBC_INLINE static void head_tail(Ptr __restrict dst, CPtr __restrict src,
@@ -44,16 +49,21 @@ template <size_t Size> struct Memcpy {
tail(dst, src, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr __restrict dst,
+ CPtr __restrict src,
+ size_t count, size_t offset) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
- block(dst + offset, src + offset);
+ block_offset(dst, src, offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, src, count);
}
+
+ LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ return loop_and_tail_offset(dst, src, count, 0);
+ }
};
///////////////////////////////////////////////////////////////////////////////
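The op_builtin.h change above threads an explicit running `offset` through the copy helpers (`block_offset`, `loop_and_tail_offset`) so that callers can interleave other work, such as prefetches, between fixed-size block copies. Below is a simplified standalone model of that shape, assuming plain `char` pointers and `std::memcpy` in place of libc's `Ptr`/`CPtr` types and `__builtin_memcpy_inline`; it is a sketch, not the libc code.

#include <cstddef>
#include <cstring>

// Simplified model of the new block_offset / loop_and_tail_offset shape: the
// caller owns the running 'offset', so it can do work (e.g. prefetches)
// between fixed-size block copies. 'Size' mirrors builtin::Memcpy<Size>.
// Preconditions mirror the original: count >= Size and offset + Size <= count.
template <std::size_t Size>
void loop_and_tail_offset(char *dst, const char *src, std::size_t count,
                          std::size_t offset) {
  static_assert(Size > 1, "a loop of size 1 does not need tail");
  do {
    std::memcpy(dst + offset, src + offset, Size); // block_offset(dst, src, offset)
    offset += Size;
  } while (offset < count - Size);
  // Tail: copy the last Size bytes, overlapping the loop's last block if needed.
  std::memcpy(dst + count - Size, src + count - Size, Size);
}

A caller can copy an aligned prefix, advance `offset` past it while issuing prefetches, and then finish with `loop_and_tail_offset(dst, src, count, offset)`; this is the shape used by the x86-64 software-prefetching paths in the next file.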
diff --git a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
index be870e7..0d6e371 100644
--- a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
@@ -8,6 +8,7 @@
#ifndef LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
#define LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
+#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/__support/macros/config.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/op_builtin.h"
@@ -17,28 +18,53 @@
#include <stddef.h> // size_t
#include <stdint.h> // SIZE_MAX
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
namespace __llvm_libc {
+namespace x86 {
+
+LIBC_INLINE_VAR constexpr size_t kOneCacheline = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelines = 2 * kOneCacheline;
+LIBC_INLINE_VAR constexpr size_t kThreeCachelines = 3 * kOneCacheline;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
+
+// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
+// above a certain threshold. Defaults to "do not use rep;movsb".
+#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
+#endif
+LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
+ LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+
+} // namespace x86
+
+// TODO: Move to a shared header when appropriate.
+[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
+ __builtin_prefetch(addr, 0, 3);
+}
+
[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return builtin::Memcpy<1>::block(dst, src);
- if (count == 2)
- return builtin::Memcpy<2>::block(dst, src);
- if (count == 3)
- return builtin::Memcpy<3>::block(dst, src);
- if (count == 4)
- return builtin::Memcpy<4>::block(dst, src);
- if (count < 8)
- return builtin::Memcpy<4>::head_tail(dst, src, count);
- if (count < 16)
- return builtin::Memcpy<8>::head_tail(dst, src, count);
- if (count < 32)
- return builtin::Memcpy<16>::head_tail(dst, src, count);
- if (count < 64)
- return builtin::Memcpy<32>::head_tail(dst, src, count);
+inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
if (count < 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
if (count < 256)
@@ -48,9 +74,81 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_no_avx(Ptr __restrict dst,
- CPtr __restrict src,
- size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
+ CPtr __restrict src, size_t count) {
+ using namespace __llvm_libc::x86;
+ prefetch_to_local_cache(src + kOneCacheline);
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ prefetch_to_local_cache(src + kTwoCachelines);
+ // Aligning 'dst' on a 32B boundary.
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ builtin::Memcpy<96>::block(dst, src);
+ size_t offset = 96;
+ // At this point:
+ // - we copied between 96B and 128B,
+ // - we prefetched cachelines at 'src + 64' and 'src + 128',
+ // - 'dst' is 32B aligned,
+ // - count >= 128.
+ if (count < 352) {
+ // Two cache lines at a time.
+ while (offset + kTwoCachelines + 32 <= count) {
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ builtin::Memcpy<kTwoCachelines>::block_offset(dst, src, offset);
+ offset += kTwoCachelines;
+ }
+ } else {
+ // Three cache lines at a time.
+ while (offset + kThreeCachelines + 32 <= count) {
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ prefetch_to_local_cache(src + offset + kThreeCachelines);
+ // It is likely that this copy will be turned into a 'rep;movsb' on
+ // non-AVX machines.
+ builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+ offset += kThreeCachelines;
+ }
+ }
+ return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
+ CPtr __restrict src, size_t count) {
+ using namespace __llvm_libc::x86;
+ prefetch_to_local_cache(src + kOneCacheline);
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ prefetch_to_local_cache(src + kTwoCachelines);
+ prefetch_to_local_cache(src + kThreeCachelines);
+ if (count < 256)
+ return builtin::Memcpy<128>::head_tail(dst, src, count);
+ // Aligning 'dst' on a 32B boundary.
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ builtin::Memcpy<224>::block(dst, src);
+ size_t offset = 224;
+ // At this point:
+ // - we copied between 224B and 256B,
+  // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192',
+ // - 'dst' is 32B aligned,
+  // - count >= 256.
+ while (offset + kThreeCachelines + 64 <= count) {
+ // Three cache lines at a time.
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ prefetch_to_local_cache(src + offset + kThreeCachelines);
+ builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+ offset += kThreeCachelines;
+ }
+ return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
if (count == 0)
return;
if (count == 1)
@@ -69,46 +167,30 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
- if (count < 128)
- return builtin::Memcpy<64>::head_tail(dst, src, count);
- builtin::Memcpy<32>::block(dst, src);
- align_to_next_boundary<32, Arg::Dst>(dst, src, count);
- return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
-}
-
-[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
- if constexpr (x86::kAvx)
- return inline_memcpy_x86_avx(dst, src, count);
- else
- return inline_memcpy_x86_no_avx(dst, src, count);
+ if constexpr (x86::kAvx) {
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
+ } else {
+ return inline_memcpy_x86_avx_ge64(dst, src, count);
+ }
+ } else {
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
+ } else {
+ return inline_memcpy_x86_sse2_ge64(dst, src, count);
+ }
+ }
}
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
- // Whether to use rep;movsb exclusively, not at all, or only above a certain
- // threshold.
-#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
-#endif
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
- static constexpr size_t kRepMovsbThreshold =
- LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
- if constexpr (kRepMovsbThreshold == 0) {
+ if constexpr (x86::kRepMovsbThreshold == 0) {
return x86::Memcpy::repmovsb(dst, src, count);
- } else if constexpr (kRepMovsbThreshold == SIZE_MAX) {
+ } else if constexpr (x86::kRepMovsbThreshold == SIZE_MAX) {
return inline_memcpy_x86(dst, src, count);
} else {
- if (LIBC_UNLIKELY(count >= kRepMovsbThreshold))
+ if (LIBC_UNLIKELY(count >= x86::kRepMovsbThreshold))
return x86::Memcpy::repmovsb(dst, src, count);
else
return inline_memcpy_x86(dst, src, count);