author    Robert Hoo <robert.hu@linux.intel.com>  2020-03-25 14:50:21 +0800
committer Paolo Bonzini <pbonzini@redhat.com>  2020-04-01 14:24:03 -0400
commit    8f13a39dc02ea8a3e923102a8444185630c635ea
tree      21825595ebc0e81e8e102033b0c12031bfb84983 /util
parent    b87c99d0731fa30f1f455b211cbcf385b0fe427c
util/bufferiszero: improve avx2 accelerator
By increasing avx2 length_to_accel to 128, we can simplify its logic
and reduce a branch.

The authorship of this patch actually belongs to Richard Henderson
<richard.henderson@linaro.org>; I just fixed a boundary case on his
original patch.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
Message-Id: <1585119021-46593-2-git-send-email-robert.hu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
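For reference, a sketch of buffer_zero_avx2() as it reads with this patch
applied, reconstructed from the hunks below. The initial load of t and the
final return fall outside the diff context shown on this page, so those two
lines are assumptions based on the surrounding code; unlikely() is QEMU's
compiler.h macro, spelled out here so the sketch stands alone. The pointer
arithmetic on const void * relies on the GNU C extension QEMU builds with.

#include <immintrin.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static bool
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Check an unaligned head of 32 bytes (assumed from context). */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    /* Loop over 32-byte aligned blocks of 128.  When p > e the loop
     * simply never runs; with len >= 128 now guaranteed by
     * length_to_accel, the head load plus the four tail loads cover
     * the whole buffer, so the old else/goto path is unnecessary. */
    while (p <= e) {
        __builtin_prefetch(p);
        if (unlikely(!_mm256_testz_si256(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the last block of 128 unaligned; in bounds since len >= 128. */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}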
Diffstat (limited to 'util')
-rw-r--r--  util/bufferiszero.c  26
1 file changed, 9 insertions, 17 deletions
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index b801253..695bb4c 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -158,27 +158,19 @@ buffer_zero_avx2(const void *buf, size_t len)
     __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
     __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
 
-    if (likely(p <= e)) {
-        /* Loop over 32-byte aligned blocks of 128. */
-        do {
-            __builtin_prefetch(p);
-            if (unlikely(!_mm256_testz_si256(t, t))) {
-                return false;
-            }
-            t = p[-4] | p[-3] | p[-2] | p[-1];
-            p += 4;
-        } while (p <= e);
-    } else {
-        t |= _mm256_loadu_si256(buf + 32);
-        if (len <= 128) {
-            goto last2;
+    /* Loop over 32-byte aligned blocks of 128. */
+    while (p <= e) {
+        __builtin_prefetch(p);
+        if (unlikely(!_mm256_testz_si256(t, t))) {
+            return false;
         }
-    }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
 
     /* Finish the last block of 128 unaligned. */
     t |= _mm256_loadu_si256(buf + len - 4 * 32);
     t |= _mm256_loadu_si256(buf + len - 3 * 32);
-last2:
     t |= _mm256_loadu_si256(buf + len - 2 * 32);
     t |= _mm256_loadu_si256(buf + len - 1 * 32);
@@ -263,7 +255,7 @@ static void init_accel(unsigned cache)
     }
     if (cache & CACHE_AVX2) {
         fn = buffer_zero_avx2;
-        length_to_accel = 64;
+        length_to_accel = 128;
     }
 #endif
 #ifdef CONFIG_AVX512F_OPT
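The length_to_accel side of the change matters because the accelerated
routine is only ever called for sufficiently long buffers. A minimal sketch
of that gate, assuming the select_accel_fn()/buffer_zero_int() names from
util/bufferiszero.c; the exact body is an assumption, not quoted on this
page.

#define likely(x) __builtin_expect(!!(x), 1)

static bool (*fn)(const void *, size_t);  /* set by init_accel() above */
static size_t length_to_accel = 64;
static bool buffer_zero_int(const void *buf, size_t len);

static bool select_accel_fn(const void *buf, size_t len)
{
    /* Buffers shorter than length_to_accel (now 128 for AVX2) take the
     * integer fallback, which is what lets buffer_zero_avx2() assume
     * len >= 128 and drop its short-buffer branch. */
    if (likely(len >= length_to_accel)) {
        return fn(buf, len);
    }
    return buffer_zero_int(buf, len);
}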