diff options
author | Alexander Monakov <amonakov@ispras.ru> | 2024-02-06 23:48:05 +0300 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2024-05-03 08:03:05 -0700 |
commit | cbe3d5264631aa193fd2705820cbde6c5a602abb (patch) | |
tree | 8d7c64e91e3b1e7d595c0987a4453419e922ff70 /include | |
parent | d018425c324704949c7f65230def9586e71f07f5 (diff) | |
download | qemu-cbe3d5264631aa193fd2705820cbde6c5a602abb.zip qemu-cbe3d5264631aa193fd2705820cbde6c5a602abb.tar.gz qemu-cbe3d5264631aa193fd2705820cbde6c5a602abb.tar.bz2 |
util/bufferiszero: Reorganize for early test for acceleration
Test for length >= 256 inline, where it is often a constant.
Before calling into the accelerated routine, sample three bytes
from the buffer, which handles most non-zero buffers.
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
Message-Id: <20240206204809.9859-3-amonakov@ispras.ru>
[rth: Use __builtin_constant_p; move the indirect call out of line.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'include')
-rw-r--r-- | include/qemu/cutils.h | 32 |
1 files changed, 31 insertions, 1 deletions
diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 92c927a..741dade 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -187,9 +187,39 @@ char *freq_to_str(uint64_t freq_hz);
 /* used to print char* safely */
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
-bool buffer_is_zero(const void *buf, size_t len);
+/*
+ * Check if a buffer is all zeroes.
+ */
+
+bool buffer_is_zero_ool(const void *vbuf, size_t len);
+bool buffer_is_zero_ge256(const void *vbuf, size_t len);
 bool test_buffer_is_zero_next_accel(void);
 
+static inline bool buffer_is_zero_sample3(const char *buf, size_t len)
+{
+    /*
+     * For any reasonably sized buffer, these three samples come from
+     * three different cachelines. In qemu-img usage, we find that
+     * each byte eliminates more than half of all buffer testing.
+     * It is therefore critical to performance that the byte tests
+     * short-circuit, so that we do not pull in additional cache lines.
+     * Do not "optimize" this to !(a | b | c).
+     */
+    return !buf[0] && !buf[len - 1] && !buf[len / 2];
+}
+
+#ifdef __OPTIMIZE__
+static inline bool buffer_is_zero(const void *buf, size_t len)
+{
+    return (__builtin_constant_p(len) && len >= 256
+            ? buffer_is_zero_sample3(buf, len) &&
+              buffer_is_zero_ge256(buf, len)
+            : buffer_is_zero_ool(buf, len));
+}
+#else
+#define buffer_is_zero buffer_is_zero_ool
+#endif
+
 /*
  * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
  * Input is limited to 14-bit numbers