diff options
author | Wilco Dijkstra <Wilco.Dijkstra@arm.com> | 2018-12-31 18:01:52 +0000 |
---|---|---|
committer | Eric Blake <eblake@redhat.com> | 2019-01-01 09:44:59 -0600 |
commit | 353ebae30465606bd8b15bea1194f7a2881903fb (patch) | |
tree | 7ecab382557ac2042d03a91fe3e1ead3b51e94bd /newlib/libc/string | |
parent | 572687310059534b2da9428ca19df992509c8a5d (diff) | |
download | newlib-353ebae30465606bd8b15bea1194f7a2881903fb.zip newlib-353ebae30465606bd8b15bea1194f7a2881903fb.tar.gz newlib-353ebae30465606bd8b15bea1194f7a2881903fb.tar.bz2 |
Improve performance of memmem
This patch significantly improves performance of memmem using a novel
modified Horspool algorithm. Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.
By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.
Small needles up to size 2 use a dedicated linear search. Very long needles
use the Two-Way algorithm (to avoid increasing stack size inlining is now disabled).
The performance gain is 6.6 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).
The size-optimized memmem has also been rewritten from scratch to get a
2.7x performance gain.
Tested against GLIBC testsuite and randomized tests.
Message-Id: <DB5PR08MB1030649D051FA8532A4512C883B20@DB5PR08MB1030.eurprd08.prod.outlook.com>
Diffstat (limited to 'newlib/libc/string')
-rw-r--r-- | newlib/libc/string/memmem.c | 185 | ||||
-rw-r--r-- | newlib/libc/string/str-two-way.h | 3 |
2 files changed, 137 insertions, 51 deletions
diff --git a/newlib/libc/string/memmem.c b/newlib/libc/string/memmem.c index 5c57eff..55d2459 100644 --- a/newlib/libc/string/memmem.c +++ b/newlib/libc/string/memmem.c @@ -1,8 +1,30 @@ -/* Byte-wise substring search, using the Two-Way algorithm. - * Copyright (C) 2008 Eric Blake - * Permission to use, copy, modify, and distribute this software - * is freely granted, provided that this notice is preserved. - */ +/* Optimized memmem function. + Copyright (c) 2018 Arm Ltd. All rights reserved. + + SPDX-License-Identifier: BSD-3-Clause + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the company may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* FUNCTION @@ -13,7 +35,7 @@ INDEX SYNOPSIS #include <string.h> - char *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>, + void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>, size_t <[l2]>); DESCRIPTION @@ -21,8 +43,8 @@ DESCRIPTION Locates the first occurrence in the memory region pointed to by <[s1]> with length <[l1]> of the sequence of bytes pointed to by <[s2]> of length <[l2]>. If you already know the - lengths of your haystack and needle, <<memmem>> can be much - faster than <<strstr>>. + lengths of your haystack and needle, <<memmem>> is much faster + than <<strstr>>. RETURNS Returns a pointer to the located segment, or a null pointer if @@ -38,64 +60,127 @@ QUICKREF */ #include <string.h> +#include <stdint.h> + +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) + +/* Small and efficient memmem implementation (quadratic worst-case). */ +void * +memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len) +{ + const char *hs = haystack; + const char *ne = needle; + + if (ne_len == 0) + return (void *)hs; + int i; + int c = ne[0]; + const char *end = hs + hs_len - ne_len; + + for ( ; hs <= end; hs++) + { + if (hs[0] != c) + continue; + for (i = ne_len - 1; i != 0; i--) + if (hs[i] != ne[i]) + break; + if (i == 0) + return (void *)hs; + } + + return NULL; +} + +#else -#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) # define RETURN_TYPE void * # define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) # include "str-two-way.h" -#endif +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast memmem algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -memmem (const void *haystack_start, - size_t haystack_len, - const void *needle_start, - size_t needle_len) +memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len) { - /* Abstract memory is considered to be an array of 'unsigned char' values, - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ - const unsigned char *haystack = (const unsigned char *) haystack_start; - const unsigned char *needle = (const unsigned char *) needle_start; + const unsigned char *hs = haystack; + const unsigned char *ne = needle; - if (needle_len == 0) - /* The first occurrence of the empty string is deemed to occur at - the beginning of the string. */ - return (void *) haystack; + if (ne_len == 0) + return (void *) hs; + if (ne_len == 1) + return (void *) memchr (hs, ne[0], hs_len); -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) + return NULL; + + const unsigned char *end = hs + hs_len - ne_len; - /* Less code size, but quadratic performance in the worst case. */ - while (needle_len <= haystack_len) + if (ne_len == 2) { - if (!memcmp (haystack, needle, needle_len)) - return (void *) haystack; - haystack++; - haystack_len--; + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1]; + for (hs++; hs <= end && hw != nw; ) + hw = hw << 16 | *++hs; + return hw == nw ? (void *)(hs - 1) : NULL; } - return NULL; -#else /* compilation for speed */ + /* Use Two-Way algorithm for very long needles. */ + if (__builtin_expect (ne_len > 256, 0)) + return two_way_long_needle (hs, hs_len, ne, ne_len); - /* Larger code size, but guaranteed linear performance. */ + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; - /* Sanity check, otherwise the loop might search through the whole - memory. */ - if (haystack_len < needle_len) - return NULL; + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; - /* Use optimizations in memchr when possible, to reduce the search - size of haystack using a linear algorithm with a smaller - coefficient. However, avoid memchr for long needles, since we - can often achieve sublinear performance. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) + for ( ; hs <= end; ) { - haystack = memchr (haystack, *needle, haystack_len); - if (!haystack || needle_len == 1) - return (void *) haystack; - haystack_len -= haystack - (const unsigned char *) haystack_start; - if (haystack_len < needle_len) - return NULL; - return two_way_short_needle (haystack, haystack_len, needle, needle_len); + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (hs <= end && tmp == 0); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the last 2 characters. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* The last 2 characters match. If the needle is long, check a + fixed number of characters first to quickly filter out mismatches. */ + if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long); + } + + /* Skip based on matching the last 2 characters. */ + hs += shift1; } - return two_way_long_needle (haystack, haystack_len, needle, needle_len); -#endif /* compilation for speed */ + return NULL; } +#endif /* Compilation for speed. */ diff --git a/newlib/libc/string/str-two-way.h b/newlib/libc/string/str-two-way.h index 416f9d2..90345a8 100644 --- a/newlib/libc/string/str-two-way.h +++ b/newlib/libc/string/str-two-way.h @@ -31,6 +31,7 @@ #include <limits.h> #include <stdint.h> +#include <_ansi.h> /* We use the Two-Way string matching algorithm, which guarantees linear complexity with constant space. Additionally, for long @@ -288,7 +289,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and sublinear performance is not possible. */ -static RETURN_TYPE +_NOINLINE_STATIC RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { |