aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog2
-rw-r--r--sysdeps/x86_64/multiarch/strstr.c25
2 files changed, 14 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index feb3567..6ae84cb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,8 @@
the three constants needed as parameters. Drop the others.
(strcasestr_sse42): Load uclow, uchigh, and lcqword and pass to
__m128i_strloadu_tolower.
+ Create and initialize variable zero and use it in all the places
+ where _mm_setzero_si128 was used.
* sysdeps/x86_64/fpu/multiarch/Makefile: Don't build brandred-avx.c,
doasin-avx.c, dosincos-avx.c, e_asin-avx.c, mpatan-avx.c,
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 6e93744..36dc676 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -88,14 +88,12 @@
cross to next page. */
static inline __m128i
-__m128i_strloadu (const unsigned char * p)
+__m128i_strloadu (const unsigned char * p, __m128i zero)
{
- int offset = ((size_t) p & (16 - 1));
-
- if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
+ if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
{
+ size_t offset = ((size_t) p & (16 - 1));
__m128i a = _mm_load_si128 ((__m128i *) (p - offset));
- __m128i zero = _mm_setzero_si128 ();
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
if ((bmsk >> offset) != 0)
return __m128i_shift_right (a, offset);
@@ -109,10 +107,10 @@ __m128i_strloadu (const unsigned char * p)
locale and other which have single-byte letters only in the ASCII
range. */
static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i uclow,
+__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
__m128i uchigh, __m128i lcqword)
{
- __m128i frag = __m128i_strloadu (p);
+ __m128i frag = __m128i_strloadu (p, zero);
/* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
__m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
@@ -191,12 +189,15 @@ STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
const __m128i uclow = _mm_set1_epi8 (0x40);
const __m128i uchigh = _mm_set1_epi8 (0x5b);
const __m128i lcqword = _mm_set1_epi8 (0x20);
-# define strloadu(p) __m128i_strloadu_tolower (p, uclow, uchigh, lcqword)
+ const __m128i zero = _mm_setzero_si128 ();
+# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
# else
# define strloadu __m128i_strloadu_tolower
+# define zero _mm_setzero_si128 ()
# endif
#else
-# define strloadu __m128i_strloadu
+# define strloadu(p) __m128i_strloadu (p, zero)
+ const __m128i zero = _mm_setzero_si128 ();
#endif
/* p1 > 1 byte long. Load up to 16 bytes of fragment. */
@@ -207,7 +208,7 @@ STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
/* p2 is > 1 byte long. */
frag2 = strloadu (p2);
else
- frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);
+ frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
/* Unsigned bytes, equal order, does frag2 has null? */
int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
@@ -216,8 +217,7 @@ STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
if (cmp_s & cmp_c)
{
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
- _mm_setzero_si128 ()));
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
int len;
__asm ("bsfl %[bmsk], %[len]"
: [len] "=r" (len) : [bmsk] "r" (bmsk));
@@ -343,7 +343,6 @@ re_trace:
/* Handle both zero and sign flag set and s1 is shorter in
length. */
- __m128i zero = _mm_setzero_si128 ();
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
int len;