From e6a1c5dc776dd6b562e0aae17dbb61e396a81fb3 Mon Sep 17 00:00:00 2001 From: Patrick McGehearty Date: Wed, 13 Dec 2017 18:14:17 -0200 Subject: sparc: M7 optimized memset/bzero Support added to identify Sparc M7/T7/S7/M8/T8 processor capability. Performance tests run on Sparc S7 using new code and old niagara4 code. Optimizations for memset also apply to bzero as they share code. For memset/bzero, performance comparison with niagara4 code: For memset nonzero data, 256-1023 bytes - 60-90% gain (in cache); 5% gain (out of cache) 1K+ bytes - 80-260% gain (in cache); 40-80% gain (out of cache) For memset zero data (and bzero), 256-1023 bytes - 80-120% gain (in cache), 0% gain (out of cache) 1024+ bytes - 2-4x gain (in cache), 10-35% gain (out of cache) Tested in sparcv9-*-* and sparc64-*-* targets in both multi and non-multi arch configurations. Patrick McGehearty Adhemerval Zanella * sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile (sysdep_routines): Add memset-niagara7. * sysdeps/sparc/sparc64/multiarch/Makefile (sysdep_routines): Likewise. * sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S: New file. * sysdeps/sparc/sparc64/multiarch/memset-niagara7.S: Likewise. * sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add __bzero_niagara7 and __memset_niagara7. * sysdeps/sparc/sparc64/multiarch/ifunc-memset.h (IFUNC_SELECTOR): Add niagara7 option. * NEWS: Mention sparc m7 optimized memcpy, mempcpy, memmove, and memset. 
--- sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile | 2 +- .../sparc32/sparcv9/multiarch/memset-niagara7.S | 2 + sysdeps/sparc/sparc64/multiarch/Makefile | 2 +- sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c | 4 + sysdeps/sparc/sparc64/multiarch/ifunc-memset.h | 3 + sysdeps/sparc/sparc64/multiarch/memset-niagara7.S | 334 +++++++++++++++++++++ 6 files changed, 345 insertions(+), 2 deletions(-) create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S create mode 100644 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S (limited to 'sysdeps/sparc') diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile index 869f063..a6d08f3 100644 --- a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile +++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile @@ -10,5 +10,5 @@ ifeq ($(subdir),string) sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \ memset-niagara1 memcpy-niagara4 memset-niagara4 \ memcpy-ultra1 memset-ultra1 memcpy-memmove-niagara7 \ - memmove-ultra1 + memmove-ultra1 memset-niagara7 endif diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S new file mode 100644 index 0000000..de91aa4 --- /dev/null +++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S @@ -0,0 +1,2 @@ +#define XCC icc +#include diff --git a/sysdeps/sparc/sparc64/multiarch/Makefile b/sysdeps/sparc/sparc64/multiarch/Makefile index 69292ca..eaf758e 100644 --- a/sysdeps/sparc/sparc64/multiarch/Makefile +++ b/sysdeps/sparc/sparc64/multiarch/Makefile @@ -10,7 +10,7 @@ ifeq ($(subdir),string) sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \ memset-niagara1 memcpy-niagara4 memset-niagara4 \ memcpy-ultra1 memset-ultra1 memcpy-memmove-niagara7 \ - memmove-ultra1 + memmove-ultra1 memset-niagara7 endif ifeq ($(subdir),stdlib) diff --git a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c 
b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c index a803392..cce78f0 100644 --- a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c @@ -62,6 +62,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ultra1)); IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_ADP, + __bzero_niagara7) IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_CRYPTO, __bzero_niagara4) IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_BLKINIT, @@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ultra1)); IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_ADP, + __memset_niagara7) IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_CRYPTO, __memset_niagara4) IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_BLKINIT, diff --git a/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h b/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h index f3b9293..bc273d6 100644 --- a/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h +++ b/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h @@ -19,6 +19,7 @@ #include +extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara7) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara4) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ultra1) attribute_hidden; @@ -26,6 +27,8 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ultra1) attribute_hidden; static inline void * IFUNC_SELECTOR (int hwcap) { + if (hwcap & HWCAP_SPARC_ADP) + return OPTIMIZE (niagara7); if (hwcap & HWCAP_SPARC_CRYPTO) return OPTIMIZE (niagara4); if (hwcap & HWCAP_SPARC_BLKINIT) diff --git a/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S new file mode 100644 index 0000000..bfe107e --- 
/dev/null +++ b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S @@ -0,0 +1,334 @@ +/* Set a block of memory to some byte value. For SUN4V M7. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include + +#ifndef XCC +# define XCC xcc +#endif + .register %g2, #scratch + .register %g3, #scratch + +/* The algorithm is as follows: + * + * For small stores of 7 or fewer bytes, individual bytes are stored. + * + * For stores of fewer than 32 bytes, align the address on a 4-byte boundary. + * Then store as many 4-byte chunks as possible, followed by trailing bytes. + * + * For sizes of 32 bytes or more, align the address on an 8-byte boundary. + * if (count >= 64) { + * store 8-byte chunks to align the address on a 64-byte boundary + * if (value to be set is zero && count >= MIN_ZERO) { + * Using BIS stores, set the first long word of each + * 64-byte cache line to zero which will also clear the + * other seven long words of the cache line. + * } + * else if (count >= MIN_LOOP) { + * Using BIS stores, set the first long word of each of + * ST_CHUNK cache lines (64 bytes each) before the main + * loop is entered. 
+ * In the main loop, continue pre-setting the first long + * word of each cache line ST_CHUNK lines in advance while + * setting the other seven long words (56 bytes) of each + * cache line until fewer than ST_CHUNK*64 bytes remain. + * Then set the remaining seven long words of each cache + * line that has already had its first long word set. + * } + * store remaining data in 64-byte chunks until fewer than + * 64 bytes remain. + * } + * Store as many 8-byte chunks as possible, followed by trailing bytes. + * + * + * BIS = Block Init Store + * Doing the advance store of the first element of the cache line + * initiates the displacement of a cache line while only using a single + * instruction in the pipeline. That avoids various pipeline delays, + * such as filling the miss buffer. The performance effect is + * similar to prefetching for normal stores. + * The special case for zero fills runs faster and uses fewer instruction + * cycles than the normal memset loop. + * + * We only use BIS for memsets of at least MIN_LOOP bytes (MIN_ZERO for zero fills) because a sequence of + * BIS stores must be followed by a membar #StoreStore. The benefit of + * the BIS store must be balanced against the cost of the membar operation. + */ + +/* + * ASI_STBI_P marks the cache line as "least recently used" + * which means if many threads are active, it has a high chance + * of being pushed out of the cache between the first initializing + * store and the final stores. + * Thus, we use ASI_STBIMRU_P which marks the cache line as + * "most recently used" for all but the last store to the cache line. 
+ */ + +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define ASI_ST_BLK_INIT_MRU_P 0xf2 + +#define ASI_STBI_P ASI_BLK_INIT_QUAD_LDD_P +#define ASI_STBIMRU_P ASI_ST_BLK_INIT_MRU_P + +#define ST_CHUNK 24 /* multiple of 4 due to loop unrolling */ +#define MIN_LOOP (ST_CHUNK)*64 /* bytes; below this the BIS main loop is skipped */ +#define MIN_ZERO 256 /* bytes; below this the BIS zero-fill loop is skipped */ + +#define EX_ST(x) x +#define EX_RETVAL(x) x +#define STORE_ASI(src,addr) stxa src, [addr] ASI_STBIMRU_P +#define STORE_INIT(src,addr) stxa src, [addr] ASI_STBI_P + +#if IS_IN (libc) + + .text + .align 32 + +ENTRY(__bzero_niagara7) + /* bzero (dst, size) */ + mov %o1, %o2 /* size moves to memset's count register */ + mov 0, %o1 /* fill value is zero */ + /* fall through into memset code */ +END(__bzero_niagara7) + +ENTRY(__memset_niagara7) + /* memset (dst, c, size) */ + mov %o0, %o5 /* work on a copy of dst (sp1); %o0 is never modified */ + cmp %o2, 7 /* if small counts, just write bytes */ + bleu,pn %XCC, .Lwrchar + and %o1, 0xff, %o1 /* o1 is (char)c */ + + sll %o1, 8, %o3 + or %o1, %o3, %o1 /* now o1 has 2 bytes of c */ + sll %o1, 16, %o3 + cmp %o2, 32 + blu,pn %XCC, .Lwdalign + or %o1, %o3, %o1 /* now o1 has 4 bytes of c */ + + sllx %o1, 32, %o3 + or %o1, %o3, %o1 /* now o1 has 8 bytes of c */ + +.Ldbalign: + andcc %o5, 7, %o3 /* is sp1 aligned on an 8 byte boundary? */ + bz,pt %XCC, .Lblkalign /* already long word aligned */ + sub %o3, 8, %o3 /* -(bytes till long word aligned) */ + + add %o2, %o3, %o2 /* update o2 with new count */ + /* Set -(%o3) bytes till sp1 long word aligned */ +1: stb %o1, [%o5] /* there is at least 1 byte to set */ + inccc %o3 /* byte setting loop */ + bl,pt %XCC, 1b + inc %o5 + + /* Now sp1 is long word aligned (sp1 is found in %o5) */ +.Lblkalign: + cmp %o2, 64 /* check if there are 64 bytes to set */ + blu,pn %XCC, .Lwrshort + mov %o2, %o3 /* delay slot: %o3 = count, consumed at .Lwrshort */ + + andcc %o5, 63, %o3 /* is sp1 block aligned? */ + bz,pt %XCC, .Lblkwr /* now block aligned */ + sub %o3, 64, %o3 /* o3 is -(bytes till block aligned) */ + add %o2, %o3, %o2 /* o2 is the remainder */ + + /* Store -(%o3) bytes till dst is block (64 byte) aligned. */ + /* Use long word stores. 
*/ + /* Recall that dst is already long word aligned */ +1: + addcc %o3, 8, %o3 + stx %o1, [%o5] + bl,pt %XCC, 1b + add %o5, 8, %o5 + + /* Now sp1 is block aligned */ +.Lblkwr: + andn %o2, 63, %o4 /* calculate size of blocks in bytes */ + brz,pn %o1, .Lwrzero /* special case if c == 0 */ + and %o2, 63, %o3 /* %o3 = bytes left after blk stores */ + + cmp %o4, MIN_LOOP /* check for enough bytes to set */ + blu,pn %XCC, .Lshort_set /* to justify cost of membar */ + nop /* must be > pre-cleared lines */ + + /* initial cache-clearing stores */ + /* get store pipeline moving */ + +/* Primary memset loop for large memsets */ +.Lwr_loop: + mov ST_CHUNK, %g1 /* %g1 = cache lines left to pre-set */ +.Lwr_loop_start: + subcc %g1, 4, %g1 /* four cache lines per unrolled pass */ + EX_ST(STORE_ASI(%o1,%o5)) + add %o5, 64, %o5 + EX_ST(STORE_ASI(%o1,%o5)) + add %o5, 64, %o5 + EX_ST(STORE_ASI(%o1,%o5)) + add %o5, 64, %o5 + EX_ST(STORE_ASI(%o1,%o5)) + bgu %XCC, .Lwr_loop_start + add %o5, 64, %o5 + + sub %o5, ST_CHUNK*64, %o5 /* reset %o5 */ + mov ST_CHUNK, %g1 + sub %o5, 8, %o5 /* adjust %o5 for ASI store */ + +.Lwr_loop_rest: + stx %o1,[%o5+8+8] + sub %o4, 64, %o4 + stx %o1,[%o5+16+8] + subcc %g1, 1, %g1 + stx %o1,[%o5+24+8] + stx %o1,[%o5+32+8] + stx %o1,[%o5+40+8] + add %o5, 64, %o5 + stx %o1,[%o5-8] + bgu %XCC, .Lwr_loop_rest + EX_ST(STORE_INIT(%o1,%o5)) /* delay slot: eighth (last) long word of the line */ + + add %o5, 8, %o5 /* restore %o5 offset */ + + /* If at least ST_CHUNK*64 bytes remain to set, continue */ + /* setting the first long word of each cache line in advance */ + /* to keep the store pipeline moving. 
*/ + + cmp %o4, ST_CHUNK*64 + bge,pt %XCC, .Lwr_loop_start + mov ST_CHUNK, %g1 /* delay slot: reload the line counter */ + + brz,a,pn %o4, .Lasi_done + nop + + sub %o5, 8, %o5 /* adjust %o5 for ASI store */ +.Lwr_loop_small: + add %o5, 8, %o5 /* adjust %o5 for ASI store */ + EX_ST(STORE_ASI(%o1,%o5)) + stx %o1,[%o5+8] + stx %o1,[%o5+16] + stx %o1,[%o5+24] + stx %o1,[%o5+32] + subcc %o4, 64, %o4 + stx %o1,[%o5+40] + add %o5, 56, %o5 + stx %o1,[%o5-8] + bgu,pt %XCC, .Lwr_loop_small + EX_ST(STORE_INIT(%o1,%o5)) /* delay slot: eighth (last) long word of the line */ + + ba .Lasi_done + add %o5, 8, %o5 /* restore %o5 offset */ + +/* Special case loop for zero fill memsets */ +/* For each 64 byte cache line, single STBI to first element */ +/* clears line */ +.Lwrzero: + cmp %o4, MIN_ZERO /* check if enough bytes to set */ + /* to pay %asi + membar cost */ + blu %XCC, .Lshort_set + nop + sub %o4, 256, %o4 /* bias the count by one pass; undone after the loop */ + +.Lwrzero_loop: + mov 64, %g3 /* each pass initializes four cache lines (256 bytes) */ + EX_ST(STORE_INIT(%o1,%o5)) + subcc %o4, 256, %o4 + EX_ST(STORE_INIT(%o1,%o5+%g3)) + add %o5, 256, %o5 + sub %g3, 192, %g3 + EX_ST(STORE_INIT(%o1,%o5+%g3)) + add %g3, 64, %g3 + bge,pt %XCC, .Lwrzero_loop + EX_ST(STORE_INIT(%o1,%o5+%g3)) + add %o4, 256, %o4 /* remove the bias: %o4 = block bytes still to set */ + + brz,pn %o4, .Lbsi_done + nop +.Lwrzero_small: + EX_ST(STORE_INIT(%o1,%o5)) + subcc %o4, 64, %o4 + bgu,pt %XCC, .Lwrzero_small + add %o5, 64, %o5 + +.Lasi_done: +.Lbsi_done: + membar #StoreStore /* required by use of BIS */ + +.Lshort_set: + cmp %o4, 64 /* check if 64 bytes to set */ + blu %XCC, 5f + nop +4: /* set final blocks of 64 bytes */ + stx %o1, [%o5] + stx %o1, [%o5+8] + stx %o1, [%o5+16] + stx %o1, [%o5+24] + subcc %o4, 64, %o4 + stx %o1, [%o5+32] + stx %o1, [%o5+40] + add %o5, 64, %o5 + stx %o1, [%o5-16] + bgu,pt %XCC, 4b + stx %o1, [%o5-8] + +5: + /* Set the remaining long words */ +.Lwrshort: + subcc %o3, 8, %o3 /* Can we store any long words? 
*/ + blu,pn %XCC, .Lwrchars + and %o2, 7, %o2 /* calc bytes left after long words */ +6: + subcc %o3, 8, %o3 + stx %o1, [%o5] /* store the long words */ + bgeu,pt %XCC, 6b + add %o5, 8, %o5 + +.Lwrchars: /* check for extra chars */ + brnz %o2, .Lwrfin + nop + retl /* nothing left to set; %o0 still holds dst */ + nop + +.Lwdalign: + andcc %o5, 3, %o3 /* is sp1 aligned on a word boundary */ + bz,pn %XCC, .Lwrword + andn %o2, 3, %o3 /* create word sized count in %o3 */ + + dec %o2 /* decrement count */ + stb %o1, [%o5] /* set a byte */ + b .Lwdalign + inc %o5 /* next byte */ + +.Lwrword: + subcc %o3, 4, %o3 + st %o1, [%o5] /* 4-byte writing loop */ + bnz,pt %XCC, .Lwrword + add %o5, 4, %o5 + and %o2, 3, %o2 /* leftover count, if any */ + +.Lwrchar: + /* Set the remaining bytes, if any */ + brz %o2, .Lexit + nop +.Lwrfin: + deccc %o2 + stb %o1, [%o5] + bgu,pt %XCC, .Lwrfin + inc %o5 +.Lexit: + retl /* %o0 was preserved */ + nop +END(__memset_niagara7) +#endif -- cgit v1.1