From e6a1c5dc776dd6b562e0aae17dbb61e396a81fb3 Mon Sep 17 00:00:00 2001
From: Patrick McGehearty <patrick.mcgehearty@oracle.com>
Date: Wed, 13 Dec 2017 18:14:17 -0200
Subject: sparc: M7 optimized memset/bzero

Support added to identify Sparc M7/T7/S7/M8/T8 processor capability.
Performance tests run on Sparc S7 using new code and old niagara4 code.

Optimizations for memset also apply to bzero as they share code.

For memset/bzero, performance comparison with niagara4 code:
For memset nonzero data,
  256-1023 bytes - 60-90% gain (in cache); 5% gain (out of cache)
  1K+ bytes - 80-260% gain (in cache); 40-80% gain (out of cache)
For memset zero data (and bzero),
  256-1023 bytes - 80-120% gain (in cache), 0% gain (out of cache)
  1024+ bytes - 2-4x gain (in cache), 10-35% gain (out of cache)

Tested in sparcv9-*-* and sparc64-*-* targets in both multi and
non-multi arch configurations.

	Patrick McGehearty <patrick.mcgehearty@oracle.com>
	Adhemerval Zanella  <adhemerval.zanella@linaro.org>

	* sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
	(sysdeps_routines): Add memset-niagara7.
	* sysdeps/sparc/sparc64/multiarch/Makefile (sysdes_rotuines):
	Likewise.
	* sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S: New
	file.
	* sysdeps/sparc/sparc64/multiarch/memset-niagara7.S: Likewise.
	* sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __bzero_niagara7 and __memset_niagara7.
	* sysdeps/sparc/sparc64/multiarch/ifunc-memset.h (IFUNC_SELECTOR):
	Add niagara7 option.
	* NEWS: Mention sparc m7 optimized memcpy, mempcpy, memmove, and
	memset.
---
 sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile   |   2 +-
 .../sparc32/sparcv9/multiarch/memset-niagara7.S    |   2 +
 sysdeps/sparc/sparc64/multiarch/Makefile           |   2 +-
 sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c  |   4 +
 sysdeps/sparc/sparc64/multiarch/ifunc-memset.h     |   3 +
 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S  | 334 +++++++++++++++++++++
 6 files changed, 345 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
 create mode 100644 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S

(limited to 'sysdeps/sparc')

diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
index 869f063..a6d08f3 100644
--- a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
@@ -10,5 +10,5 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \
 		   memset-niagara1 memcpy-niagara4 memset-niagara4 \
 		   memcpy-ultra1 memset-ultra1 memcpy-memmove-niagara7 \
-		   memmove-ultra1
+		   memmove-ultra1 memset-niagara7
 endif
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
new file mode 100644
index 0000000..de91aa4
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
@@ -0,0 +1,2 @@
+#define XCC icc
+#include <sparc64/multiarch/memset-niagara7.S>
diff --git a/sysdeps/sparc/sparc64/multiarch/Makefile b/sysdeps/sparc/sparc64/multiarch/Makefile
index 69292ca..eaf758e 100644
--- a/sysdeps/sparc/sparc64/multiarch/Makefile
+++ b/sysdeps/sparc/sparc64/multiarch/Makefile
@@ -10,7 +10,7 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \
 		   memset-niagara1 memcpy-niagara4 memset-niagara4 \
 		   memcpy-ultra1 memset-ultra1 memcpy-memmove-niagara7 \
-		   memmove-ultra1
+		   memmove-ultra1 memset-niagara7
 endif
 
 ifeq ($(subdir),stdlib)
diff --git a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
index a803392..cce78f0 100644
--- a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
@@ -62,6 +62,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ultra1));
 
   IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_ADP,
+			      __bzero_niagara7)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_CRYPTO,
 			      __bzero_niagara4)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_BLKINIT,
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ultra1));
 
   IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_ADP,
+			      __memset_niagara7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_CRYPTO,
 			      __memset_niagara4)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_BLKINIT,
diff --git a/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h b/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h
index f3b9293..bc273d6 100644
--- a/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h
+++ b/sysdeps/sparc/sparc64/multiarch/ifunc-memset.h
@@ -19,6 +19,7 @@
 
 #include <ifunc-init.h>
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara7) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara4) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (niagara1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ultra1) attribute_hidden;
@@ -26,6 +27,8 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ultra1) attribute_hidden;
 static inline void *
 IFUNC_SELECTOR (int hwcap)
 {
+  if (hwcap & HWCAP_SPARC_ADP)
+    return OPTIMIZE (niagara7);
   if (hwcap & HWCAP_SPARC_CRYPTO)
     return OPTIMIZE (niagara4);
   if (hwcap & HWCAP_SPARC_BLKINIT)
diff --git a/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
new file mode 100644
index 0000000..bfe107e
--- /dev/null
+++ b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
@@ -0,0 +1,334 @@
+/* Set a block of memory to some byte value.  For SUN4V M7.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef XCC
+# define XCC    xcc
+#endif
+	.register	%g2, #scratch
+	.register	%g3, #scratch
+
+/* The algorithm is as follows :
+ *
+ *	For small 7 or fewer bytes stores, bytes will be stored.
+ *
+ *	For less than 32 bytes stores, align the address on 4 byte boundary.
+ *	Then store as many 4-byte chunks, followed by trailing bytes.
+ *
+ *	For sizes greater than 32 bytes, align the address on 8 byte boundary.
+ *	if (count >= 64) {
+ *		store 8-bytes chunks to align the address on 64 byte boundary
+ *		if (value to be set is zero && count >= MIN_ZERO) {
+ *			Using BIS stores, set the first long word of each
+ *			64-byte cache line to zero which will also clear the
+ *			other seven long words of the cache line.
+ *		}
+ *		else if (count >= MIN_LOOP) {
+ *			Using BIS stores, set the first long word of each of
+ *			ST_CHUNK cache lines (64 bytes each) before the main
+ *			loop is entered.
+ *			In the main loop, continue pre-setting the first long
+ *			word of each cache line ST_CHUNK lines in advance while
+ *			setting the other seven long words (56 bytes) of each
+ *			cache line until fewer than ST_CHUNK*64 bytes remain.
+ *			Then set the remaining seven long words of each cache
+ *			line that has already had its first long word set.
+ *		}
+ *		store remaining data in 64-byte chunks until less than
+ *		64 bytes remain.
+ *	}
+ *	Store as many 8-byte chunks, followed by trailing bytes.
+ *
+ *
+ * BIS = Block Init Store
+ *   Doing the advance store of the first element of the cache line
+ *   initiates the displacement of a cache line while only using a single
+ *   instruction in the pipeline. That avoids various pipeline delays,
+ *   such as filling the miss buffer. The performance effect is
+ *   similar to prefetching for normal stores.
+ *   The special case for zero fills runs faster and uses fewer instruction
+ *   cycles than the normal memset loop.
+ *
+ * We only use BIS for memset of greater than MIN_LOOP bytes because a sequence
+ * BIS stores must be followed by a membar #StoreStore. The benefit of
+ * the BIS store must be balanced against the cost of the membar operation.
+ */
+
+/*
+ * ASI_STBI_P marks the cache line as "least recently used"
+ * which means if many threads are active, it has a high chance
+ * of being pushed out of the cache between the first initializing
+ * store and the final stores.
+ * Thus, we use ASI_STBIMRU_P which marks the cache line as
+ * "most recently used" for all but the last store to the cache line.
+ */
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_ST_BLK_INIT_MRU_P 0xf2
+
+#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
+#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P
+
+#define ST_CHUNK	24   /* multiple of 4 due to loop unrolling */
+#define MIN_LOOP	(ST_CHUNK)*64
+#define MIN_ZERO	256
+
+#define EX_ST(x)	x
+#define EX_RETVAL(x)	x
+#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
+#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P
+
+#if IS_IN (libc)
+
+	.text
+	.align		32
+
+ENTRY(__bzero_niagara7)
+	/* bzero (dst, size)  */
+	mov	%o1, %o2
+	mov	0, %o1
+	/* fall through into memset code */
+END(__bzero_niagara7)
+
+ENTRY(__memset_niagara7)
+	/* memset (src, c, size)  */
+	mov	%o0, %o5		/* copy sp1 before using it  */
+	cmp	%o2, 7			/* if small counts, just write bytes  */
+	bleu,pn %XCC, .Lwrchar
+	 and	%o1, 0xff, %o1		/* o1 is (char)c  */
+
+	sll	%o1, 8, %o3
+	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c  */
+	sll	%o1, 16, %o3
+	cmp	%o2, 32
+	blu,pn	%XCC, .Lwdalign
+	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c  */
+
+	sllx	%o1, 32, %o3
+	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c  */
+
+.Ldbalign:
+	andcc	%o5, 7, %o3		/* is sp1 aligned on a 8 byte bound?  */
+	bz,pt	%XCC, .Lblkalign	/* already long word aligned  */
+	 sub	%o3, 8, %o3		/* -(bytes till long word aligned)  */
+
+	add	%o2, %o3, %o2		/* update o2 with new count  */
+	/* Set -(%o3) bytes till sp1 long word aligned  */
+1:	stb	%o1, [%o5]		/* there is at least 1 byte to set  */
+	inccc	%o3			/* byte clearing loop   */
+	bl,pt	%XCC, 1b
+	 inc	%o5
+
+	/* Now sp1 is long word aligned (sp1 is found in %o5) */
+.Lblkalign:
+	cmp	%o2, 64		/* check if there are 64 bytes to set  */
+	blu,pn	%XCC, .Lwrshort
+	 mov	%o2, %o3
+
+	andcc	%o5, 63, %o3		/* is sp1 block aligned?  */
+	bz,pt	%XCC, .Lblkwr		/* now block aligned  */
+	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned)  */
+	add	%o2, %o3, %o2		/* o2 is the remainder  */
+
+	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
+	/* Use long word stores.  */
+	/* Recall that dst is already long word aligned  */
+1:
+	addcc	%o3, 8, %o3
+	stx	%o1, [%o5]
+	bl,pt	%XCC, 1b
+	 add	%o5, 8, %o5
+
+	/* Now sp1 is block aligned  */
+.Lblkwr:
+	andn	%o2, 63, %o4		/* calculate size of blocks in bytes  */
+	brz,pn	%o1, .Lwrzero		/* special case if c == 0  */
+	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores  */
+
+	cmp	%o4, MIN_LOOP		/* check for enough bytes to set  */
+	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar   */
+	 nop				/* must be > pre-cleared lines  */
+
+	/* initial cache-clearing stores  */
+	/* get store pipeline moving  */
+
+/*	Primary memset loop for large memsets  */
+.Lwr_loop:
+	mov	ST_CHUNK, %g1
+.Lwr_loop_start:
+	subcc	%g1, 4, %g1
+	EX_ST(STORE_ASI(%o1,%o5))
+	add	%o5, 64, %o5
+	EX_ST(STORE_ASI(%o1,%o5))
+	add	%o5, 64, %o5
+	EX_ST(STORE_ASI(%o1,%o5))
+	add	%o5, 64, %o5
+	EX_ST(STORE_ASI(%o1,%o5))
+	bgu	%XCC, .Lwr_loop_start
+	 add	%o5, 64, %o5
+
+	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5  */
+	mov	ST_CHUNK, %g1
+	sub	%o5, 8, %o5		/* adjust %o5 for ASI store  */
+
+.Lwr_loop_rest:
+	stx	%o1,[%o5+8+8]
+	sub	%o4, 64, %o4
+	stx	%o1,[%o5+16+8]
+	subcc	%g1, 1, %g1
+	stx	%o1,[%o5+24+8]
+	stx	%o1,[%o5+32+8]
+	stx	%o1,[%o5+40+8]
+	add	%o5, 64, %o5
+	stx	%o1,[%o5-8]
+	bgu	%XCC, .Lwr_loop_rest
+	 EX_ST(STORE_INIT(%o1,%o5))
+
+	 add	%o5, 8, %o5		/* restore %o5 offset  */
+
+	/* If more than ST_CHUNK*64 bytes remain to set, continue  */
+	/* setting the first long word of each cache line in advance  */
+	/* to keep the store pipeline moving.  */
+
+	cmp	%o4, ST_CHUNK*64
+	bge,pt	%XCC, .Lwr_loop_start
+	 mov	ST_CHUNK, %g1
+
+	brz,a,pn %o4, .Lasi_done
+	 nop
+
+	sub	%o5, 8, %o5		/* adjust %o5 for ASI store  */
+.Lwr_loop_small:
+	add	%o5, 8, %o5		/* adjust %o5 for ASI store  */
+	EX_ST(STORE_ASI(%o1,%o5))
+	stx	%o1,[%o5+8]
+	stx	%o1,[%o5+16]
+	stx	%o1,[%o5+24]
+	stx	%o1,[%o5+32]
+	subcc	%o4, 64, %o4
+	stx	%o1,[%o5+40]
+	add	%o5, 56, %o5
+	stx	%o1,[%o5-8]
+	bgu,pt	%XCC, .Lwr_loop_small
+	 EX_ST(STORE_INIT(%o1,%o5))
+
+	ba	.Lasi_done
+	 add	%o5, 8, %o5		/* restore %o5 offset  */
+
+/*	Special case loop for zero fill memsets  */
+/*	For each 64 byte cache line, single STBI to first element  */
+/*	clears line  */
+.Lwrzero:
+	cmp	%o4, MIN_ZERO		/* check if enough bytes to set  */
+					/* to pay %asi + membar cost  */
+	blu	%XCC, .Lshort_set
+	 nop
+	sub	%o4, 256, %o4
+
+.Lwrzero_loop:
+	mov	64, %g3
+	EX_ST(STORE_INIT(%o1,%o5))
+	subcc	%o4, 256, %o4
+	EX_ST(STORE_INIT(%o1,%o5+%g3))
+	add	%o5, 256, %o5
+	sub	%g3, 192, %g3
+	EX_ST(STORE_INIT(%o1,%o5+%g3))
+	add %g3, 64, %g3
+	bge,pt	%XCC, .Lwrzero_loop
+	 EX_ST(STORE_INIT(%o1,%o5+%g3))
+	add	%o4, 256, %o4
+
+	brz,pn	%o4, .Lbsi_done
+	 nop
+.Lwrzero_small:
+	EX_ST(STORE_INIT(%o1,%o5))
+	subcc	%o4, 64, %o4
+	bgu,pt	%XCC, .Lwrzero_small
+	 add	%o5, 64, %o5
+
+.Lasi_done:
+.Lbsi_done:
+	membar	#StoreStore		/* required by use of BSI  */
+
+.Lshort_set:
+	cmp	%o4, 64			/* check if 64 bytes to set  */
+	blu	%XCC, 5f
+	 nop
+4:					/* set final blocks of 64 bytes  */
+	stx	%o1, [%o5]
+	stx	%o1, [%o5+8]
+	stx	%o1, [%o5+16]
+	stx	%o1, [%o5+24]
+	subcc	%o4, 64, %o4
+	stx	%o1, [%o5+32]
+	stx	%o1, [%o5+40]
+	add	%o5, 64, %o5
+	stx	%o1, [%o5-16]
+	bgu,pt	%XCC, 4b
+	 stx	%o1, [%o5-8]
+
+5:
+	/* Set the remaining long words  */
+.Lwrshort:
+	subcc	%o3, 8, %o3		/* Can we store any long words?  */
+	blu,pn	%XCC, .Lwrchars
+	 and	%o2, 7, %o2		/* calc bytes left after long words  */
+6:
+	subcc	%o3, 8, %o3
+	stx	%o1, [%o5]		/* store the long words  */
+	bgeu,pt %XCC, 6b
+	 add	%o5, 8, %o5
+
+.Lwrchars:				/* check for extra chars  */
+	brnz	%o2, .Lwrfin
+	 nop
+	retl
+	 nop
+
+.Lwdalign:
+	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary  */
+	bz,pn	%XCC, .Lwrword
+	 andn	%o2, 3, %o3		/* create word sized count in %o3  */
+
+	dec	%o2			/* decrement count  */
+	stb	%o1, [%o5]		/* clear a byte  */
+	b	.Lwdalign
+	 inc	%o5			/* next byte  */
+
+.Lwrword:
+	subcc	%o3, 4, %o3
+	st	%o1, [%o5]		/* 4-byte writing loop  */
+	bnz,pt	%XCC, .Lwrword
+	 add	%o5, 4, %o5
+	and	%o2, 3, %o2		/* leftover count, if any  */
+
+.Lwrchar:
+	/* Set the remaining bytes, if any  */
+	brz	%o2, .Lexit
+	 nop
+.Lwrfin:
+	deccc	%o2
+	stb	%o1, [%o5]
+	bgu,pt	%XCC, .Lwrfin
+	 inc	%o5
+.Lexit:
+	retl				/* %o0 was preserved  */
+	 nop
+END(__memset_niagara7)
+#endif
-- 
cgit v1.1