Adjust writeback in non-zero memset

This fixes an inefficiency in the non-zero memset. Delaying the writeback until the end of the loop is slightly faster on some cores - this shows ~5% performance gain on Cortex-A53 when doing large non-zero memsets. Tested against the GLIBC testsuite.
author: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2018-11-06 14:42:10 +0000
committer: Richard Earnshaw <Richard.Earnshaw@arm.com> 2018-11-06 14:59:51 +0000
commit: d80db600664bec381230be85955b54884f21a619 (patch)
tree: 3a3c9725ac7206987203f7eeefe1a9f209d30c3e
parent: 535903696c339b71edd3575ab44bbf2e5eab689a (diff)
download: newlib-d80db600664bec381230be85955b54884f21a619.zip
newlib-d80db600664bec381230be85955b54884f21a619.tar.gz
newlib-d80db600664bec381230be85955b54884f21a619.tar.bz2
1 files changed, 3 insertions, 3 deletions
diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S
index 799e7b7..7c8fe58 100644
--- a/newlib/libc/machine/aarch64/memset.S
+++ b/newlib/libc/machine/aarch64/memset.S
@@ -142,10 +142,10 @@ L(set_long):
 	b.eq	L(try_zva)
 L(no_zva):
 	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
 	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
+1:	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
 L(tail64):
 	subs	count, count, 64
 	b.hi	1b
author	Wilco Dijkstra <Wilco.Dijkstra@arm.com>	2018-11-06 14:42:10 +0000
committer	Richard Earnshaw <Richard.Earnshaw@arm.com>	2018-11-06 14:59:51 +0000
commit	d80db600664bec381230be85955b54884f21a619 (patch)
tree	3a3c9725ac7206987203f7eeefe1a9f209d30c3e
parent	535903696c339b71edd3575ab44bbf2e5eab689a (diff)
download	newlib-d80db600664bec381230be85955b54884f21a619.zip newlib-d80db600664bec381230be85955b54884f21a619.tar.gz newlib-d80db600664bec381230be85955b54884f21a619.tar.bz2