aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/memset.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/memset.S')
-rw-r--r--sysdeps/powerpc/memset.S121
1 files changed, 115 insertions, 6 deletions
diff --git a/sysdeps/powerpc/memset.S b/sysdeps/powerpc/memset.S
index eb11bde..bee87af 100644
--- a/sysdeps/powerpc/memset.S
+++ b/sysdeps/powerpc/memset.S
@@ -21,12 +21,26 @@
#include <bp-sym.h>
#include <bp-asm.h>
+/* Define a global static that can hold the cache line size. The
+ assumption is that startup code will access the "aux vector" to
+ to obtain the value set by the kernel and store it into this
+ variable. */
+
+ .globl __cache_line_size
+ .section ".data","aw"
+ .align 2
+ .type __cache_line_size,@object
+ .size __cache_line_size,4
+__cache_line_size:
+ .long 0
+ .section ".text"
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'.
- The memset is done in three sizes: byte (8 bits), word (32 bits),
- cache line (256 bits). There is a special case for setting cache lines
- to 0, to take advantage of the dcbz instruction. */
+ The memset is done in four sizes: byte (8 bits), word (32 bits),
+ 32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
+ There is a special case for setting whole cache lines to 0, which
+ takes advantage of the dcbz instruction. */
EALIGN (BP_SYM (memset), 5, 1)
@@ -50,6 +64,10 @@ EALIGN (BP_SYM (memset), 5, 1)
#define rNEG64 r8 /* constant -64 for clearing with dcbz */
#define rNEG32 r9 /* constant -32 for clearing with dcbz */
+#define rGOT r9 /* Address of the Global Offset Table. */
+#define rCLS r8 /* Cache line size obtained from static. */
+#define rCLM r9 /* Cache line size mask to check for cache alignment. */
+
#if __BOUNDED_POINTERS__
cmplwi cr1, rRTN, 0
CHECK_BOUNDS_BOTH_WIDE (rMEMP0, rTMP, rTMP2, rLEN)
@@ -105,7 +123,17 @@ L(caligned):
cmplwi cr1, rCHR, 0
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
- beq cr1, L(zloopstart) /* special case for clearing memory using dcbz */
+
+/* Check if we can use the special case for clearing memory using dcbz.
+ This requires that we know the correct cache line size for this
+ processor. Getting the __cache_line_size may require establishing GOT
+ addressability, so branch out of line to set this up. */
+ beq cr1, L(checklinesize)
+
+/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
+ Can't assume that rCHR is zero or that the cache line size is either
+ 32-bytes or even known. */
+L(nondcbz):
srwi rTMP, rALIGN, 5
mtctr rTMP
beq L(medium) /* we may not actually get to do a full line */
@@ -114,7 +142,9 @@ L(caligned):
li rNEG64, -0x40
bdz L(cloopdone) /* 48th instruction from .align */
-L(c3): dcbz rNEG64, rMEMP
+/* We can't use dcbz here as we don't know the cache line size. We can
+ use "data cache block touch for store", which is safe. */
+L(c3): dcbtst rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
stw rCHR, -12(rMEMP)
@@ -142,7 +172,10 @@ L(cloopdone):
.align 5
nop
-/* Clear lines of memory in 128-byte chunks. */
+/* Clear cache lines of memory in 128-byte chunks.
+ This code is optimized for processors with 32-byte cache lines.
+ It is further optimized for the 601 processor, which requires
+ some care in how the code is aligned in the i-cache. */
L(zloopstart):
clrlwi rLEN, rLEN, 27
mtcrf 0x02, rALIGN
@@ -226,4 +259,80 @@ L(medium_28t):
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
blr
+
+L(checklinesize):
+#ifdef SHARED
+ mflr rTMP
+/* If the remaining length is less the 32 bytes then don't bother getting
+ the cache line size. */
+ beq L(medium)
+/* Establishes GOT addressability so we can load __cache_line_size
+ from static. This value was set from the aux vector during startup. */
+ bl _GLOBAL_OFFSET_TABLE_@local-4
+ mflr rGOT
+ lwz rGOT,__cache_line_size@got(rGOT)
+ lwz rCLS,0(rGOT)
+ mtlr rTMP
+#else
+/* Load __cache_line_size from static. This value was set from the
+ aux vector during startup. */
+ lis rCLS,__cache_line_size@ha
+/* If the remaining length is less the 32 bytes then don't bother getting
+ the cache line size. */
+ beq L(medium)
+ lwz rCLS,__cache_line_size@l(rCLS)
+#endif
+
+/*If the cache line size was not set then goto to L(nondcbz), which is
+ safe for any cache line size. */
+ cmplwi cr1,rCLS,0
+ beq cr1,L(nondcbz)
+
+/* If the cache line size is 32 bytes then goto to L(zloopstart),
+ which is coded specificly for 32-byte lines (and 601). */
+ cmplwi cr1,rCLS,32
+ beq cr1,L(zloopstart)
+
+/* Now we know the cache line size and it is not 32-bytes. However
+ we may not yet be aligned to the cache line and may have a partial
+ line to fill. Touch it 1st to fetch the cache line. */
+ dcbtst 0,rMEMP
+
+ addi rCLM,rCLS,-1
+L(getCacheAligned):
+ cmplwi cr1,rLEN,32
+ and. rTMP,rCLM,rMEMP
+ blt cr1,L(handletail32)
+ beq L(cacheAligned)
+/* We are not aligned to start of a cache line yet. Store 32-byte
+ of data and test again. */
+ addi rMEMP,rMEMP,32
+ addi rLEN,rLEN,-32
+ stw rCHR,-32(rMEMP)
+ stw rCHR,-28(rMEMP)
+ stw rCHR,-24(rMEMP)
+ stw rCHR,-20(rMEMP)
+ stw rCHR,-16(rMEMP)
+ stw rCHR,-12(rMEMP)
+ stw rCHR,-8(rMEMP)
+ stw rCHR,-4(rMEMP)
+ b L(getCacheAligned)
+
+/* Now we are aligned to the cache line and can use dcbz. */
+L(cacheAligned):
+ cmplw cr1,rLEN,rCLS
+ blt cr1,L(handletail32)
+ dcbz 0,rMEMP
+ subf rLEN,rCLS,rLEN
+ add rMEMP,rMEMP,rCLS
+ b L(cacheAligned)
+
+/* We are here because; the cache line size was set, it was not
+ 32-bytes, and the remainder (rLEN) is now less than the actual cache
+ line size. Set up the preconditions for L(nondcbz) and go there to
+ store the remaining bytes. */
+L(handletail32):
+ clrrwi. rALIGN, rLEN, 5
+ b L(nondcbz)
+
END (BP_SYM (memset))