aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power7/memcpy.S
diff options
context:
space:
mode:
authorRajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>2017-10-25 13:13:53 -0200
committerTulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>2017-10-25 13:14:30 -0200
commit63da5cd4a097d089033d980c42254c3356fa723f (patch)
tree0c503790b1b430759f4d292a2a86e49b96531c47 /sysdeps/powerpc/powerpc64/power7/memcpy.S
parenta122dbfb2e7f90b338b633a73b576efa258e215d (diff)
downloadglibc-63da5cd4a097d089033d980c42254c3356fa723f.zip
glibc-63da5cd4a097d089033d980c42254c3356fa723f.tar.gz
glibc-63da5cd4a097d089033d980c42254c3356fa723f.tar.bz2
powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
POWER9 DD2.1 and earlier has an issue where some cache inhibited vector load traps to the kernel, causing a performance degradation. To handle this in memcpy and memmove, lvx/stvx is used for aligned addresses instead of lxvd2x/stxvd2x. Reference: https://patchwork.ozlabs.org/patch/814059/ * sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace lxvd2x/stxvd2x with lvx/stvx. * sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise. Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/memcpy.S')
-rw-r--r--sysdeps/powerpc/powerpc64/power7/memcpy.S64
1 files changed, 32 insertions, 32 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 1ccbc2e..a7cdf8b 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@ L(aligned_copy):
srdi 12,cnt,7
cmpdi 12,0
beq L(aligned_tail)
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
mtctr 12
b L(aligned_128loop)
.align 4
L(aligned_128head):
/* for the 2nd + iteration of this loop. */
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
L(aligned_128loop):
- lxvd2x 8,src,7
- lxvd2x 9,src,8
- stxvd2x 6,0,dst
+ lvx 8,src,7
+ lvx 9,src,8
+ stvx 6,0,dst
addi src,src,64
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
+ lvx 6,0,src
+ lvx 7,src,6
addi dst,dst,64
- lxvd2x 8,src,7
- lxvd2x 9,src,8
+ lvx 8,src,7
+ lvx 9,src,8
addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
addi dst,dst,64
bdnz L(aligned_128head)
L(aligned_tail):
mtocrf 0x01,cnt
bf 25,32f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- lxvd2x 8,src,7
- lxvd2x 9,src,8
+ lvx 6,0,src
+ lvx 7,src,6
+ lvx 8,src,7
+ lvx 9,src,8
addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
+ stvx 6,0,dst
+ stvx 7,dst,6
+ stvx 8,dst,7
+ stvx 9,dst,8
addi dst,dst,64
32:
bf 26,16f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
+ lvx 6,0,src
+ lvx 7,src,6
addi src,src,32
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
+ stvx 6,0,dst
+ stvx 7,dst,6
addi dst,dst,32
16:
bf 27,8f
- lxvd2x 6,0,src
+ lvx 6,0,src
addi src,src,16
- stxvd2x 6,0,dst
+ stvx 6,0,dst
addi dst,dst,16
8:
bf 28,4f