diff options
author | Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> | 2017-10-25 13:13:53 -0200 |
---|---|---|
committer | Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> | 2017-10-25 13:14:30 -0200 |
commit | 63da5cd4a097d089033d980c42254c3356fa723f (patch) | |
tree | 0c503790b1b430759f4d292a2a86e49b96531c47 | |
parent | a122dbfb2e7f90b338b633a73b576efa258e215d (diff) | |
download | glibc-63da5cd4a097d089033d980c42254c3356fa723f.zip glibc-63da5cd4a097d089033d980c42254c3356fa723f.tar.gz glibc-63da5cd4a097d089033d980c42254c3356fa723f.tar.bz2 |
powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
POWER9 DD2.1 and earlier has an issue where some cache inhibited
vector load traps to the kernel, causing a performance degradation. To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.
Reference: https://patchwork.ozlabs.org/patch/814059/
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
lxvd2x/stxvd2x with lvx/stvx.
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/memcpy.S | 64 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/memmove.S | 128 |
3 files changed, 102 insertions, 96 deletions
@@ -1,3 +1,9 @@ +2017-10-25 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace + lxvd2x/stxvd2x with lvx/stvx. + * sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise. + 2017-10-25 H.J. Lu <hongjiu.lu@intel.com> * include/alloc_buffer.h: Replace "if if " with "if " in diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S index 1ccbc2e..a7cdf8b 100644 --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S @@ -91,63 +91,63 @@ L(aligned_copy): srdi 12,cnt,7 cmpdi 12,0 beq L(aligned_tail) - lxvd2x 6,0,src - lxvd2x 7,src,6 + lvx 6,0,src + lvx 7,src,6 mtctr 12 b L(aligned_128loop) .align 4 L(aligned_128head): /* for the 2nd + iteration of this loop. */ - lxvd2x 6,0,src - lxvd2x 7,src,6 + lvx 6,0,src + lvx 7,src,6 L(aligned_128loop): - lxvd2x 8,src,7 - lxvd2x 9,src,8 - stxvd2x 6,0,dst + lvx 8,src,7 + lvx 9,src,8 + stvx 6,0,dst addi src,src,64 - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 - lxvd2x 6,0,src - lxvd2x 7,src,6 + stvx 7,dst,6 + stvx 8,dst,7 + stvx 9,dst,8 + lvx 6,0,src + lvx 7,src,6 addi dst,dst,64 - lxvd2x 8,src,7 - lxvd2x 9,src,8 + lvx 8,src,7 + lvx 9,src,8 addi src,src,64 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 + stvx 6,0,dst + stvx 7,dst,6 + stvx 8,dst,7 + stvx 9,dst,8 addi dst,dst,64 bdnz L(aligned_128head) L(aligned_tail): mtocrf 0x01,cnt bf 25,32f - lxvd2x 6,0,src - lxvd2x 7,src,6 - lxvd2x 8,src,7 - lxvd2x 9,src,8 + lvx 6,0,src + lvx 7,src,6 + lvx 8,src,7 + lvx 9,src,8 addi src,src,64 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 + stvx 6,0,dst + stvx 7,dst,6 + stvx 8,dst,7 + stvx 9,dst,8 addi dst,dst,64 32: bf 26,16f - lxvd2x 6,0,src - lxvd2x 7,src,6 + lvx 6,0,src + lvx 7,src,6 addi src,src,32 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 + stvx 6,0,dst + stvx 7,dst,6 addi dst,dst,32 16: bf 27,8f - lxvd2x 6,0,src + lvx 6,0,src addi src,src,16 - stxvd2x 6,0,dst + stvx 6,0,dst addi dst,dst,16 8: bf 28,4f diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S index 93baa69..667c6e2 100644 --- a/sysdeps/powerpc/powerpc64/power7/memmove.S +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S @@ -92,63 +92,63 @@ L(aligned_copy): srdi 12,r5,7 cmpdi 12,0 beq L(aligned_tail) - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 + lvx 6,0,r4 + lvx 7,r4,6 mtctr 12 b L(aligned_128loop) .align 4 L(aligned_128head): /* for the 2nd + iteration of this loop. */ - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 + lvx 6,0,r4 + lvx 7,r4,6 L(aligned_128loop): - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 - stxvd2x 6,0,r11 + lvx 8,r4,7 + lvx 9,r4,8 + stvx 6,0,r11 addi r4,r4,64 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 + stvx 7,r11,6 + stvx 8,r11,7 + stvx 9,r11,8 + lvx 6,0,r4 + lvx 7,r4,6 addi r11,r11,64 - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 + lvx 8,r4,7 + lvx 9,r4,8 addi r4,r4,64 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 + stvx 6,0,r11 + stvx 7,r11,6 + stvx 8,r11,7 + stvx 9,r11,8 addi r11,r11,64 bdnz L(aligned_128head) L(aligned_tail): mtocrf 0x01,r5 bf 25,32f - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 + lvx 6,0,r4 + lvx 7,r4,6 + lvx 8,r4,7 + lvx 9,r4,8 addi r4,r4,64 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 + stvx 6,0,r11 + stvx 7,r11,6 + stvx 8,r11,7 + stvx 9,r11,8 addi r11,r11,64 32: bf 26,16f - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 + lvx 6,0,r4 + lvx 7,r4,6 addi r4,r4,32 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 + stvx 6,0,r11 + stvx 7,r11,6 addi r11,r11,32 16: bf 27,8f - lxvd2x 6,0,r4 + lvx 6,0,r4 addi r4,r4,16 - stxvd2x 6,0,r11 + stvx 6,0,r11 addi r11,r11,16 8: bf 28,4f @@ -488,63 +488,63 @@ L(aligned_copy_bwd): srdi r12,r5,7 cmpdi r12,0 beq L(aligned_tail_bwd) - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 + lvx v6,r4,r6 + lvx v7,r4,r7 mtctr 12 b L(aligned_128loop_bwd) .align 4 L(aligned_128head_bwd): /* for the 2nd + iteration of this loop. */ - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 + lvx v6,r4,r6 + lvx v7,r4,r7 L(aligned_128loop_bwd): - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 - stxvd2x v6,r11,r6 + lvx v8,r4,r8 + lvx v9,r4,r9 + stvx v6,r11,r6 subi r4,r4,64 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 - lxvd2x v6,r4,r6 - lxvd2x v7,r4,7 + stvx v7,r11,r7 + stvx v8,r11,r8 + stvx v9,r11,r9 + lvx v6,r4,r6 + lvx v7,r4,7 subi r11,r11,64 - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 + lvx v8,r4,r8 + lvx v9,r4,r9 subi r4,r4,64 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 + stvx v6,r11,r6 + stvx v7,r11,r7 + stvx v8,r11,r8 + stvx v9,r11,r9 subi r11,r11,64 bdnz L(aligned_128head_bwd) L(aligned_tail_bwd): mtocrf 0x01,r5 bf 25,32f - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 + lvx v6,r4,r6 + lvx v7,r4,r7 + lvx v8,r4,r8 + lvx v9,r4,r9 subi r4,r4,64 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 + stvx v6,r11,r6 + stvx v7,r11,r7 + stvx v8,r11,r8 + stvx v9,r11,r9 subi r11,r11,64 32: bf 26,16f - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 + lvx v6,r4,r6 + lvx v7,r4,r7 subi r4,r4,32 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 + stvx v6,r11,r6 + stvx v7,r11,r7 subi r11,r11,32 16: bf 27,8f - lxvd2x v6,r4,r6 + lvx v6,r4,r6 subi r4,r4,16 - stxvd2x v6,r11,r6 + stvx v6,r11,r6 subi r11,r11,16 8: bf 28,4f |