aboutsummaryrefslogtreecommitdiff
path: root/libgcc
diff options
context:
space:
mode:
author Roger Sayle <roger@nextmovesoftware.com> 2024-06-14 06:29:27 +0100
committer Roger Sayle <roger@nextmovesoftware.com> 2024-06-14 06:29:27 +0100
commit c129a34dc8e69f7b34cf72835aeba2cefbb8673a (patch)
tree 0f9770c277122cd13d787a0f68c19e51cdf27542 /libgcc
parent d8a6de9e2b850b71712e89e8e6026e4ae6284766 (diff)
download gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.zip
gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.tar.gz
gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.tar.bz2
i386: More use of m{32,64}bcst addressing modes with ternlog.
This patch makes more use of m32bcst and m64bcst addressing modes in ix86_expand_ternlog. Previously, the i386 backend would only consider using an m32bcst if the inner mode of the vector was 32-bits, or using m64bcst if the inner mode was 64-bits. For ternlog (and other logic operations) this is a strange restriction, as how the same constant is materialized is dependent upon the mode it is used/operated on. Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202} which has the same bit pattern would. This can be optimized by (re)checking whether a CONST_VECTOR can be broadcast from memory after casting it to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size. Taking the test case from pr115407: __attribute__((__vector_size__(64))) char v; void foo() { v = v | v << 7; } Compiled with -O2 -mcmodel=large -mavx512bw GCC 14 generates a 64-byte (512-bit) load from the constant pool: foo: movabsq $v, %rax // 10 movabsq $.LC0, %rdx // 10 vpsllw $7, (%rax), %zmm1 // 7 vmovdqa64 (%rax), %zmm0 // 6 vpternlogd $248, (%rdx), %zmm1, %zmm0 // 7 vmovdqa64 %zmm0, (%rax) // 6 vzeroupper // 3 ret // 1 .LC0: .byte -12 // 64 = 114 bytes .byte -128 ;; repeated another 62 times mainline currently generates two instructions, using interunit broadcast: foo: movabsq $v, %rdx // 10 movl $-2139062144, %eax // 5 vmovdqa64 (%rdx), %zmm2 // 6 vpbroadcastd %eax, %zmm0 // 6 vpsllw $7, %zmm2, %zmm1 // 7 vpternlogd $236, %zmm0, %zmm2, %zmm1 // 7 vmovdqa64 %zmm1, (%rdx) // 6 vzeroupper // 3 ret // 1 = 51 bytes With this patch, we now generate a broadcast addressing mode: foo: movabsq $v, %rax // 10 movabsq $.LC1, %rdx // 10 vmovdqa64 (%rax), %zmm1 // 6 vpsllw $7, %zmm1, %zmm0 // 7 vpternlogd $236, (%rdx){1to16}, %zmm1, %zmm0 // 7 vmovdqa64 %zmm0, (%rax) // 6 vzeroupper // 3 ret // 1 = 50 total Without -mcmodel=large, the benefit is two instructions: foo: vmovdqa64 v(%rip), %zmm1 // 10 vpsllw $7, %zmm1, 
%zmm0 // 7 vpternlogd $236, .LC2(%rip){1to16}, %zmm1, %zmm0 // 11 vmovdqa64 %zmm0, v(%rip) // 10 vzeroupper // 3 ret // 1 = 42 total 2024-06-14 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing logic operation in a different vector mode if that enables use of a 32-bit or 64-bit broadcast addressing mode. gcc/testsuite/ChangeLog * gcc.target/i386/pr115407.c: New test case.
Diffstat (limited to 'libgcc')
0 files changed, 0 insertions, 0 deletions