diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2024-06-14 06:29:27 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2024-06-14 06:29:27 +0100 |
commit | c129a34dc8e69f7b34cf72835aeba2cefbb8673a (patch) | |
tree | 0f9770c277122cd13d787a0f68c19e51cdf27542 /gcc | |
parent | d8a6de9e2b850b71712e89e8e6026e4ae6284766 (diff) | |
download | gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.zip gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.tar.gz gcc-c129a34dc8e69f7b34cf72835aeba2cefbb8673a.tar.bz2 |
i386: More use of m{32,64}bcst addressing modes with ternlog.
This patch makes more use of m32bcst and m64bcst addressing modes in
ix86_expand_ternlog. Previously, the i386 backend would only consider
using m32bcst if the inner mode of the vector was 32 bits, or using
m64bcst if the inner mode was 64 bits. For ternlog (and other logic
operations) this is a strange restriction, as it makes the way the same
constant is materialized depend on the mode in which it is used/operated on.
Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
which has the same bit pattern would. This can be optimized by (re)checking
whether a CONST_VECTOR can be broadcast from memory after casting it
to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size.
Taking the test case from pr115407:
__attribute__((__vector_size__(64))) char v;
void foo() {
v = v | v << 7;
}
Compiled with -O2 -mcmodel=large -mavx512bw
GCC 14 generates a 64-byte (512-bit) load from the constant pool:
foo: movabsq $v, %rax // 10
movabsq $.LC0, %rdx // 10
vpsllw $7, (%rax), %zmm1 // 7
vmovdqa64 (%rax), %zmm0 // 6
vpternlogd $248, (%rdx), %zmm1, %zmm0 // 7
vmovdqa64 %zmm0, (%rax) // 6
vzeroupper // 3
ret // 1
.LC0: .byte -128 // 64 = 114 bytes
.byte -128
;; repeated another 62 times
Mainline currently materializes the constant with two instructions, using an interunit broadcast:
foo: movabsq $v, %rdx // 10
movl $-2139062144, %eax // 5
vmovdqa64 (%rdx), %zmm2 // 6
vpbroadcastd %eax, %zmm0 // 6
vpsllw $7, %zmm2, %zmm1 // 7
vpternlogd $236, %zmm0, %zmm2, %zmm1 // 7
vmovdqa64 %zmm1, (%rdx) // 6
vzeroupper // 3
ret // 1 = 51 bytes
With this patch, we now generate a broadcast addressing mode:
foo: movabsq $v, %rax // 10
movabsq $.LC1, %rdx // 10
vmovdqa64 (%rax), %zmm1 // 6
vpsllw $7, %zmm1, %zmm0 // 7
vpternlogd $236, (%rdx){1to16}, %zmm1, %zmm0 // 7
vmovdqa64 %zmm0, (%rax) // 6
vzeroupper // 3
ret // 1 = 50 total
Without -mcmodel=large, the benefit is two instructions:
foo: vmovdqa64 v(%rip), %zmm1 // 10
vpsllw $7, %zmm1, %zmm0 // 7
vpternlogd $236, .LC2(%rip){1to16}, %zmm1, %zmm0 // 11
vmovdqa64 %zmm0, v(%rip) // 10
vzeroupper // 3
ret // 1 = 42 total
2024-06-14 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
logic operation in a different vector mode if that enables use of
a 32-bit or 64-bit broadcast addressing mode.
gcc/testsuite/ChangeLog
* gcc.target/i386/pr115407.c: New test case.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386-expand.cc | 63 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr115407.c | 9 |
2 files changed, 72 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 312329e..a4379b8 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26041,6 +26041,69 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, tmp2 = ix86_gen_bcst_mem (mode, op2); if (!tmp2) { + machine_mode bcst32_mode = mode; + machine_mode bcst64_mode = mode; + switch (mode) + { + case V1TImode: + case V4SImode: + case V4SFmode: + case V8HImode: + case V16QImode: + bcst32_mode = V4SImode; + bcst64_mode = V2DImode; + break; + + case V2TImode: + case V8SImode: + case V8SFmode: + case V16HImode: + case V32QImode: + bcst32_mode = V8SImode; + bcst64_mode = V4DImode; + break; + + case V4TImode: + case V16SImode: + case V16SFmode: + case V32HImode: + case V64QImode: + bcst32_mode = V16SImode; + bcst64_mode = V8DImode; + break; + + default: + break; + } + + if (bcst32_mode != mode) + { + tmp2 = gen_lowpart (bcst32_mode, op2); + if (ix86_gen_bcst_mem (bcst32_mode, tmp2)) + { + tmp2 = ix86_expand_ternlog (bcst32_mode, + gen_lowpart (bcst32_mode, tmp0), + gen_lowpart (bcst32_mode, tmp1), + tmp2, idx, NULL_RTX); + emit_move_insn (target, gen_lowpart (mode, tmp2)); + return target; + } + } + + if (bcst64_mode != mode) + { + tmp2 = gen_lowpart (bcst64_mode, op2); + if (ix86_gen_bcst_mem (bcst64_mode, tmp2)) + { + tmp2 = ix86_expand_ternlog (bcst64_mode, + gen_lowpart (bcst64_mode, tmp0), + gen_lowpart (bcst64_mode, tmp1), + tmp2, idx, NULL_RTX); + emit_move_insn (target, gen_lowpart (mode, tmp2)); + return target; + } + } + tmp2 = force_const_mem (mode, op2); rtx bcast = ix86_broadcast_from_constant (mode, tmp2); tmp2 = validize_mem (tmp2); diff --git a/gcc/testsuite/gcc.target/i386/pr115407.c b/gcc/testsuite/gcc.target/i386/pr115407.c new file mode 100644 index 0000000..b6cb7a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr115407.c @@ -0,0 +1,9 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -mcmodel=large -mavx512bw" } */ +__attribute__((__vector_size__(64))) char v; + +void foo() { + v = v | v << 7; +} + +/* { dg-final { scan-assembler "vpternlog.*1to16" } } */ |