aboutsummaryrefslogtreecommitdiff
path: root/gcc/config/i386
diff options
context:
space:
mode:
authorRoger Sayle <roger@nextmovesoftware.com>2022-01-08 12:27:50 +0000
committerRoger Sayle <roger@nextmovesoftware.com>2022-01-08 12:27:50 +0000
commitfad14a028f3ec2fbdc95c686a59891141c8989df (patch)
treec8da34db734921e48f2d1d731063ed530e9473ba /gcc/config/i386
parent51d464b608b38b9e2007948d10b1e0f1dcec142c (diff)
downloadgcc-fad14a028f3ec2fbdc95c686a59891141c8989df.zip
gcc-fad14a028f3ec2fbdc95c686a59891141c8989df.tar.gz
gcc-fad14a028f3ec2fbdc95c686a59891141c8989df.tar.bz2
x86_64: Improve (interunit) moves from TImode to V1TImode.
This patch improves the code generated when moving a 128-bit value in TImode, represented by two 64-bit registers, to V1TImode, which is a single SSE register. Currently, the simple move: typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); uv1ti foo(__int128 x) { return (uv1ti)x; } is always transferred via memory, as: foo: movq %rdi, -24(%rsp) movq %rsi, -16(%rsp) movdqa -24(%rsp), %xmm0 ret with this patch, we now generate (with -msse2): foo: movq %rdi, %xmm1 movq %rsi, %xmm2 punpcklqdq %xmm2, %xmm1 movdqa %xmm1, %xmm0 ret and with -mavx2: foo: vmovq %rdi, %xmm1 vpinsrq $1, %rsi, %xmm1, %xmm0 ret Even more dramatic is the improvement of zero extended transfers. uv1ti bar(unsigned char c) { return (uv1ti)(__int128)c; } Previously generated: bar: movq $0, -16(%rsp) movzbl %dil, %eax movq %rax, -24(%rsp) vmovdqa -24(%rsp), %xmm0 ret Now generates: bar: movzbl %dil, %edi movq %rdi, %xmm0 ret My first attempt at this functionality attempted to use a simple define_split, but unfortunately, this triggers very late during the compilation preventing some of the simplifications we'd like (in combine). For example the foo case above becomes: foo: movq %rsi, -16(%rsp) movq %rdi, %xmm0 movhps -16(%rsp), %xmm0 transferring half directly, and the other half via memory. And for the bar case above, GCC fails to appreciate that movq/vmovq clears the high bits, resulting in: bar: movzbl %dil, %eax xorl %edx, %edx vmovq %rax, %xmm1 vpinsrq $1, %rdx, %xmm1, %xmm0 ret Hence the solution (i.e. this patch) is to add a special case to ix86_expand_vector_move for TImode to V1TImode transfers. 2022-01-08 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-expand.c (ix86_expand_vector_move): Add special case for TImode to V1TImode moves, going via V2DImode. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-v1ti-mov-1.c: New test case. * gcc.target/i386/sse2-v1ti-zext.c: New test case.
Diffstat (limited to 'gcc/config/i386')
-rw-r--r--gcc/config/i386/i386-expand.c17
1 file changed, 17 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 0d4ae5a..95bba25 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -615,6 +615,23 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
return;
}
+ /* Special case TImode to V1TImode conversions, via V2DI. */
+ if (mode == V1TImode
+ && SUBREG_P (op1)
+ && GET_MODE (SUBREG_REG (op1)) == TImode
+ && TARGET_64BIT && TARGET_SSE
+ && can_create_pseudo_p ())
+ {
+ rtx tmp = gen_reg_rtx (V2DImode);
+ rtx lo = gen_reg_rtx (DImode);
+ rtx hi = gen_reg_rtx (DImode);
+ emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
+ emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
+ emit_insn (gen_vec_concatv2di (tmp, lo, hi));
+ emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+ return;
+ }
+
/* If operand0 is a hard register, make operand1 a pseudo. */
if (can_create_pseudo_p ()
&& !ix86_hardreg_mov_ok (op0, op1))