aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorRoger Sayle <roger@nextmovesoftware.com>2023-12-19 11:24:36 +0000
committerRoger Sayle <roger@nextmovesoftware.com>2023-12-19 11:24:36 +0000
commit7e1c440bc84c02e67b1cf338579a3274cdc337e0 (patch)
treef9312aa153a0a9e62369c1f72b8f3fab7e08f03d /gcc
parentcf840a7f7c14242ab7018071310851486a557d4f (diff)
downloadgcc-7e1c440bc84c02e67b1cf338579a3274cdc337e0.zip
gcc-7e1c440bc84c02e67b1cf338579a3274cdc337e0.tar.gz
gcc-7e1c440bc84c02e67b1cf338579a3274cdc337e0.tar.bz2
i386: Improved TImode (128-bit) integer constants on x86_64.
This patch fixes two issues with the handling of 128-bit TImode integer constants in the x86_64 backend. The main issue is that GCC always tries to load 128-bit integer constants via broadcasts to vector SSE registers, even if the result is required in general registers. This is seen in the two closely related functions below: __int128 m; void foo() { m &= CONST; } void bar() { m = CONST; } When compiled with -O2 -mavx, we currently generate: foo: movabsq $81985529216486895, %rax vmovq %rax, %xmm0 vpunpcklqdq %xmm0, %xmm0, %xmm0 vmovq %xmm0, %rax vpextrq $1, %xmm0, %rdx andq %rax, m(%rip) andq %rdx, m+8(%rip) ret bar: movabsq $81985529216486895, %rax vmovq %rax, %xmm1 vpunpcklqdq %xmm1, %xmm1, %xmm0 vpextrq $1, %xmm0, %rdx vmovq %xmm0, m(%rip) movq %rdx, m+8(%rip) ret With this patch we defer the decision to use vector broadcast for TImode until we know that we actually want a SSE register result, by moving the call to ix86_convert_const_wide_int_to_broadcast from the RTL expansion pass, to the scalar-to-vector (STV) pass. With this change (and a minor tweak described below) we now generate: foo: movabsq $81985529216486895, %rax andq %rax, m(%rip) andq %rax, m+8(%rip) ret bar: movabsq $81985529216486895, %rax vmovq %rax, %xmm0 vpunpcklqdq %xmm0, %xmm0, %xmm0 vmovdqa %xmm0, m(%rip) ret showing that we now correctly use vector mode broadcasts (only) where appropriate. The one minor tweak mentioned above is to enable the un-cprop hi/lo optimization, that I originally contributed back in September 2004 https://gcc.gnu.org/pipermail/gcc-patches/2004-September/148756.html even when not optimizing for size. Without this (and currently with just -O2) the function foo above generates: foo: movabsq $81985529216486895, %rax movabsq $81985529216486895, %rdx andq %rax, m(%rip) andq %rdx, m+8(%rip) ret I'm not sure why (back in 2004) I thought that avoiding the implicit "movq %rax, %rdx" instead of a second load was faster, perhaps avoiding a dependency to allow better scheduling, but nowadays "movq %rax, %rdx" is either eliminated by GCC's hardreg cprop pass, or special cased by modern hardware, making the first foo preferrable, not only shorter but also faster. 2023-12-19 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-expand.cc (ix86_convert_const_wide_int_to_broadcast): Remove static. (ix86_expand_move): Don't attempt to convert wide constants to SSE using ix86_convert_const_wide_int_to_broadcast here. (ix86_split_long_move): Always un-cprop multi-word constants. * config/i386/i386-expand.h (ix86_convert_const_wide_int_to_broadcast): Prototype here. * config/i386/i386-features.cc: Include i386-expand.h. (timode_scalar_chain::convert_insn): When converting TImode to V1TImode, try ix86_convert_const_wide_int_to_broadcast. gcc/testsuite/ChangeLog * gcc.target/i386/movti-2.c: New test case. * gcc.target/i386/movti-3.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-expand.cc31
-rw-r--r--gcc/config/i386/i386-expand.h1
-rw-r--r--gcc/config/i386/i386-features.cc18
-rw-r--r--gcc/testsuite/gcc.target/i386/movti-2.c12
-rw-r--r--gcc/testsuite/gcc.target/i386/movti-3.c12
5 files changed, 50 insertions, 24 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index fad4f34..57a108a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -289,7 +289,7 @@ ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
-static rtx
+rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
/* Don't use integer vector broadcast if we can't move from GPR to SSE
@@ -541,14 +541,6 @@ ix86_expand_move (machine_mode mode, rtx operands[])
return;
}
}
- else if (CONST_WIDE_INT_P (op1)
- && GET_MODE_SIZE (mode) >= 16)
- {
- rtx tmp = ix86_convert_const_wide_int_to_broadcast
- (GET_MODE (op0), op1);
- if (tmp != nullptr)
- op1 = tmp;
- }
}
}
@@ -6323,18 +6315,15 @@ ix86_split_long_move (rtx operands[])
}
}
- /* If optimizing for size, attempt to locally unCSE nonzero constants. */
- if (optimize_insn_for_size_p ())
- {
- for (j = 0; j < nparts - 1; j++)
- if (CONST_INT_P (operands[6 + j])
- && operands[6 + j] != const0_rtx
- && REG_P (operands[2 + j]))
- for (i = j; i < nparts - 1; i++)
- if (CONST_INT_P (operands[7 + i])
- && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
- operands[7 + i] = operands[2 + j];
- }
+ /* Attempt to locally unCSE nonzero constants. */
+ for (j = 0; j < nparts - 1; j++)
+ if (CONST_INT_P (operands[6 + j])
+ && operands[6 + j] != const0_rtx
+ && REG_P (operands[2 + j]))
+ for (i = j; i < nparts - 1; i++)
+ if (CONST_INT_P (operands[7 + i])
+ && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
+ operands[7 + i] = operands[2 + j];
for (i = 0; i < nparts; i++)
emit_move_insn (operands[2 + i], operands[6 + i]);
diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h
index 997cb7d..e9e94bf 100644
--- a/gcc/config/i386/i386-expand.h
+++ b/gcc/config/i386/i386-expand.h
@@ -57,5 +57,6 @@ bool ix86_notrack_prefixed_insn_p (rtx_insn *);
machine_mode ix86_split_reduction (machine_mode mode);
void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0,
rtx op1, rtx *quot_p, rtx *rem_p);
+rtx ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op);
#endif /* GCC_I386_EXPAND_H */
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e6fc135..3fcbb81 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -90,6 +90,7 @@ along with GCC; see the file COPYING3. If not see
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
+#include "i386-expand.h"
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
"savms64",
@@ -1853,14 +1854,25 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
{
/* Since there are no instructions to store 128-bit constant,
temporary register usage is required. */
+ bool use_move;
start_sequence ();
- src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
- src = validize_mem (force_const_mem (V1TImode, src));
+ tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
+ if (tmp)
+ {
+ src = lowpart_subreg (V1TImode, tmp, TImode);
+ use_move = true;
+ }
+ else
+ {
+ src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
+ src = validize_mem (force_const_mem (V1TImode, src));
+ use_move = MEM_P (dst);
+ }
rtx_insn *seq = get_insns ();
end_sequence ();
if (seq)
emit_insn_before (seq, insn);
- if (MEM_P (dst))
+ if (use_move)
{
tmp = gen_reg_rtx (V1TImode);
emit_insn_before (gen_rtx_SET (tmp, src), insn);
diff --git a/gcc/testsuite/gcc.target/i386/movti-2.c b/gcc/testsuite/gcc.target/i386/movti-2.c
new file mode 100644
index 0000000..73f69d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/movti-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+__int128 m;
+
+void foo()
+{
+ m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler-times "movabsq" 1 } } */
+/* { dg-final { scan-assembler-not "vmovq" } } */
+/* { dg-final { scan-assembler-not "vpunpcklqdq" } } */
diff --git a/gcc/testsuite/gcc.target/i386/movti-3.c b/gcc/testsuite/gcc.target/i386/movti-3.c
new file mode 100644
index 0000000..535e5dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/movti-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+
+__int128 m;
+
+void bar()
+{
+ m = ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vpextrq" } } */