aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2024-08-12 18:24:34 +0800
committerliuhongt <hongtao.liu@intel.com>2024-08-15 19:15:22 +0800
commitf7e672da8fc3d416a6d07eb01f3be4400ef94fac (patch)
tree4fd6643de7a6bf2f30ffeda3cb0708162dd8867c
parentcc2d29e5f4434a3fd4e0dd93ea4f9857a0309201 (diff)
downloadgcc-f7e672da8fc3d416a6d07eb01f3be4400ef94fac.zip
gcc-f7e672da8fc3d416a6d07eb01f3be4400ef94fac.tar.gz
gcc-f7e672da8fc3d416a6d07eb01f3be4400ef94fac.tar.bz2
Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.
It results in 2 failures for x86_64-pc-linux-gnu{\ -march=cascadelake}; gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1 For pr113560.c, now GCC generates mulx instead of mulq with -march=cascadelake, which should be optimal, so adjust testcase for that. For gcc.target/i386/extendditi2-1.c, RA happens to choose another register instead of rax and result in movq %rdi, %rbp movq %rdi, %rax sarq $63, %rbp movq %rbp, %rdx The patch adds a new define_peephole2 for that. gcc/ChangeLog: PR target/116274 * config/i386/i386-expand.cc (ix86_expand_vector_move): Restrict special case TImode to 128-bit vector conversions via V2DI under ix86_pre_reload_split (). * config/i386/i386.cc (inline_secondary_memory_needed): Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload. * config/i386/i386.md (*extendsidi2_rex64): Add a define_peephole2 after it. gcc/testsuite/ChangeLog: * gcc.target/i386/pr116274.c: New test. * gcc.target/i386/pr113560.c: Scan either mulq or mulx.
-rw-r--r--gcc/config/i386/i386-expand.cc2
-rw-r--r--gcc/config/i386/i386.cc18
-rw-r--r--gcc/config/i386/i386.md19
-rw-r--r--gcc/testsuite/gcc.target/i386/pr113560.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr116274.c12
5 files changed, 45 insertions, 8 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index bdbc142..ed546ee 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
&& SUBREG_P (op1)
&& GET_MODE (SUBREG_REG (op1)) == TImode
&& TARGET_64BIT && TARGET_SSE
- && can_create_pseudo_p ())
+ && ix86_pre_reload_split ())
{
rtx tmp = gen_reg_rtx (V2DImode);
rtx lo = gen_reg_rtx (DImode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0721e38..d06e214 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
return true;
+ /* If the target says that inter-unit moves are more expensive
+ than moving through memory, then don't generate them. */
+ if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+ || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+ return true;
+
+ /* With SSE4.1, *mov{ti,di}_internal supports moves between
+ SSE_REGS and GENERAL_REGS using pinsr{q,d} or pextr{q,d}. */
+ if (TARGET_SSE4_1
+ && (TARGET_64BIT ? mode == TImode : mode == DImode))
+ return false;
+
int msize = GET_MODE_SIZE (mode);
/* Between SSE and general, we have moves no larger than word size. */
@@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
if (msize < minsize)
return true;
-
- /* If the target says that inter-unit moves are more expensive
- than moving through memory, then don't generate them. */
- if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
- || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
- return true;
}
return false;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d3ba242..efbab2f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5041,6 +5041,25 @@
DONE;
})
+(define_peephole2
+ [(set (match_operand:DI 0 "general_reg_operand")
+ (match_operand:DI 1 "general_reg_operand"))
+ (parallel [(set (match_dup 0)
+ (ashiftrt:DI (match_dup 0)
+ (const_int 63)))
+ (clobber (reg:CC FLAGS_REG))])
+ (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
+ (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
+ "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+ && REGNO (operands[2]) == AX_REG
+ && REGNO (operands[3]) == DX_REG
+ && peep2_reg_dead_p (4, operands[0])
+ && !reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[2], operands[0])"
+ [(set (match_dup 2) (match_dup 1))
+ (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
+ (clobber (reg:CC FLAGS_REG))])])
+
(define_insn "extend<mode>di2"
[(set (match_operand:DI 0 "register_operand" "=r")
(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
index ac2e01a..9431a2d 100644
--- a/gcc/testsuite/gcc.target/i386/pr113560.c
+++ b/gcc/testsuite/gcc.target/i386/pr113560.c
@@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y)
return (x & 1000) * (y & 1000);
}
-/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
+/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */
/* { dg-final { scan-assembler-times "\timulq" 1 } } */
/* { dg-final { scan-assembler-not "addq" } } */
/* { dg-final { scan-assembler-not "xorl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c
new file mode 100644
index 0000000..51ac3e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116274.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */
+
+struct aq { long x,y; };
+long testq(struct aq a) { return a.x+a.y; }
+
+struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; };
+short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; }
+
+struct ad { int x,y,z,w; };
+int testd(struct ad a) { return a.x+a.y+a.z+a.w; }
+/* { dg-final { scan-assembler-not "%rsp" } } */