aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2021-08-02 10:01:46 -0700
committerH.J. Lu <hjl.tools@gmail.com>2021-08-02 10:38:19 -0700
commit29f0e955c97da002b5adb4e8c9dfd2ea9709e207 (patch)
tree739aa72e30ea996caad29089e8813e39bd0fabc1
parent7f4c3943f795fda33df648d2196b678bada1ba81 (diff)
downloadgcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.zip
gcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.tar.gz
gcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.tar.bz2
x86: Update piecewise move and store
We can use TImode/OImode/XImode integers for piecewise move and store. 1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of bytes that a single instruction can move quickly between memory and registers or between two memory locations. 2. Define MOVE_MAX to the maximum number of bytes we can move from memory to memory in one reasonably fast instruction. The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX must be a constant, independent of compiler options, since it is used in reload.h to define struct target_reload and MOVE_MAX can vary, depending on compiler options. 3. When vector register is used for piecewise move and store, we don't increase stack_alignment_needed since vector register spill isn't required for piecewise move and store. Since stack_realign_needed is set to true by checking stack_alignment_estimated set by pseudo vector register usage, we also need to check stack_realign_needed to eliminate frame pointer. gcc/ * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also check stack_realign_needed for stack realignment. (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller than the largest integer supported by vector register. * config/i386/i386.h (MAX_MOVE_MAX): New. Set to 64. (MOVE_MAX): Set to bytes of the largest integer supported by vector register. (STORE_MAX_PIECES): New. gcc/testsuite/ * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit. * gcc.target/i386/pr90773-4.c: Also run for 32-bit. * gcc.target/i386/pr90773-15.c: Likewise. * gcc.target/i386/pr90773-16.c: Likewise. * gcc.target/i386/pr90773-17.c: Likewise. * gcc.target/i386/pr90773-24.c: Likewise. * gcc.target/i386/pr90773-25.c: Likewise. * gcc.target/i386/pr100865-1.c: Likewise. * gcc.target/i386/pr100865-2.c: Likewise. * gcc.target/i386/pr100865-3.c: Likewise. * gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect XMM movd to store 4 bytes. * gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect YMM registers. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-10a.c: Expect YMM registers. * gcc.target/i386/pr100865-10b.c: Likewise.
-rw-r--r--gcc/config/i386/i386.c21
-rw-r--r--gcc/config/i386/i386.h53
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-1.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-10a.c4
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-10b.c4
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-2.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-3.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-4a.c6
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-4b.c8
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-1.c10
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-14.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-15.c6
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-16.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-17.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-24.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-25.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90773-4.c2
17 files changed, 79 insertions, 51 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5d20ca2..842eb0e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void)
assumed stack realignment might be needed or -fno-omit-frame-pointer
is used, but in the end nothing that needed the stack alignment had
been spilled nor stack access, clear frame_pointer_needed and say we
- don't need stack realignment. */
- if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+ don't need stack realignment.
+
+ When vector register is used for piecewise move and store, we don't
+ increase stack_alignment_needed as there is no register spill for
+ piecewise move and store. Since stack_realign_needed is set to true
+ by checking stack_alignment_estimated which is updated by pseudo
+ vector register usage, we also need to check stack_realign_needed to
+ eliminate frame pointer. */
+ if ((stack_realign
+ || (!flag_omit_frame_pointer && optimize)
+ || crtl->stack_realign_needed)
&& frame_pointer_needed
&& crtl->is_leaf
&& crtl->sp_is_unchanging
@@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
/* FALLTHRU */
case E_OImode:
case E_XImode:
- if (!standard_sse_constant_p (x, mode))
+ if (!standard_sse_constant_p (x, mode)
+ && GET_MODE_SIZE (TARGET_AVX512F
+ ? XImode
+ : (TARGET_AVX
+ ? OImode
+ : (TARGET_SSE2
+ ? TImode : DImode))) < GET_MODE_SIZE (mode))
return false;
default:
break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d1e1c22..bed9cd9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1757,24 +1757,41 @@ typedef struct ix86_args {
/* Define this as 1 if `char' should by default be signed; else as 0. */
#define DEFAULT_SIGNED_CHAR 1
-/* Max number of bytes we can move from memory to memory
- in one reasonably fast instruction. */
-#define MOVE_MAX 16
-
-/* MOVE_MAX_PIECES is the number of bytes at a time which we can
- move efficiently, as opposed to MOVE_MAX which is the maximum
- number of bytes we can move with a single instruction.
-
- ??? We should use TImode in 32-bit mode and use OImode or XImode
- if they are available. But since by_pieces_ninsns determines the
- widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
- 64-bit mode. */
-#define MOVE_MAX_PIECES \
- ((TARGET_64BIT \
- && TARGET_SSE2 \
- && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
- && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
- ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+/* The constant maximum number of bytes that a single instruction can
+ move quickly between memory and registers or between two memory
+ locations. */
+#define MAX_MOVE_MAX 64
+
+/* Max number of bytes we can move from memory to memory in one
+ reasonably fast instruction, as opposed to MOVE_MAX_PIECES which
+ is the number of bytes at a time which we can move efficiently.
+ MOVE_MAX_PIECES defaults to MOVE_MAX. */
+
+#define MOVE_MAX \
+ ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD)))
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+ store efficiently. */
+#define STORE_MAX_PIECES \
+ ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD)))
/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c
index 6c3097f..949dd5c 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=x86-64" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
index 7ffc19e..98b6dfb 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
@@ -29,5 +29,5 @@ foo (void)
array[i] = MK_CONST128_BROADCAST (0x1f);
}
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
index edf5276..e5616d8 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
@@ -3,5 +3,5 @@
#include "pr100865-10a.c"
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c
index 17efe2d..f3ea775 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index 007e79f..714c43e 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
index f558835..3654873 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake" } */
extern char array[64];
@@ -11,6 +11,6 @@ foo (void)
array[i] = -45;
}
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index 1e50dc8..8e8a7ea 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -1,9 +1,9 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */
#include "pr100865-4a.c"
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
-/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
+/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
index 1d9f282..4fd5a40 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-options "-O2 -msse2 -mtune=generic" } */
extern char *dst, *src;
@@ -9,9 +9,5 @@ foo (void)
__builtin_memcpy (dst, src, 15);
}
-/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
index e5c19f4..96ee5cb 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
index 185ea60..403cdb2 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */
extern char *dst;
@@ -9,6 +9,6 @@ foo (int c)
__builtin_memset (dst, c, 17);
}
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
-/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
index d820cc3..bb0aadb 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
index f6f179e..73d5d5ab 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
index 7b2ea66..71f1fd8 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-24.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=x86-64" } */
struct S
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
index 57642ea..ad19a88 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-25.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -march=x86-64" } */
struct S
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
index ec0bc01..ee4c046 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
extern char *dst;