diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2021-08-02 10:01:46 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2021-08-02 10:38:19 -0700 |
commit | 29f0e955c97da002b5adb4e8c9dfd2ea9709e207 (patch) | |
tree | 739aa72e30ea996caad29089e8813e39bd0fabc1 | |
parent | 7f4c3943f795fda33df648d2196b678bada1ba81 (diff) | |
download | gcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.zip gcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.tar.gz gcc-29f0e955c97da002b5adb4e8c9dfd2ea9709e207.tar.bz2 |
x86: Update piecewise move and store
We can use TImode/OImode/XImode integers for piecewise move and store.
1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of
bytes that a single instruction can move quickly between memory and
registers or between two memory locations.
2. Define MOVE_MAX as the maximum number of bytes we can move from memory
to memory in one reasonably fast instruction. The difference between
MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX must be a constant,
independent of compiler options, since it is used in reload.h to define
struct target_reload, whereas MOVE_MAX can vary depending on compiler options.
3. When a vector register is used for piecewise move and store, we don't
increase stack_alignment_needed since a vector register spill isn't
required for piecewise move and store. Since stack_realign_needed is
set to true by checking stack_alignment_estimated, which is updated by
pseudo vector register usage, we also need to check stack_realign_needed
to eliminate the frame pointer.
gcc/
* config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
check stack_realign_needed for stack realignment.
(ix86_legitimate_constant_p): Always allow CONST_WIDE_INT no larger
than the largest integer supported by vector register.
* config/i386/i386.h (MAX_MOVE_MAX): New. Set to 64.
(MOVE_MAX): Set to bytes of the largest integer supported by
vector register.
(STORE_MAX_PIECES): New.
gcc/testsuite/
* gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
* gcc.target/i386/pr90773-4.c: Also run for 32-bit.
* gcc.target/i386/pr90773-15.c: Likewise.
* gcc.target/i386/pr90773-16.c: Likewise.
* gcc.target/i386/pr90773-17.c: Likewise.
* gcc.target/i386/pr90773-24.c: Likewise.
* gcc.target/i386/pr90773-25.c: Likewise.
* gcc.target/i386/pr100865-1.c: Likewise.
* gcc.target/i386/pr100865-2.c: Likewise.
* gcc.target/i386/pr100865-3.c: Likewise.
* gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect
XMM movd to store 4 bytes.
* gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect
YMM registers.
* gcc.target/i386/pr100865-4b.c: Likewise.
* gcc.target/i386/pr100865-10a.c: Expect YMM registers.
* gcc.target/i386/pr100865-10b.c: Likewise.
-rw-r--r-- | gcc/config/i386/i386.c | 21 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 53 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-1.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-10a.c | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-10b.c | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-2.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-4a.c | 6 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr100865-4b.c | 8 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-1.c | 10 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-14.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-15.c | 6 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-16.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-17.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-24.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-25.c | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr90773-4.c | 2 |
17 files changed, 79 insertions, 51 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5d20ca2..842eb0e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void) assumed stack realignment might be needed or -fno-omit-frame-pointer is used, but in the end nothing that needed the stack alignment had been spilled nor stack access, clear frame_pointer_needed and say we - don't need stack realignment. */ - if ((stack_realign || (!flag_omit_frame_pointer && optimize)) + don't need stack realignment. + + When vector register is used for piecewise move and store, we don't + increase stack_alignment_needed as there is no register spill for + piecewise move and store. Since stack_realign_needed is set to true + by checking stack_alignment_estimated which is updated by pseudo + vector register usage, we also need to check stack_realign_needed to + eliminate frame pointer. */ + if ((stack_realign + || (!flag_omit_frame_pointer && optimize) + || crtl->stack_realign_needed) && frame_pointer_needed && crtl->is_leaf && crtl->sp_is_unchanging @@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) /* FALLTHRU */ case E_OImode: case E_XImode: - if (!standard_sse_constant_p (x, mode)) + if (!standard_sse_constant_p (x, mode) + && GET_MODE_SIZE (TARGET_AVX512F + ? XImode + : (TARGET_AVX + ? OImode + : (TARGET_SSE2 + ? TImode : DImode))) < GET_MODE_SIZE (mode)) return false; default: break; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index d1e1c22..bed9cd9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1757,24 +1757,41 @@ typedef struct ix86_args { /* Define this as 1 if `char' should by default be signed; else as 0. */ #define DEFAULT_SIGNED_CHAR 1 -/* Max number of bytes we can move from memory to memory - in one reasonably fast instruction. 
*/ -#define MOVE_MAX 16 - -/* MOVE_MAX_PIECES is the number of bytes at a time which we can - move efficiently, as opposed to MOVE_MAX which is the maximum - number of bytes we can move with a single instruction. - - ??? We should use TImode in 32-bit mode and use OImode or XImode - if they are available. But since by_pieces_ninsns determines the - widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in - 64-bit mode. */ -#define MOVE_MAX_PIECES \ - ((TARGET_64BIT \ - && TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD) +/* The constant maximum number of bytes that a single instruction can + move quickly between memory and registers or between two memory + locations. */ +#define MAX_MOVE_MAX 64 + +/* Max number of bytes we can move from memory to memory in one + reasonably fast instruction, as opposed to MOVE_MAX_PIECES which + is the number of bytes at a time which we can move efficiently. + MOVE_MAX_PIECES defaults to MOVE_MAX. */ + +#define MOVE_MAX \ + ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) + +/* STORE_MAX_PIECES is the number of bytes at a time that we can + store efficiently. */ +#define STORE_MAX_PIECES \ + ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. 
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c index 6c3097f..949dd5c 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-1.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=x86-64" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c index 7ffc19e..98b6dfb 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c @@ -29,5 +29,5 @@ foo (void) array[i] = MK_CONST128_BROADCAST (0x1f); } -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c index edf5276..e5616d8 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c @@ -3,5 +3,5 @@ #include "pr100865-10a.c" -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c index 17efe2d..f3ea775 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-2.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c index 007e79f..714c43e 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake-avx512" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c index f558835..3654873 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake" } */ extern char array[64]; @@ -11,6 +11,6 @@ foo (void) array[i] = -45; } -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index 1e50dc8..8e8a7ea 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -1,9 +1,9 @@ -/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake-avx512" } */ #include "pr100865-4a.c" -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */ -/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */ +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c index 1d9f282..4fd5a40 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-1.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -mtune=generic" } */ +/* { dg-options "-O2 -msse2 -mtune=generic" } */ extern char *dst, *src; @@ -9,9 +9,5 @@ foo (void) __builtin_memcpy (dst, src, 15); } -/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c index e5c19f4..96ee5cb 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-14.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c index 185ea60..403cdb2 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-15.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake-avx512" } */ extern char *dst; @@ -9,6 +9,6 @@ foo (int c) __builtin_memset (dst, c, 17); } -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ -/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */ +/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c index d820cc3..bb0aadb 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-16.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake-avx512" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c index f6f179e..73d5d5ab 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-17.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=skylake-avx512" } */ extern char *dst; diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c index 7b2ea66..71f1fd8 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-24.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=x86-64" } */ struct S diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c index 57642ea..ad19a88 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-25.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -march=x86-64" } */ struct S diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c index ec0bc01..ee4c046 100644 --- a/gcc/testsuite/gcc.target/i386/pr90773-4.c +++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-do compile } */ /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ extern char *dst; |