aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author: Richard Biener <rguenther@suse.de> 2024-11-08 11:17:22 +0100
committer: Richard Biener <rguenth@gcc.gnu.org> 2024-11-12 09:54:25 +0100
commit: 9a62c1495891032922af5bf9bd1906999cf63605 (patch)
tree: b58f6178322bb75070742cdfc09c1b5414cfc586 /gcc
parent: 82d955b0a8acfdf3e63e82135077806c19e622e6 (diff)
download: gcc-9a62c1495891032922af5bf9bd1906999cf63605.zip
gcc-9a62c1495891032922af5bf9bd1906999cf63605.tar.gz
gcc-9a62c1495891032922af5bf9bd1906999cf63605.tar.bz2
Add X86_TUNE_AVX512_TWO_EPILOGUES, enable for Zen4 and Zen5
The following adds the X86_TUNE_AVX512_TWO_EPILOGUES tuning and, when it is set, directs the vectorizer to produce both an AVX2 and an SSE vector epilogue for AVX512 vectorized loops. The tuning is enabled by default for Zen4 and Zen5, where I benchmarked it to be overall positive on SPEC CPU 2017 in both performance and overall code size. In particular it speeds up 525.x264_r, which with only an AVX2 epilogue currently ends up in unvectorized code.

* config/i386/i386.cc (ix86_vector_costs::finish_cost): Set m_suggested_epilogue_mode according to X86_TUNE_AVX512_TWO_EPILOGUES.
* config/i386/x86-tune.def (X86_TUNE_AVX512_TWO_EPILOGUES): Add. Enable for znver4 and znver5.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/config/i386/i386.cc 12
-rw-r--r-- gcc/config/i386/x86-tune.def 5
2 files changed, 17 insertions, 0 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6ac3a5d..526c9df 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25353,6 +25353,18 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& TARGET_AVX256_AVOID_VEC_PERM)
m_costs[i] = INT_MAX;
+ /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
+ an AVX2 and an SSE epilogue for AVX512 vectorized loops. */
+ if (loop_vinfo
+ && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
+ {
+ if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
+ m_suggested_epilogue_mode = V32QImode;
+ else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
+ m_suggested_epilogue_mode = V16QImode;
+ }
+
vector_costs::finish_cost (scalar_costs);
}
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 6ebb2fd..81dd895 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -597,6 +597,11 @@ DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)
+/* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit
+ vectorized loops. */
+DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
+ m_ZNVER4 | m_ZNVER5)
+
/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that helps a specific old CPU designs */