aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorJan Hubicka <jh@suse.cz>2023-12-29 23:51:03 +0100
committerJan Hubicka <jh@suse.cz>2023-12-29 23:51:03 +0100
commit467cc398e637c8c48bdaeca7caf37bdebe2a9eb3 (patch)
treeaf27d7456a367b1e0b6bb79c9b95d35d8434540c /gcc
parent984bdeaa39b6417b11736b2c167ef82119e272dc (diff)
downloadgcc-467cc398e637c8c48bdaeca7caf37bdebe2a9eb3.zip
gcc-467cc398e637c8c48bdaeca7caf37bdebe2a9eb3.tar.gz
gcc-467cc398e637c8c48bdaeca7caf37bdebe2a9eb3.tar.bz2
Disable FMADD in chains for Zen4 and generic
this patch disables use of FMA in matrix multiplication loop for generic (for x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. For Intel this is neutral both on the matrix multiplication microbenchmark (attached) and spec2k17 where the difference was within noise for Core. On core the micro-benchmark runs as follows: With FMA: 578,500,241 cycles:u # 3.645 GHz ( +- 0.12% ) 753,318,477 instructions:u # 1.30 insn per cycle ( +- 0.00% ) 125,417,701 branches:u # 790.227 M/sec ( +- 0.00% ) 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) No FMA: 577,573,960 cycles:u # 3.514 GHz ( +- 0.15% ) 878,318,479 instructions:u # 1.52 insn per cycle ( +- 0.00% ) 125,417,702 branches:u # 763.035 M/sec ( +- 0.00% ) 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) So the cycle count is unchanged and discrete multiply+add takes same time as FMA. While on zen: With FMA: 484875179 cycles:u # 3.599 GHz ( +- 0.05% ) (82.11%) 752031517 instructions:u # 1.55 insn per cycle 125106525 branches:u # 928.712 M/sec ( +- 0.03% ) (85.09%) 128356 branch-misses:u # 0.10% of all branches ( +- 0.06% ) (83.58%) No FMA: 375875209 cycles:u # 3.592 GHz ( +- 0.08% ) (80.74%) 875725341 instructions:u # 2.33 insn per cycle 124903825 branches:u # 1.194 G/sec ( +- 0.04% ) (84.59%) 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) The diffrerence is that Cores understand the fact that fmadd does not need all three parameters to start computation, while Zen cores doesn't. Since this seems noticeable win on zen and not loss on Core it seems like good default for generic. float a[SIZE][SIZE]; float b[SIZE][SIZE]; float c[SIZE][SIZE]; void init(void) { int i, j, k; for(i=0; i<SIZE; ++i) { for(j=0; j<SIZE; ++j) { a[i][j] = (float)i + j; b[i][j] = (float)i - j; c[i][j] = 0.0f; } } } void mult(void) { int i, j, k; for(i=0; i<SIZE; ++i) { for(j=0; j<SIZE; ++j) { for(k=0; k<SIZE; ++k) { c[i][j] += a[i][k] * b[k][j]; } } } } int main(void) { clock_t s, e; init(); s=clock(); mult(); e=clock(); printf(" mult took %10d clocks\n", (int)(e-s)); return 0; } gcc/ChangeLog: * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS, X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and Core.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/x86-tune.def8
1 files changed, 4 insertions, 4 deletions
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 43fa9e8..74b03cb 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,13 +515,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3
- | m_YONGFENG)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+ | m_YONGFENG | m_GENERIC)
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+ | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain. */