diff options
author | Lili Cui <lili.cui@intel.com> | 2023-05-30 05:47:47 +0000 |
---|---|---|
committer | Cui, Lili <lili.cui@intel.com> | 2023-05-30 06:01:33 +0000 |
commit | e5405f065bace0685cb3b8878d1dfc7a6e7ef409 (patch) | |
tree | 012e458cf5061be51378fed1b6ea2388e4ce2814 /gcc/ada/init.c | |
parent | e6a9a30e6cd13ac9b3562316efcb0e807ea715e4 (diff) | |
download | gcc-e5405f065bace0685cb3b8878d1dfc7a6e7ef409.zip gcc-e5405f065bace0685cb3b8878d1dfc7a6e7ef409.tar.gz gcc-e5405f065bace0685cb3b8878d1dfc7a6e7ef409.tar.bz2 |
Handle FMA friendly in reassoc pass
Make some changes in reassoc pass to make it more friendly to fma pass later.
Using FMA instead of mult + add reduces register pressure and the number of
instructions retired.
There are two main changes:
1. Put non-mult ops and mult ops alternately at the end of the queue, which
helps generate more FMAs and reduces the loss of FMA opportunities when
breaking the chain.
2. Rewrite the rewrite_expr_tree_parallel function to try to build parallel
chains according to the given reassociation width, keeping as many FMA
opportunities as possible.
With the patch applied:
On ICX:
507.cactuBSSN_r: Improved by 1.7% for multi-copy.
503.bwaves_r: Improved by 0.60% for single-copy.
507.cactuBSSN_r: Improved by 1.10% for single-copy.
519.lbm_r: Improved by 2.21% for single-copy.
No measurable changes for other benchmarks.
On aarch64
507.cactuBSSN_r: Improved by 1.7% for multi-copy.
503.bwaves_r : Improved by 6.00% for single-copy.
no measurable changes for other benchmarks.
TEST1:
float
foo (float a, float b, float c, float d, float *e)
{
return *e + a * b + c * d ;
}
For "-Ofast -mfpmath=sse -mfma" GCC generates:
vmulss %xmm3, %xmm2, %xmm2
vfmadd132ss %xmm1, %xmm2, %xmm0
vaddss (%rdi), %xmm0, %xmm0
ret
With this patch GCC generates:
vfmadd213ss (%rdi), %xmm1, %xmm0
vfmadd231ss %xmm2, %xmm3, %xmm0
ret
TEST2:
for (int i = 0; i < N; i++)
{
a[i] += b[i]* c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * l[i] + m[i]* o[i] + p[i];
}
For "-Ofast -mfpmath=sse -mfma" GCC generates:
vmovapd e(%rax), %ymm4
vmulpd d(%rax), %ymm4, %ymm3
addq $32, %rax
vmovapd c-32(%rax), %ymm5
vmovapd j-32(%rax), %ymm6
vmulpd h-32(%rax), %ymm6, %ymm2
vmovapd a-32(%rax), %ymm6
vaddpd p-32(%rax), %ymm6, %ymm0
vmovapd g-32(%rax), %ymm7
vfmadd231pd b-32(%rax), %ymm5, %ymm3
vmovapd o-32(%rax), %ymm4
vmulpd m-32(%rax), %ymm4, %ymm1
vmovapd l-32(%rax), %ymm5
vfmadd231pd f-32(%rax), %ymm7, %ymm2
vfmadd231pd k-32(%rax), %ymm5, %ymm1
vaddpd %ymm3, %ymm0, %ymm0
vaddpd %ymm2, %ymm0, %ymm0
vaddpd %ymm1, %ymm0, %ymm0
vmovapd %ymm0, a-32(%rax)
cmpq $8192, %rax
jne .L4
vzeroupper
ret
With this patch applied, GCC breaks the chain with width = 2 and generates 6 FMAs:
vmovapd a(%rax), %ymm2
vmovapd c(%rax), %ymm0
addq $32, %rax
vmovapd e-32(%rax), %ymm1
vmovapd p-32(%rax), %ymm5
vmovapd g-32(%rax), %ymm3
vmovapd j-32(%rax), %ymm6
vmovapd l-32(%rax), %ymm4
vmovapd o-32(%rax), %ymm7
vfmadd132pd b-32(%rax), %ymm2, %ymm0
vfmadd132pd d-32(%rax), %ymm5, %ymm1
vfmadd231pd f-32(%rax), %ymm3, %ymm0
vfmadd231pd h-32(%rax), %ymm6, %ymm1
vfmadd231pd k-32(%rax), %ymm4, %ymm0
vfmadd231pd m-32(%rax), %ymm7, %ymm1
vaddpd %ymm1, %ymm0, %ymm0
vmovapd %ymm0, a-32(%rax)
cmpq $8192, %rax
jne .L2
vzeroupper
ret
gcc/ChangeLog:
PR tree-optimization/98350
* tree-ssa-reassoc.cc
(rewrite_expr_tree_parallel): Rewrite this function.
(rank_ops_for_fma): New.
(reassociate_bb): Handle new function.
gcc/testsuite/ChangeLog:
PR tree-optimization/98350
* gcc.dg/pr98350-1.c: New test.
* gcc.dg/pr98350-2.c: Ditto.
Diffstat (limited to 'gcc/ada/init.c')
0 files changed, 0 insertions, 0 deletions