diff options
author | Alexander Monakov <amonakov@ispras.ru> | 2022-11-01 17:04:25 +0300 |
---|---|---|
committer | Alexander Monakov <amonakov@ispras.ru> | 2022-11-16 16:41:39 +0300 |
commit | dd744f06c9952f92738b0860630085f0f0b99574 (patch) | |
tree | 3e4e2f196315dbaf04732f001ce101d9f64c1203 /gcc | |
parent | 3c54805d03ac1bcc3d8547ffb5e6c4e1f301a7a2 (diff) | |
download | gcc-dd744f06c9952f92738b0860630085f0f0b99574.zip gcc-dd744f06c9952f92738b0860630085f0f0b99574.tar.gz gcc-dd744f06c9952f92738b0860630085f0f0b99574.tar.bz2 |
i386: correct x87&SSE division modeling in znver.md
Correct modeling of division instructions in the SIMD/FP domain for
AMD Zen architectures and avoid combinatorial explosion of automaton
tables by modeling the separate floating-point division unit and
correcting reservations to reflect reciprocal throughput of the
corresponding instructions, similar to earlier commit
5cee5f94000 ("i386: correct integer division modeling in znver.md").
Division is partially pipelined and some instructions have fractional
throughput (e.g. Zen 3 can issue divss and divsd each 3.5 and 4.5
cycles on average, respectively). Considering these CPUs implement
out-of-order execution, the model doesn't need to be exact to the last
cycle, so simplify it by using 4/5 cycles for SF/DF modes, and not
modeling the fact that FP3 pipe is occupied for one cycle.
Top znver table sizes in insn-automata.o:
Before:
428108 r znver1_fp_min_issue_delay
856216 r znver1_fp_transitions
After:
30056 r znver1_fp_min_issue_delay
120224 r znver1_fp_transitions
gcc/ChangeLog:
PR target/87832
* config/i386/znver.md (znver1_fdiv): New automaton.
(znver1-fdiv): New unit.
(znver1_fp_op_div): Correct unit and cycles in the reservation.
(znver1_fp_op_div_load): Ditto.
(znver1_fp_op_idiv_load): Ditto.
(znver2_fp_op_idiv_load): Ditto.
(znver1_ssediv_ss_ps): Ditto.
(znver1_ssediv_ss_ps_load): Ditto.
(znver1_ssediv_sd_pd): Ditto.
(znver1_ssediv_sd_pd_load): Ditto.
(znver1_ssediv_avx256_ps): Ditto.
(znver1_ssediv_avx256_ps_load): Ditto.
(znver1_ssediv_avx256_pd): Ditto.
(znver1_ssediv_avx256_pd_load): Ditto.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/znver.md | 27 |
1 files changed, 14 insertions, 13 deletions
diff --git a/gcc/config/i386/znver.md b/gcc/config/i386/znver.md index 4aa098f..c52f8b5 100644 --- a/gcc/config/i386/znver.md +++ b/gcc/config/i386/znver.md @@ -24,7 +24,7 @@ ;; AMD znver1, znver2 and znver3 Scheduling ;; Modeling automatons for zen decoders, integer execution pipes, ;; SIMD/FP domain, AGU pipes, and dividers. -(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv") +(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv, znver1_fdiv") ;; Decoders unit has 4 decoders and all of them can decode fast path ;; and vector type instructions. @@ -95,6 +95,7 @@ ;; Dividers (define_cpu_unit "znver1-idiv" "znver1_idiv") +(define_cpu_unit "znver1-fdiv" "znver1_fdiv") ;; Call instruction (define_insn_reservation "znver1_call" 1 @@ -591,27 +592,27 @@ (and (eq_attr "cpu" "znver1,znver2,znver3") (and (eq_attr "type" "fdiv") (eq_attr "memory" "none"))) - "znver1-direct,znver1-fp3*15") + "znver1-direct,znver1-fdiv*6") (define_insn_reservation "znver1_fp_op_div_load" 22 (and (eq_attr "cpu" "znver1,znver2,znver3") (and (eq_attr "type" "fdiv") (eq_attr "memory" "load"))) - "znver1-direct,znver1-load,znver1-fp3*15") + "znver1-direct,znver1-load,znver1-fdiv*6") (define_insn_reservation "znver1_fp_op_idiv_load" 27 (and (eq_attr "cpu" "znver1") (and (eq_attr "type" "fdiv") (and (eq_attr "fp_int_src" "true") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-fp3*19") + "znver1-double,znver1-load,znver1-fdiv*6") (define_insn_reservation "znver2_fp_op_idiv_load" 26 (and (eq_attr "cpu" "znver2,znver3") (and (eq_attr "type" "fdiv") (and (eq_attr "fp_int_src" "true") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-fp3*19") + "znver1-double,znver1-load,znver1-fdiv*6") ;; MMX, SSE, SSEn.n, AVX, AVX2 instructions @@ -1088,7 +1089,7 @@ (eq_attr "mode" "V8SF,V4SF,SF"))) (and (eq_attr "type" "ssediv") (eq_attr "memory" "none"))) - "znver1-direct,znver1-fp3*10") + "znver1-direct,znver1-fdiv*4") (define_insn_reservation "znver1_ssediv_ss_ps_load" 17 (and (ior (and (eq_attr "cpu" "znver1") @@ -1099,7 +1100,7 @@ (eq_attr "mode" "V8SF,V4SF,SF"))) (and (eq_attr "type" "ssediv") (eq_attr "memory" "load"))) - "znver1-direct,znver1-load,znver1-fp3*10") + "znver1-direct,znver1-load,znver1-fdiv*4") (define_insn_reservation "znver1_ssediv_sd_pd" 13 (and (ior (and (eq_attr "cpu" "znver1") @@ -1110,7 +1111,7 @@ (eq_attr "mode" "V4DF,V2DF,DF"))) (and (eq_attr "type" "ssediv") (eq_attr "memory" "none"))) - "znver1-direct,znver1-fp3*13") + "znver1-direct,znver1-fdiv*5") (define_insn_reservation "znver1_ssediv_sd_pd_load" 20 (and (ior (and (eq_attr "cpu" "znver1") @@ -1121,35 +1122,35 @@ (eq_attr "mode" "V4DF,V2DF,DF"))) (and (eq_attr "type" "ssediv") (eq_attr "memory" "load"))) - "znver1-direct,znver1-load,znver1-fp3*13") + "znver1-direct,znver1-load,znver1-fdiv*5") (define_insn_reservation "znver1_ssediv_avx256_ps" 12 (and (eq_attr "cpu" "znver1") (and (eq_attr "mode" "V8SF") (and (eq_attr "memory" "none") (eq_attr "type" "ssediv")))) - "znver1-double,znver1-fp3*12") + "znver1-double,znver1-fdiv*8") (define_insn_reservation "znver1_ssediv_avx256_ps_load" 19 (and (eq_attr "cpu" "znver1") (and (eq_attr "mode" "V8SF") (and (eq_attr "type" "ssediv") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-fp3*12") + "znver1-double,znver1-load,znver1-fdiv*8") (define_insn_reservation "znver1_ssediv_avx256_pd" 15 (and (eq_attr "cpu" "znver1") (and (eq_attr "mode" "V4DF") (and (eq_attr "type" "ssediv") (eq_attr "memory" "none")))) - "znver1-double,znver1-fp3*15") + "znver1-double,znver1-fdiv*10") (define_insn_reservation "znver1_ssediv_avx256_pd_load" 22 (and (eq_attr "cpu" "znver1") (and (eq_attr "mode" "V4DF") (and (eq_attr "type" "ssediv") (eq_attr "memory" "load")))) - "znver1-double,znver1-load,znver1-fp3*15") + "znver1-double,znver1-load,znver1-fdiv*10") ;; SSE MUL (define_insn_reservation "znver1_ssemul_ss_ps" 3 (and (ior (and (eq_attr "cpu" "znver1") |