diff options
author | wwwhhhyyy <hongyu.wang@intel.com> | 2021-08-30 16:41:41 +0800 |
---|---|---|
committer | Hongyu Wang <hongyu.wang@intel.com> | 2022-01-16 12:43:02 +0800 |
commit | 1c2575586c47f56a2e75f734af42371579516f0c (patch) | |
tree | 00cfab3267531feb8987b3cec759731030662d9d /gcc/config | |
parent | 9248ee41478754b46b70f2409b85d9743ece9e72 (diff) | |
download | gcc-1c2575586c47f56a2e75f734af42371579516f0c.zip gcc-1c2575586c47f56a2e75f734af42371579516f0c.tar.gz gcc-1c2575586c47f56a2e75f734af42371579516f0c.tar.bz2 |
[i386] GLC tuning: Break false dependency for dest register.
For the GoldenCove micro-architecture, force-insert a zero idiom in the asm
template to break the false dependency on the dest register for several insns.
The related insns are:
VPERMD/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH
gcc/ChangeLog:
* config/i386/i386.h (TARGET_DEST_FALSE_DEP_FOR_GLC): New macro.
* config/i386/sse.md (<avx512>_<complexopname>_<mode><maskc_name><round_name>):
Insert zero-idiom in output template when attr enabled, set new attribute to
true for non-mask/maskz insn.
(avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>):
Likewise.
(avx512dq_mul<mode>3<mask_name>): Likewise.
(<avx2_avx512>_permvar<mode><mask_name>): Likewise.
(avx2_perm<mode>_1<mask_name>): Likewise.
(avx512f_perm<mode>_1<mask_name>): Likewise.
(avx512dq_rangep<mode><mask_name><round_saeonly_name>): Likewise.
(avx512dq_ranges<mode><mask_scalar_name><round_saeonly_scalar_name>):
Likewise.
(<avx512>_getmant<mode><mask_name><round_saeonly_name>): Likewise.
(avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>):
Likewise.
* config/i386/subst.md (mask3_dest_false_dep_for_glc_cond): New
subst_attr.
(mask4_dest_false_dep_for_glc_cond): Likewise.
(mask6_dest_false_dep_for_glc_cond): Likewise.
(mask10_dest_false_dep_for_glc_cond): Likewise.
(maskc_dest_false_dep_for_glc_cond): Likewise.
(mask_scalar4_dest_false_dep_for_glc_cond): Likewise.
(mask_scalarc_dest_false_dep_for_glc_cond): Likewise.
* config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEP_FOR_GLC): New
DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-dest-false-dep-for-glc.c: New test.
* gcc.target/i386/avx512dq-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512f-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512vl-dest-false-dep-for-glc.c: Ditto.
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/i386/i386.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 75 | ||||
-rw-r--r-- | gcc/config/i386/subst.md | 7 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 6 |
4 files changed, 82 insertions, 8 deletions
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 3ac0f69..f1bb8a8 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_EXPAND_ABS] #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \ ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD] +#define TARGET_DEST_FALSE_DEP_FOR_GLC \ + ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 50dc5da..ea72aa5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6536,7 +6536,12 @@ (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")] UNSPEC_COMPLEX_F_C_MUL))] "TARGET_AVX512FP16 && <round_mode512bit_condition>" - "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <maskc_dest_false_dep_for_glc_cond>) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "<MODE>")]) @@ -6742,7 +6747,12 @@ (match_dup 1) (const_int 3)))] "TARGET_AVX512FP16" - "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask_scalarc_dest_false_dep_for_glc_cond>) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "V8HF")]) @@ -15207,7 +15217,14 @@ (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))] 
"TARGET_AVX512DQ && <mask_mode512bit_condition> && ix86_binary_operator_ok (MULT, <MODE>mode, operands)" - "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask3_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"; +} [(set_attr "type" "sseimul") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -24636,7 +24653,14 @@ (match_operand:<sseintvecmode> 2 "register_operand" "v")] UNSPEC_VPERMVAR))] "TARGET_AVX2 && <mask_mode512bit_condition>" - "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask3_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "<mask_prefix2>") (set_attr "mode" "<sseinsnmode>")]) @@ -24873,6 +24897,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask6_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}"; } [(set_attr "type" "sselog") @@ -24944,6 +24972,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask10_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return 
"vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}"; } [(set_attr "type" "sselog") @@ -26843,7 +26875,14 @@ (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_RANGE))] "TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>" - "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask4_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -26859,7 +26898,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512DQ" - "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask_scalar4_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -26899,7 +26945,13 @@ (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" - "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask3_dest_false_dep_for_glc_cond> + && MEM_P (operands[1])) + output_asm_insn ("vxorps\t{%x0, 
%x0, %x0}", operands); + return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -26914,7 +26966,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && <mask_scalar4_dest_false_dep_for_glc_cond> + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 21d445c..bb86f82 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -71,6 +71,11 @@ (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex") (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex") (define_subst_attr "mask_expand_op3" "mask" "3" "5") +(define_subst_attr "mask3_dest_false_dep_for_glc_cond" "mask" "1" "operands[3] == CONST0_RTX(<MODE>mode)") +(define_subst_attr "mask4_dest_false_dep_for_glc_cond" "mask" "1" "operands[4] == CONST0_RTX(<MODE>mode)") +(define_subst_attr "mask6_dest_false_dep_for_glc_cond" "mask" "1" "operands[6] == CONST0_RTX(<MODE>mode)") +(define_subst_attr "mask10_dest_false_dep_for_glc_cond" "mask" "1" "operands[10] == CONST0_RTX(<MODE>mode)") +(define_subst_attr "maskc_dest_false_dep_for_glc_cond" "maskc" "1" "operands[3] == CONST0_RTX(<MODE>mode)") (define_subst "mask" [(set (match_operand:SUBST_V 0) @@ -337,6 +342,8 @@ (define_subst_attr 
"mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4") (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") +(define_subst_attr "mask_scalar4_dest_false_dep_for_glc_cond" "mask_scalar" "1" "operands[4] == CONST0_RTX(<MODE>mode)") +(define_subst_attr "mask_scalarc_dest_false_dep_for_glc_cond" "mask_scalarc" "1" "operands[3] == CONST0_RTX(V8HFmode)") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 0d3fd07..f9eb3c2 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -79,6 +79,12 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) +/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before + several insns to break false dependency on the dest register for GLC + micro-architecture. */ +DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC, + "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE) + /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies are resolved on SSE register parts instead of whole registers, so we may maintain just lower part of scalar values in proper format leaving the |