diff options
author | Tamar Christina <tamar.christina@arm.com> | 2023-11-21 13:19:36 +0000 |
---|---|---|
committer | Tamar Christina <tamar.christina@arm.com> | 2023-11-21 13:19:36 +0000 |
commit | 4b6da8e7bdb93d9bca6291157db1c936ac56e7af (patch) | |
tree | 2f54d2ae09b85837b3efb595a35961d19b920f23 /gcc/config/aarch64/tuning_models/neoversev1.h | |
parent | f26f92b534f9d68371322071f309ef3e0e95f38c (diff) | |
download | gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.zip gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.gz gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.bz2 |
AArch64: Refactor costs models to different files.
This patch series attempts to move the generic cost model in AArch64 to a new
and modern generic standard. The current standard is quite old and generates
very suboptimal code out of the box for user of GCC.
The goal is for the new cost model to be beneficial on newer/current Arm
Microarchitectures while not being too negative for older ones.
It does not change any core specific optimization. The final changes reflect
both performance optimizations and size optimizations.
This first patch just re-organizes the cost structures to their own files.
The AArch64.cc file has gotten very big and it's hard to follow.
No functional changes are expected from this change. Note that since all the
structures have private visibility I've put them in header files instead.
gcc/ChangeLog:
PR target/111370
* config/aarch64/aarch64.cc (generic_addrcost_table,
exynosm1_addrcost_table,
xgene1_addrcost_table,
thunderx2t99_addrcost_table,
thunderx3t110_addrcost_table,
tsv110_addrcost_table,
qdf24xx_addrcost_table,
a64fx_addrcost_table,
neoversev1_addrcost_table,
neoversen2_addrcost_table,
neoversev2_addrcost_table,
generic_regmove_cost,
cortexa57_regmove_cost,
cortexa53_regmove_cost,
exynosm1_regmove_cost,
thunderx_regmove_cost,
xgene1_regmove_cost,
qdf24xx_regmove_cost,
thunderx2t99_regmove_cost,
thunderx3t110_regmove_cost,
tsv110_regmove_cost,
a64fx_regmove_cost,
neoversen2_regmove_cost,
neoversev1_regmove_cost,
neoversev2_regmove_cost,
generic_vector_cost,
a64fx_vector_cost,
qdf24xx_vector_cost,
thunderx_vector_cost,
tsv110_vector_cost,
cortexa57_vector_cost,
exynosm1_vector_cost,
xgene1_vector_cost,
thunderx2t99_vector_cost,
thunderx3t110_vector_cost,
ampere1_vector_cost,
generic_branch_cost,
generic_tunings,
cortexa35_tunings,
cortexa53_tunings,
cortexa57_tunings,
cortexa72_tunings,
cortexa73_tunings,
exynosm1_tunings,
thunderxt88_tunings,
thunderx_tunings,
tsv110_tunings,
xgene1_tunings,
emag_tunings,
qdf24xx_tunings,
saphira_tunings,
thunderx2t99_tunings,
thunderx3t110_tunings,
neoversen1_tunings,
ampere1_tunings,
ampere1a_tunings,
neoversev1_vector_cost,
neoversev1_tunings,
neoverse512tvb_vector_cost,
neoverse512tvb_tunings,
neoversen2_vector_cost,
neoversen2_tunings,
neoversev2_vector_cost,
neoversev2_tunings
a64fx_tunings): Split into own files.
* config/aarch64/tuning_models/a64fx.h: New file.
* config/aarch64/tuning_models/ampere1.h: New file.
* config/aarch64/tuning_models/ampere1a.h: New file.
* config/aarch64/tuning_models/cortexa35.h: New file.
* config/aarch64/tuning_models/cortexa53.h: New file.
* config/aarch64/tuning_models/cortexa57.h: New file.
* config/aarch64/tuning_models/cortexa72.h: New file.
* config/aarch64/tuning_models/cortexa73.h: New file.
* config/aarch64/tuning_models/emag.h: New file.
* config/aarch64/tuning_models/exynosm1.h: New file.
* config/aarch64/tuning_models/generic.h: New file.
* config/aarch64/tuning_models/neoverse512tvb.h: New file.
* config/aarch64/tuning_models/neoversen1.h: New file.
* config/aarch64/tuning_models/neoversen2.h: New file.
* config/aarch64/tuning_models/neoversev1.h: New file.
* config/aarch64/tuning_models/neoversev2.h: New file.
* config/aarch64/tuning_models/qdf24xx.h: New file.
* config/aarch64/tuning_models/saphira.h: New file.
* config/aarch64/tuning_models/thunderx.h: New file.
* config/aarch64/tuning_models/thunderx2t99.h: New file.
* config/aarch64/tuning_models/thunderx3t110.h: New file.
* config/aarch64/tuning_models/thunderxt88.h: New file.
* config/aarch64/tuning_models/tsv110.h: New file.
* config/aarch64/tuning_models/xgene1.h: New file.
Diffstat (limited to 'gcc/config/aarch64/tuning_models/neoversev1.h')
-rw-r--r-- | gcc/config/aarch64/tuning_models/neoversev1.h | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h new file mode 100644 index 0000000..584a500 --- /dev/null +++ b/gcc/config/aarch64/tuning_models/neoversev1.h @@ -0,0 +1,237 @@ +/* Tuning model description for AArch64 architecture. + Copyright (C) 2009-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_H_NEOVERSEV1 +#define GCC_AARCH64_H_NEOVERSEV1 + +#include "generic.h" + +static const struct cpu_addrcost_table neoversev1_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 3, /* post_modify_ld3_st3 */ + 3, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + +static const struct cpu_regmove_cost neoversev1_regmove_cost = +{ + 1, /* GP2GP */ + /* Spilling to int<->fp instead of memory is recommended so set + realistic costs compared to memmov_cost. */ + 3, /* GP2FP */ + 2, /* FP2GP */ + 2 /* FP2FP */ +}; + +static const advsimd_vec_cost neoversev1_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 4, /* ld2_st2_permute_cost */ + 4, /* ld3_st3_permute_cost */ + 5, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + 4, /* reduc_i8_cost */ + 4, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 6, /* reduc_f16_cost */ + 3, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* This depends very much on what the scalar value is and + where it comes from. E.g. some constants take two dependent + instructions or a load, while others might be moved from a GPR. + 4 seems to be a reasonable compromise in practice. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static const sve_vec_cost neoversev1_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 4, /* ld2_st2_permute_cost */ + 7, /* ld3_st3_permute_cost */ + 8, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + /* Theoretically, a reduction involving 31 scalar ADDs could + complete in ~9 cycles and would have a cost of 31. [SU]ADDV + completes in 14 cycles, so give it a cost of 31 + 5. */ + 36, /* reduc_i8_cost */ + /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */ + 22, /* reduc_i16_cost */ + /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */ + 14, /* reduc_i32_cost */ + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */ + 11, /* reduc_i64_cost */ + /* Theoretically, a reduction involving 15 scalar FADDs could + complete in ~9 cycles and would have a cost of 30. FADDV + completes in 13 cycles, so give it a cost of 30 + 4. */ + 34, /* reduc_f16_cost */ + /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */ + 19, /* reduc_f32_cost */ + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */ + 11, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* See the comment above the Advanced SIMD versions. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 19, /* fadda_f16_cost */ + 11, /* fadda_f32_cost */ + 8, /* fadda_f64_cost */ + 32, /* gather_load_x32_cost */ + 16, /* gather_load_x64_cost */ + 3 /* scatter_store_elt_cost */ +}; + +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = +{ + { + { + 2, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 1, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversev1_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoversev1_sve_issue_info +}; + +/* Neoverse V1 costs for vector insn classes. */ +static const struct cpu_vector_cost neoversev1_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &neoversev1_advsimd_vector_cost, /* advsimd */ + &neoversev1_sve_vector_cost, /* sve */ + &neoversev1_vec_issue_info /* issue_info */ +}; + +static const struct tune_params neoversev1_tunings = +{ + &cortexa76_extra_costs, + &neoversev1_addrcost_table, + &neoversev1_regmove_cost, + &neoversev1_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_256, /* sve_width */ + { 4, /* load_int. */ + 2, /* store_int. */ + 6, /* load_fp. */ + 2, /* store_fp. */ + 6, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ + 3, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 4, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + &generic_prefetch_tune, + AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ + AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ +}; + + +#endif /* GCC_AARCH64_H_NEOVERSEV1. */ |