AArch64: Refactor costs models to different files.

This patch series attempts to move the generic cost model in AArch64 to a new and modern generic standard. The current standard is quite old and generates very suboptimal code out of the box for user of GCC. The goal is for the new cost model to be beneficial on newer/current Arm Microarchitectures while not being too negative for older ones. It does not change any core specific optimization. The final changes reflect both performance optimizations and size optimizations. This first patch just re-organizes the cost structures to their own files. The AArch64.cc file has gotten very big and it's hard to follow. No functional changes are expected from this change. Note that since all the structures have private visibility I've put them in header files instead. gcc/ChangeLog: PR target/111370 * config/aarch64/aarch64.cc (generic_addrcost_table, exynosm1_addrcost_table, xgene1_addrcost_table, thunderx2t99_addrcost_table, thunderx3t110_addrcost_table, tsv110_addrcost_table, qdf24xx_addrcost_table, a64fx_addrcost_table, neoversev1_addrcost_table, neoversen2_addrcost_table, neoversev2_addrcost_table, generic_regmove_cost, cortexa57_regmove_cost, cortexa53_regmove_cost, exynosm1_regmove_cost, thunderx_regmove_cost, xgene1_regmove_cost, qdf24xx_regmove_cost, thunderx2t99_regmove_cost, thunderx3t110_regmove_cost, tsv110_regmove_cost, a64fx_regmove_cost, neoversen2_regmove_cost, neoversev1_regmove_cost, neoversev2_regmove_cost, generic_vector_cost, a64fx_vector_cost, qdf24xx_vector_cost, thunderx_vector_cost, tsv110_vector_cost, cortexa57_vector_cost, exynosm1_vector_cost, xgene1_vector_cost, thunderx2t99_vector_cost, thunderx3t110_vector_cost, ampere1_vector_cost, generic_branch_cost, generic_tunings, cortexa35_tunings, cortexa53_tunings, cortexa57_tunings, cortexa72_tunings, cortexa73_tunings, exynosm1_tunings, thunderxt88_tunings, thunderx_tunings, tsv110_tunings, xgene1_tunings, emag_tunings, qdf24xx_tunings, saphira_tunings, thunderx2t99_tunings, thunderx3t110_tunings, neoversen1_tunings, ampere1_tunings, ampere1a_tunings, neoversev1_vector_cost, neoversev1_tunings, neoverse512tvb_vector_cost, neoverse512tvb_tunings, neoversen2_vector_cost, neoversen2_tunings, neoversev2_vector_cost, neoversev2_tunings a64fx_tunings): Split into own files. * config/aarch64/tuning_models/a64fx.h: New file. * config/aarch64/tuning_models/ampere1.h: New file. * config/aarch64/tuning_models/ampere1a.h: New file. * config/aarch64/tuning_models/cortexa35.h: New file. * config/aarch64/tuning_models/cortexa53.h: New file. * config/aarch64/tuning_models/cortexa57.h: New file. * config/aarch64/tuning_models/cortexa72.h: New file. * config/aarch64/tuning_models/cortexa73.h: New file. * config/aarch64/tuning_models/emag.h: New file. * config/aarch64/tuning_models/exynosm1.h: New file. * config/aarch64/tuning_models/generic.h: New file. * config/aarch64/tuning_models/neoverse512tvb.h: New file. * config/aarch64/tuning_models/neoversen1.h: New file. * config/aarch64/tuning_models/neoversen2.h: New file. * config/aarch64/tuning_models/neoversev1.h: New file. * config/aarch64/tuning_models/neoversev2.h: New file. * config/aarch64/tuning_models/qdf24xx.h: New file. * config/aarch64/tuning_models/saphira.h: New file. * config/aarch64/tuning_models/thunderx.h: New file. * config/aarch64/tuning_models/thunderx2t99.h: New file. * config/aarch64/tuning_models/thunderx3t110.h: New file. * config/aarch64/tuning_models/thunderxt88.h: New file. * config/aarch64/tuning_models/tsv110.h: New file. * config/aarch64/tuning_models/xgene1.h: New file.
author: Tamar Christina <tamar.christina@arm.com> 2023-11-21 13:19:36 +0000
committer: Tamar Christina <tamar.christina@arm.com> 2023-11-21 13:19:36 +0000
commit: 4b6da8e7bdb93d9bca6291157db1c936ac56e7af (patch)
tree: 2f54d2ae09b85837b3efb595a35961d19b920f23 /gcc/config/aarch64/tuning_models/neoversev1.h
parent: f26f92b534f9d68371322071f309ef3e0e95f38c (diff)
download: gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.zip
gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.gz
gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.bz2
1 files changed, 237 insertions, 0 deletions
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
new file mode 100644
index 0000000..584a500
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -0,0 +1,237 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV1
+#define GCC_AARCH64_H_NEOVERSEV1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev1_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  3, /* post_modify_ld3_st3  */
+  3, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversev1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  4, /* ld2_st2_permute_cost */
+  4, /* ld3_st3_permute_cost  */
+  5, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversev1_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    4, /* ld2_st2_permute_cost  */
+    7, /* ld3_st3_permute_cost  */
+    8, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 31 scalar ADDs could
+       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
+       completes in 14 cycles, so give it a cost of 31 + 5.  */
+    36, /* reduc_i8_cost  */
+    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
+    22, /* reduc_i16_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
+    14, /* reduc_i32_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
+    11, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 15 scalar FADDs could
+       complete in ~9 cycles and would have a cost of 30.  FADDV
+       completes in 13 cycles, so give it a cost of 30 + 4.  */
+    34, /* reduc_f16_cost  */
+    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
+    19, /* reduc_f32_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
+    11, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  19, /* fadda_f16_cost  */
+  11, /* fadda_f32_cost  */
+  8, /* fadda_f64_cost  */
+  32, /* gather_load_x32_cost  */
+  16, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  4, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
+{
+  {
+    {
+      2, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      2, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  1, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversev1_vec_issue_info =
+{
+  &neoversev1_scalar_issue_info,
+  &neoversev1_advsimd_issue_info,
+  &neoversev1_sve_issue_info
+};
+
+/* Neoverse V1 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversev1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev1_advsimd_vector_cost, /* advsimd  */
+  &neoversev1_sve_vector_cost, /* sve  */
+  &neoversev1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversev1_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev1_addrcost_table,
+  &neoversev1_regmove_cost,
+  &neoversev1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+
+#endif /* GCC_AARCH64_H_NEOVERSEV1.  */
author	Tamar Christina <tamar.christina@arm.com>	2023-11-21 13:19:36 +0000
committer	Tamar Christina <tamar.christina@arm.com>	2023-11-21 13:19:36 +0000
commit	4b6da8e7bdb93d9bca6291157db1c936ac56e7af (patch)
tree	2f54d2ae09b85837b3efb595a35961d19b920f23 /gcc/config/aarch64/tuning_models/neoversev1.h
parent	f26f92b534f9d68371322071f309ef3e0e95f38c (diff)
download	gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.zip gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.gz gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.bz2