| author | Tamar Christina <tamar.christina@arm.com> | 2023-11-21 13:19:36 +0000 |
| --- | --- | --- |
| committer | Tamar Christina <tamar.christina@arm.com> | 2023-11-21 13:19:36 +0000 |
| commit | 4b6da8e7bdb93d9bca6291157db1c936ac56e7af (patch) | |
| tree | 2f54d2ae09b85837b3efb595a35961d19b920f23 /gcc/config/aarch64/aarch64.cc | |
| parent | f26f92b534f9d68371322071f309ef3e0e95f38c (diff) | |
| download | gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.zip, gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.gz, gcc-4b6da8e7bdb93d9bca6291157db1c936ac56e7af.tar.bz2 | |
AArch64: Refactor cost models to different files.
This patch series attempts to move the generic cost model in AArch64 to a new
and more modern baseline. The current model is quite old and generates
very suboptimal code out of the box for users of GCC.
The goal is for the new cost model to be beneficial on newer/current Arm
microarchitectures while not being too detrimental to older ones.
It does not change any core-specific optimizations. The final changes reflect
both performance optimizations and size optimizations.
This first patch just re-organizes the cost structures into their own files.
The aarch64.cc file has gotten very big and is hard to follow.
No functional changes are expected from this change. Note that since all the
structures have private (static) visibility, I've put them in header files
rather than separate .cc files.
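That last point determines the shape of the split: because the tables keep their file-local linkage, each new tuning_models/*.h header simply defines them and aarch64.cc includes it. Below is a minimal sketch of what such a header might look like; the guard name is an assumption, cpu_addrcost_table is assumed to be visible via aarch64-protos.h, and the values are the generic_addrcost_table entries that the diff below deletes from aarch64.cc.

```cpp
/* Illustrative sketch only; the real gcc/config/aarch64/tuning_models/generic.h
   added by this commit is not shown on this page.  */
#ifndef GCC_AARCH64_TUNING_GENERIC_H   /* guard name is an assumption */
#define GCC_AARCH64_TUNING_GENERIC_H

/* Address-generation costs for the generic tuning, as deleted from
   aarch64.cc below.  The definition stays "static", so every translation
   unit that includes this header gets its own copy, which is why a header
   (rather than a new .cc file) is used.  */
static const struct cpu_addrcost_table generic_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0  /* imm_offset  */
};

#endif /* GCC_AARCH64_TUNING_GENERIC_H */
```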
gcc/ChangeLog:
PR target/111370
* config/aarch64/aarch64.cc (generic_addrcost_table,
exynosm1_addrcost_table,
xgene1_addrcost_table,
thunderx2t99_addrcost_table,
thunderx3t110_addrcost_table,
tsv110_addrcost_table,
qdf24xx_addrcost_table,
a64fx_addrcost_table,
neoversev1_addrcost_table,
neoversen2_addrcost_table,
neoversev2_addrcost_table,
generic_regmove_cost,
cortexa57_regmove_cost,
cortexa53_regmove_cost,
exynosm1_regmove_cost,
thunderx_regmove_cost,
xgene1_regmove_cost,
qdf24xx_regmove_cost,
thunderx2t99_regmove_cost,
thunderx3t110_regmove_cost,
tsv110_regmove_cost,
a64fx_regmove_cost,
neoversen2_regmove_cost,
neoversev1_regmove_cost,
neoversev2_regmove_cost,
generic_vector_cost,
a64fx_vector_cost,
qdf24xx_vector_cost,
thunderx_vector_cost,
tsv110_vector_cost,
cortexa57_vector_cost,
exynosm1_vector_cost,
xgene1_vector_cost,
thunderx2t99_vector_cost,
thunderx3t110_vector_cost,
ampere1_vector_cost,
generic_branch_cost,
generic_tunings,
cortexa35_tunings,
cortexa53_tunings,
cortexa57_tunings,
cortexa72_tunings,
cortexa73_tunings,
exynosm1_tunings,
thunderxt88_tunings,
thunderx_tunings,
tsv110_tunings,
xgene1_tunings,
emag_tunings,
qdf24xx_tunings,
saphira_tunings,
thunderx2t99_tunings,
thunderx3t110_tunings,
neoversen1_tunings,
ampere1_tunings,
ampere1a_tunings,
neoversev1_vector_cost,
neoversev1_tunings,
neoverse512tvb_vector_cost,
neoverse512tvb_tunings,
neoversen2_vector_cost,
neoversen2_tunings,
neoversev2_vector_cost,
neoversev2_tunings,
a64fx_tunings): Split into their own files.
* config/aarch64/tuning_models/a64fx.h: New file.
* config/aarch64/tuning_models/ampere1.h: New file.
* config/aarch64/tuning_models/ampere1a.h: New file.
* config/aarch64/tuning_models/cortexa35.h: New file.
* config/aarch64/tuning_models/cortexa53.h: New file.
* config/aarch64/tuning_models/cortexa57.h: New file.
* config/aarch64/tuning_models/cortexa72.h: New file.
* config/aarch64/tuning_models/cortexa73.h: New file.
* config/aarch64/tuning_models/emag.h: New file.
* config/aarch64/tuning_models/exynosm1.h: New file.
* config/aarch64/tuning_models/generic.h: New file.
* config/aarch64/tuning_models/neoverse512tvb.h: New file.
* config/aarch64/tuning_models/neoversen1.h: New file.
* config/aarch64/tuning_models/neoversen2.h: New file.
* config/aarch64/tuning_models/neoversev1.h: New file.
* config/aarch64/tuning_models/neoversev2.h: New file.
* config/aarch64/tuning_models/qdf24xx.h: New file.
* config/aarch64/tuning_models/saphira.h: New file.
* config/aarch64/tuning_models/thunderx.h: New file.
* config/aarch64/tuning_models/thunderx2t99.h: New file.
* config/aarch64/tuning_models/thunderx3t110.h: New file.
* config/aarch64/tuning_models/thunderxt88.h: New file.
* config/aarch64/tuning_models/tsv110.h: New file.
* config/aarch64/tuning_models/xgene1.h: New file.
Diffstat (limited to 'gcc/config/aarch64/aarch64.cc')
| -rw-r--r-- | gcc/config/aarch64/aarch64.cc | 2423 |
1 file changed, 24 insertions(+), 2399 deletions(-)
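The 24 inserted lines are not readable in the collapsed hunk below, but given the ChangeLog they are presumably the #include directives that pull the new headers back into aarch64.cc. A hypothetical excerpt:

```cpp
/* Hypothetical shape of the insertions in gcc/config/aarch64/aarch64.cc:
   the deleted definitions are replaced by includes of the new per-core
   headers.  */
#include "tuning_models/generic.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/neoversev1.h"
/* ...one include per header listed in the ChangeLog above.  */
```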
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 72e1350..3e1f004 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -354,2405 +354,30 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 };
 
 /* Tuning parameters.  */
-
-static const struct cpu_addrcost_table generic_addrcost_table =
-{
-  {
-    1, /* hi  */
-    0, /* si  */
-    0, /* di  */
-    1, /* ti  */
-  },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
[Remainder of the hunk: the per-core addrcost, regmove, vector-cost, approximation-mode, prefetch and tune_params tables listed in the ChangeLog above are deleted here in the same way; they now live in gcc/config/aarch64/tuning_models/*.h.]
*/ -}; - -static const advsimd_vec_cost neoversev2_advsimd_vector_cost = -{ - 2, /* int_stmt_cost */ - 2, /* fp_stmt_cost */ - 2, /* ld2_st2_permute_cost */ - 2, /* ld3_st3_permute_cost */ - 3, /* ld4_st4_permute_cost */ - 3, /* permute_cost */ - 4, /* reduc_i8_cost */ - 4, /* reduc_i16_cost */ - 2, /* reduc_i32_cost */ - 2, /* reduc_i64_cost */ - 6, /* reduc_f16_cost */ - 3, /* reduc_f32_cost */ - 2, /* reduc_f64_cost */ - 2, /* store_elt_extra_cost */ - /* This value is just inherited from the Cortex-A57 table. */ - 8, /* vec_to_scalar_cost */ - /* This depends very much on what the scalar value is and - where it comes from. E.g. some constants take two dependent - instructions or a load, while others might be moved from a GPR. - 4 seems to be a reasonable compromise in practice. */ - 4, /* scalar_to_vec_cost */ - 4, /* align_load_cost */ - 4, /* unalign_load_cost */ - /* Although stores have a latency of 2 and compete for the - vector pipes, in practice it's better not to model that. */ - 1, /* unalign_store_cost */ - 1 /* store_cost */ -}; - -static const sve_vec_cost neoversev2_sve_vector_cost = -{ - { - 2, /* int_stmt_cost */ - 2, /* fp_stmt_cost */ - 3, /* ld2_st2_permute_cost */ - 3, /* ld3_st3_permute_cost */ - 4, /* ld4_st4_permute_cost */ - 3, /* permute_cost */ - /* Theoretically, a reduction involving 15 scalar ADDs could - complete in ~3 cycles and would have a cost of 15. [SU]ADDV - completes in 11 cycles, so give it a cost of 15 + 8. */ - 21, /* reduc_i8_cost */ - /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */ - 14, /* reduc_i16_cost */ - /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */ - 7, /* reduc_i32_cost */ - /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */ - 2, /* reduc_i64_cost */ - /* Theoretically, a reduction involving 7 scalar FADDs could - complete in ~6 cycles and would have a cost of 14. FADDV - completes in 8 cycles, so give it a cost of 14 + 2. */ - 16, /* reduc_f16_cost */ - /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ - 8, /* reduc_f32_cost */ - /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */ - 4, /* reduc_f64_cost */ - 2, /* store_elt_extra_cost */ - /* This value is just inherited from the Cortex-A57 table. */ - 8, /* vec_to_scalar_cost */ - /* See the comment above the Advanced SIMD versions. */ - 4, /* scalar_to_vec_cost */ - 4, /* align_load_cost */ - 4, /* unalign_load_cost */ - /* Although stores have a latency of 2 and compete for the - vector pipes, in practice it's better not to model that. */ - 1, /* unalign_store_cost */ - 1 /* store_cost */ - }, - 3, /* clast_cost */ - 10, /* fadda_f16_cost */ - 6, /* fadda_f32_cost */ - 4, /* fadda_f64_cost */ - /* A strided Advanced SIMD x64 load would take two parallel FP loads - (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather - is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads - (cost 8) and a vec_construct (cost 2). Add a full vector operation - (cost 2) to that, to avoid the difference being lost in rounding. - - There is no easy comparison between a strided Advanced SIMD x32 load - and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector - operation more than a 64-bit gather. 
*/ - 14, /* gather_load_x32_cost */ - 12, /* gather_load_x64_cost */ - 3 /* scatter_store_elt_cost */ -}; - -static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info = -{ - 3, /* loads_stores_per_cycle */ - 2, /* stores_per_cycle */ - 6, /* general_ops_per_cycle */ - 0, /* fp_simd_load_general_ops */ - 1 /* fp_simd_store_general_ops */ -}; - -static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info = -{ - { - 3, /* loads_stores_per_cycle */ - 2, /* stores_per_cycle */ - 4, /* general_ops_per_cycle */ - 0, /* fp_simd_load_general_ops */ - 1 /* fp_simd_store_general_ops */ - }, - 2, /* ld2_st2_general_ops */ - 2, /* ld3_st3_general_ops */ - 3 /* ld4_st4_general_ops */ -}; - -static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info = -{ - { - { - 3, /* loads_per_cycle */ - 2, /* stores_per_cycle */ - 4, /* general_ops_per_cycle */ - 0, /* fp_simd_load_general_ops */ - 1 /* fp_simd_store_general_ops */ - }, - 2, /* ld2_st2_general_ops */ - 3, /* ld3_st3_general_ops */ - 3 /* ld4_st4_general_ops */ - }, - 2, /* pred_ops_per_cycle */ - 2, /* while_pred_ops */ - 2, /* int_cmp_pred_ops */ - 1, /* fp_cmp_pred_ops */ - 1, /* gather_scatter_pair_general_ops */ - 1 /* gather_scatter_pair_pred_ops */ -}; - -static const aarch64_vec_issue_info neoversev2_vec_issue_info = -{ - &neoversev2_scalar_issue_info, - &neoversev2_advsimd_issue_info, - &neoversev2_sve_issue_info -}; - -/* Demeter costs for vector insn classes. */ -static const struct cpu_vector_cost neoversev2_vector_cost = -{ - 1, /* scalar_int_stmt_cost */ - 2, /* scalar_fp_stmt_cost */ - 4, /* scalar_load_cost */ - 1, /* scalar_store_cost */ - 1, /* cond_taken_branch_cost */ - 1, /* cond_not_taken_branch_cost */ - &neoversev2_advsimd_vector_cost, /* advsimd */ - &neoversev2_sve_vector_cost, /* sve */ - &neoversev2_vec_issue_info /* issue_info */ -}; - -static const struct tune_params neoversev2_tunings = -{ - &cortexa76_extra_costs, - &neoversev2_addrcost_table, - &neoversev2_regmove_cost, - &neoversev2_vector_cost, - &generic_branch_cost, - &generic_approx_modes, - SVE_128, /* sve_width */ - { 4, /* load_int. */ - 2, /* store_int. */ - 6, /* load_fp. */ - 1, /* store_fp. */ - 6, /* load_pred. */ - 2 /* store_pred. */ - }, /* memmov_cost. */ - 5, /* issue_rate */ - (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ - "32:16", /* function_align. */ - "4", /* jump_align. */ - "32:16", /* loop_align. */ - 3, /* int_reassoc_width. */ - 6, /* fp_reassoc_width. */ - 4, /* fma_reassoc_width. */ - 3, /* vec_reassoc_width. */ - 2, /* min_div_recip_mul_sf. */ - 2, /* min_div_recip_mul_df. */ - 0, /* max_case_values. */ - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND - | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS - | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ - &generic_prefetch_tune, - AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ - AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ -}; - -static const struct tune_params a64fx_tunings = -{ - &a64fx_extra_costs, - &a64fx_addrcost_table, - &a64fx_regmove_cost, - &a64fx_vector_cost, - &generic_branch_cost, - &generic_approx_modes, - SVE_512, /* sve_width */ - { 4, /* load_int. */ - 4, /* store_int. */ - 4, /* load_fp. */ - 4, /* store_fp. */ - 4, /* load_pred. */ - 4 /* store_pred. */ - }, /* memmov_cost. 
*/ - 7, /* issue_rate */ - (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ - "32", /* function_align. */ - "16", /* jump_align. */ - "32", /* loop_align. */ - 4, /* int_reassoc_width. */ - 2, /* fp_reassoc_width. */ - 1, /* fma_reassoc_width. */ - 2, /* vec_reassoc_width. */ - 2, /* min_div_recip_mul_sf. */ - 2, /* min_div_recip_mul_df. */ - 0, /* max_case_values. */ - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ - &a64fx_prefetch_tune, - AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ - AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ -}; +#include "tuning_models/generic.h" +#include "tuning_models/cortexa35.h" +#include "tuning_models/cortexa53.h" +#include "tuning_models/cortexa57.h" +#include "tuning_models/cortexa72.h" +#include "tuning_models/cortexa73.h" +#include "tuning_models/exynosm1.h" +#include "tuning_models/thunderxt88.h" +#include "tuning_models/thunderx.h" +#include "tuning_models/tsv110.h" +#include "tuning_models/xgene1.h" +#include "tuning_models/emag.h" +#include "tuning_models/qdf24xx.h" +#include "tuning_models/saphira.h" +#include "tuning_models/thunderx2t99.h" +#include "tuning_models/thunderx3t110.h" +#include "tuning_models/neoversen1.h" +#include "tuning_models/ampere1.h" +#include "tuning_models/ampere1a.h" +#include "tuning_models/neoversev1.h" +#include "tuning_models/neoverse512tvb.h" +#include "tuning_models/neoversen2.h" +#include "tuning_models/neoversev2.h" +#include "tuning_models/a64fx.h" /* Support for fine-grained override of the tuning structures. */ struct aarch64_tuning_override_function |
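The diff only shows the structures leaving aarch64.cc, not the contents of the new headers, so here is a minimal sketch of the shape one of them presumably takes. The include-guard name and the choice of tuning_models/neoversev2.h are illustrative only; the field values are copied from the hunk above, and the advsimd/sve/issue tables the struct points to are assumed to move into the same header unchanged.

/* Hypothetical sketch of gcc/config/aarch64/tuning_models/neoversev2.h.
   The guard macro name is illustrative, not taken from the patch.  */
#ifndef GCC_AARCH64_TUNING_MODELS_NEOVERSEV2_H
#define GCC_AARCH64_TUNING_MODELS_NEOVERSEV2_H

/* Demeter costs for vector insn classes (values as in the removed hunk;
   the referenced advsimd/sve/issue tables are assumed to live here too).  */
static const struct cpu_vector_cost neoversev2_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  2, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* cond_taken_branch_cost */
  1, /* cond_not_taken_branch_cost */
  &neoversev2_advsimd_vector_cost, /* advsimd */
  &neoversev2_sve_vector_cost, /* sve */
  &neoversev2_vec_issue_info /* issue_info */
};

#endif /* GCC_AARCH64_TUNING_MODELS_NEOVERSEV2_H */

With that layout, aarch64.cc only needs the #include block added above; generic.h plausibly comes first because the per-core tunings keep referring to shared tables such as generic_branch_cost and generic_prefetch_tune.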
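The gather_load_x32_cost and gather_load_x64_cost entries in the SVE tables above follow the arithmetic spelled out in their comments. The snippet below merely restates that reasoning as a compile-time check; the constexpr names are illustrative and are not GCC identifiers.

/* Sketch: re-derive the gather-load costs from the commit's own comments.  */
constexpr int scalar_load_cost    = 4; /* per scalar load, as in the tables above */
constexpr int vec_construct_cost  = 2;
constexpr int full_vector_op_cost = 2;

/* Strided Advanced SIMD x64 load: 2 scalar loads (cost 8) + vec_construct
   (cost 2), plus one full vector op so the difference is not lost in
   rounding.  */
constexpr int gather_load_x64_cost =
  2 * scalar_load_cost + vec_construct_cost + full_vector_op_cost; /* = 12 */

/* A 32-bit SVE gather is costed one vector operation more.  */
constexpr int gather_load_x32_cost =
  gather_load_x64_cost + full_vector_op_cost; /* = 14 */

static_assert (gather_load_x64_cost == 12, "matches the table entry above");
static_assert (gather_load_x32_cost == 14, "matches the table entry above");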