AArch64: Update Neoverse V2 cost model to release costs

This updates the cost for Neoverse V2 to reflect the updated Software Optimization Guide. It also makes Cortex-X3 use the Neoverse V2 cost model. gcc/ChangeLog: * config/aarch64/aarch64-cores.def (cortex-x3): Use Neoverse-V2 costs. * config/aarch64/tuning_models/neoversev2.h: Update costs.
author: Tamar Christina <tamar.christina@arm.com> 2024-08-01 16:53:22 +0100
committer: Tamar Christina <tamar.christina@arm.com> 2024-08-01 16:53:22 +0100
commit: 7e7c1e38829d45667748db68f15584bdd16fcad6 (patch)
tree: 3da2dfe4f92d877047350516c1d7202267e56cc3
parent: c5ccdfdcab0b24afba2a661af861bec1d63f0595 (diff)
download: gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.zip
gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.tar.gz
gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.tar.bz2
2 files changed, 20 insertions, 20 deletions
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index e58bc0f..34307fe 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -186,7 +186,7 @@ AARCH64_CORE("cortex-a720",  cortexa720, cortexa57, V9_2A,  (SVE2_BITPERM, MEMTA
 
 AARCH64_CORE("cortex-x2",  cortexx2, cortexa57, V9A,  (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd48, -1)
 
-AARCH64_CORE("cortex-x3",  cortexx3, cortexa57, V9A,  (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd4e, -1)
+AARCH64_CORE("cortex-x3",  cortexx3, cortexa57, V9A,  (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversev2, 0x41, 0xd4e, -1)
 
 AARCH64_CORE("cortex-x4",  cortexx4, cortexa57, V9_2A,  (SVE2_BITPERM, MEMTAG, PROFILE), neoversen2, 0x41, 0xd81, -1)
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index f76e4ef..c9c3019 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -57,13 +57,13 @@ static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
   2, /* ld2_st2_permute_cost */
   2, /* ld3_st3_permute_cost  */
   3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
+  2, /* permute_cost  */
   4, /* reduc_i8_cost  */
   4, /* reduc_i16_cost  */
   2, /* reduc_i32_cost  */
   2, /* reduc_i64_cost  */
   6, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
+  4, /* reduc_f32_cost  */
   2, /* reduc_f64_cost  */
   2, /* store_elt_extra_cost  */
   /* This value is just inherited from the Cortex-A57 table.  */
@@ -86,22 +86,22 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
   {
     2, /* int_stmt_cost  */
     2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
+    2, /* ld2_st2_permute_cost  */
     3, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
+    3, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
     /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 8.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
-    14, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
+       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 9 cycles, so give it a cost of 15 + 4.  */
+    19, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
+    12, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
     7, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
+    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
+    4, /* reduc_i64_cost  */
     /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~6 cycles and would have a cost of 14.  FADDV
+       complete in ~6 cycles and would have a cost of  14.  FADDV
        completes in 8 cycles, so give it a cost of 14 + 2.  */
     16, /* reduc_f16_cost  */
     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
@@ -127,7 +127,7 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
   /* A strided Advanced SIMD x64 load would take two parallel FP loads
      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
      (cost 2) to that, to avoid the difference being lost in rounding.
 
      There is no easy comparison between a strided Advanced SIMD x32 load
@@ -165,14 +165,14 @@ static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
 {
   {
     {
-      3, /* loads_per_cycle  */
+      3, /* loads_stores_per_cycle  */
       2, /* stores_per_cycle  */
       4, /* general_ops_per_cycle  */
       0, /* fp_simd_load_general_ops  */
       1 /* fp_simd_store_general_ops  */
     },
     2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
+    2, /* ld3_st3_general_ops  */
     3 /* ld4_st4_general_ops  */
   },
   2, /* pred_ops_per_cycle  */
@@ -190,7 +190,7 @@ static const aarch64_vec_issue_info neoversev2_vec_issue_info =
   &neoversev2_sve_issue_info
 };
 
-/* Demeter costs for vector insn classes.  */
+/* Neoversev2 costs for vector insn classes.  */
 static const struct cpu_vector_cost neoversev2_vector_cost =
 {
   1, /* scalar_int_stmt_cost  */
@@ -243,4 +243,4 @@ static const struct tune_params neoversev2_tunings =
   AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
 };
 
-#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
+#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
+\ No newline at end of file
author	Tamar Christina <tamar.christina@arm.com>	2024-08-01 16:53:22 +0100
committer	Tamar Christina <tamar.christina@arm.com>	2024-08-01 16:53:22 +0100
commit	7e7c1e38829d45667748db68f15584bdd16fcad6 (patch)
tree	3da2dfe4f92d877047350516c1d7202267e56cc3
parent	c5ccdfdcab0b24afba2a661af861bec1d63f0595 (diff)
download	gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.zip gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.tar.gz gcc-7e7c1e38829d45667748db68f15584bdd16fcad6.tar.bz2