author	Jennifer Schmitz <jschmitz@nvidia.com>	2024-11-26 00:43:48 -0800
committer	Jennifer Schmitz <jschmitz@nvidia.com>	2025-01-07 09:02:29 +0100
commit	70035b6c13852435d7ae396c0762ee26897d4d45 (patch)
tree	7215b77c12b388ff326e3d5d7cbd4608409a7b21 /gcc
parent	e53277d849a13a8a36f488f6725700311c74080e (diff)
AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
the use_new_vector_costs entry in aarch64-tuning-flags.def, and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the default.
To that end, the function aarch64_use_new_vector_costs_p and its uses were
removed.

To prevent vec_to_scalar operations from being costed as 0, as described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html, we
adjusted vectorizable_store such that the variable n_adjacent_stores also
covers vec_to_scalar operations.  This way vec_to_scalar operations are
costed as a group rather than individually.

As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits == 1 + 1X as a vector
rather than a scalar.

Two tests were adjusted due to changes in codegen: in both cases, the old
code unrolled the loop once, whereas the new code does not.

Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with -O2
-ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):

f_int64_t_32:
        cbz     w3, .L92
        mov     x4, 0
        uxtw    x3, w3
+       cntd    x5
+       whilelo p7.d, xzr, x3
+       mov     z29.s, w5
        mov     z31.s, w2
-       whilelo p6.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p7.d, xzr, x2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
.L94:
-       ld1d    z27.d, p7/z, [x0, #1, mul vl]
-       ld1d    z28.d, p6/z, [x0]
-       movprfx z29, z31
-       mul     z29.s, p5/m, z29.s, z30.s
-       incw    x4
-       uunpklo z0.d, z29.s
-       uunpkhi z29.d, z29.s
-       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
-       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
-       add     z25.d, z28.d, z25.d
+       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
+       movprfx z28, z31
+       mul     z28.s, p6/m, z28.s, z30.s
+       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
        add     z26.d, z27.d, z26.d
-       st1d    z26.d, p7, [x0, #1, mul vl]
-       whilelo p7.d, x4, x2
-       st1d    z25.d, p6, [x0]
-       incw    z30.s
-       incb    x0, all, mul #2
-       whilelo p6.d, x4, x3
+       st1d    z26.d, p7, [x0, x4, lsl 3]
+       add     z30.s, z30.s, z29.s
+       incd    x4
+       whilelo p7.d, x4, x3
        b.any   .L94
.L92:
        ret

Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with -O2
-ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):

f_int64_t_32:
        cbz     w3, .L84
-       addvl   x5, x1, #1
        mov     x4, 0
        uxtw    x3, w3
-       mov     z31.s, w2
+       cntd    x5
        whilelo p7.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p6.d, xzr, x2
+       mov     z29.s, w5
+       mov     z31.s, w2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
.L86:
-       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
-       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
-       movprfx z29, z30
-       mul     z29.s, p5/m, z29.s, z31.s
-       add     z28.d, z28.d, #1
-       uunpklo z26.d, z29.s
-       st1d    z28.d, p7, [x0, z26.d, lsl 3]
-       incw    x4
-       uunpkhi z29.d, z29.s
+       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
+       movprfx z28, z30
+       mul     z28.s, p6/m, z28.s, z31.s
        add     z27.d, z27.d, #1
-       whilelo p6.d, x4, x2
-       st1d    z27.d, p7, [x0, z29.d, lsl 3]
-       incw    z30.s
+       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
+       incd    x4
+       add     z30.s, z30.s, z29.s
        whilelo p7.d, x4, x3
        b.any   .L86
.L84:
        ret

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* tree-vect-stmts.cc (vectorizable_store): Extend the use of
	n_adjacent_stores to also cover vec_to_scalar operations.
	* config/aarch64/aarch64-tuning-flags.def: Remove
	use_new_vector_costs as tuning option.
	* config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
	Remove.
	(aarch64_vector_costs::add_stmt_cost): Remove use of
	aarch64_use_new_vector_costs_p.
	(aarch64_vector_costs::finish_cost): Remove use of
	aarch64_use_new_vector_costs_p.
	* config/aarch64/tuning_models/cortexx925.h: Remove
	AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
	* config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
	* config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
	* config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
	* config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
	* config/aarch64/tuning_models/neoversen2.h: Likewise.
	* config/aarch64/tuning_models/neoversen3.h: Likewise.
	* config/aarch64/tuning_models/neoversev1.h: Likewise.
	* config/aarch64/tuning_models/neoversev2.h: Likewise.
	* config/aarch64/tuning_models/neoversev3.h: Likewise.
	* config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
	* gcc.target/aarch64/sve/strided_store_2.c: Likewise.
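For illustration, the loop behind f_int64_t_32 in the load test has roughly
the following shape (a hedged reconstruction, not the actual testsuite
source; the real tests are generated from macros and may differ in detail):

#include <cstdint>

/* strided_load_2.c-style kernel: 64-bit data addressed through offsets
   computed from a 32-bit stride and a 32-bit induction variable, which
   is why the offset vector is computed in .s lanes and then used as
   widened .d offsets in the gather.  */
void
f_int64_t_32 (int64_t *__restrict__ dest, int64_t *__restrict__ src,
              int32_t stride, int32_t n)
{
  for (int32_t i = 0; i < n; ++i)
    dest[i] += src[i * stride];   /* gather from src, contiguous dest */
}

The strided_store_2.c variant scatters instead, roughly
dest[i * stride] = src[i] + 1, which is why the second assembly example
ends in a scattered rather than contiguous st1d.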
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-tuning-flags.def            |  2
-rw-r--r--  gcc/config/aarch64/aarch64.cc                          | 20
-rw-r--r--  gcc/config/aarch64/tuning_models/cortexx925.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/fujitsu_monaka.h      |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/generic_armv8_a.h     |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/generic_armv9_a.h     |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoverse512tvb.h      |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversen2.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversen3.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversev1.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversev2.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversev3.h          |  1
-rw-r--r--  gcc/config/aarch64/tuning_models/neoversev3ae.h        |  1
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c  |  2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c |  2
-rw-r--r--  gcc/tree-vect-stmts.cc                                 | 40
16 files changed, 27 insertions, 50 deletions
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 9bfa1f2..7ebf390 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
-AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
-
AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e69bc7..d6a8e4c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
return new aarch64_vector_costs (vinfo, costing_for_scalar);
}
-/* Return true if the current CPU should use the new costs defined
- in GCC 11. This should be removed for GCC 12 and above, with the
- costs applying to all CPUs instead. */
-static bool
-aarch64_use_new_vector_costs_p ()
-{
- return (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
-}
-
/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Do one-time initialization based on the vinfo. */
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
- if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+ if (!m_analyzed_vinfo)
{
if (loop_vinfo)
analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Try to get a more accurate cost by looking at STMT_INFO instead
of just looking at KIND. */
- if (stmt_info && aarch64_use_new_vector_costs_p ())
+ if (stmt_info)
{
/* If we scalarize a strided store, the vectorizer costs one
vec_to_scalar for each element. However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else
m_num_last_promote_demote = 0;
- if (stmt_info && aarch64_use_new_vector_costs_p ())
+ if (stmt_info)
{
/* Account for any extra "embedded" costs that apply additively
to the base cost calculated above. */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
auto *scalar_costs
= static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
- if (loop_vinfo
- && m_vec_flags
- && aarch64_use_new_vector_costs_p ())
+ if (loop_vinfo && m_vec_flags)
{
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 2823fd8..7d0162e 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
index 3850ed7..5dc4024 100644
--- a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
+++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index e092a34..35de3f0 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 32b5d0e..f76a250 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 2530217..50eb058 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index dbfebc5..9fbc059 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index 5ca0e13..78177e7 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 8719297..f1ec7dc 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index 219016f..4fabe4d 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
| AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA), /* tune_flags. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index 092de7b..ad3cd22 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index f395799..a0adef0 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
index 762805f..c334b7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
@@ -15,4 +15,4 @@
so we vectorize the offset calculation. This means that the
64-bit version needs two copies. */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
index f0ea58e..94cc630 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
@@ -15,4 +15,4 @@
so we vectorize the offset calculation. This means that the
64-bit version needs two copies. */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2765dcd..c0e38d0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
{
if (costing_p)
{
- /* Only need vector extracting when there are more
- than one stores. */
- if (nstores > 1)
- inside_cost
- += record_stmt_cost (cost_vec, 1, vec_to_scalar,
- stmt_info, slp_node,
- 0, vect_body);
- /* Take a single lane vector type store as scalar
- store to avoid ICE like 110776. */
- if (VECTOR_TYPE_P (ltype)
- && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
- n_adjacent_stores++;
- else
- inside_cost
- += record_stmt_cost (cost_vec, 1, scalar_store,
- stmt_info, 0, vect_body);
+ n_adjacent_stores++;
continue;
}
tree newref, newoff;
@@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
if (costing_p)
{
if (n_adjacent_stores > 0)
- vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
- alignment_support_scheme, misalignment,
- &inside_cost, cost_vec);
+ {
+ /* Take a single lane vector type store as scalar
+ store to avoid ICE like 110776. */
+ if (VECTOR_TYPE_P (ltype)
+ && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+ vect_get_store_cost (vinfo, stmt_info, slp_node,
+ n_adjacent_stores, alignment_support_scheme,
+ misalignment, &inside_cost, cost_vec);
+ else
+ inside_cost
+ += record_stmt_cost (cost_vec, n_adjacent_stores,
+ scalar_store, stmt_info, 0, vect_body);
+ /* Only need vector extracting when there are more
+ than one stores. */
+ if (nstores > 1)
+ inside_cost
+ += record_stmt_cost (cost_vec, n_adjacent_stores,
+ vec_to_scalar, stmt_info, slp_node,
+ 0, vect_body);
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"vect_model_store_cost: inside_cost = %d, "