author     Jennifer Schmitz <jschmitz@nvidia.com>  2024-11-26 00:43:48 -0800
committer  Jennifer Schmitz <jschmitz@nvidia.com>  2025-01-07 09:02:29 +0100
commit     70035b6c13852435d7ae396c0762ee26897d4d45 (patch)
tree       7215b77c12b388ff326e3d5d7cbd4608409a7b21 /gcc
parent     e53277d849a13a8a36f488f6725700311c74080e (diff)
AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and its
use_new_vector_costs entry in aarch64-tuning-flags.def, and makes the code
paths previously guarded by AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS the
default in the backend. To that end, the function
aarch64_use_new_vector_costs_p and its uses were removed.
To prevent vec_to_scalar operations from being costed as 0, as described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
vectorizable_store was adjusted such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way, vec_to_scalar operations
are not costed individually, but as a group.
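
As a side note for readers unfamiliar with the costing interface: the point
of routing these operations through n_adjacent_stores is that the target's
cost hook sees a single record with a count for the whole group instead of
many single-operation records. Below is a minimal standalone C++ sketch of
that difference; the cost_entry/record_cost names are invented for
illustration and are not the vectorizer API, which instead passes the count
to record_stmt_cost as shown in the tree-vect-stmts.cc hunk at the end.

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the vectorizer's cost bookkeeping: each entry
// carries a kind and a count, mirroring how a count can be attached to one
// cost record rather than emitting one record per element.
enum cost_kind { scalar_store, vec_to_scalar };

struct cost_entry { cost_kind kind; unsigned count; };

static void
record_cost (std::vector<cost_entry> &costs, cost_kind kind, unsigned count)
{
  costs.push_back ({kind, count});
}

int
main ()
{
  unsigned n_adjacent_stores = 4;   // e.g. four lanes extracted and stored

  // Per-element scheme (sketch): one unit-count record per element, so a
  // cost hook only ever sees isolated single operations.
  std::vector<cost_entry> per_element;
  for (unsigned i = 0; i < n_adjacent_stores; ++i)
    record_cost (per_element, vec_to_scalar, 1);

  // Grouped scheme (sketch): one record covering the whole group, so a cost
  // hook can recognize adjacent operations and cost them as a unit.
  std::vector<cost_entry> grouped;
  record_cost (grouped, vec_to_scalar, n_adjacent_stores);

  std::printf ("per-element records: %zu, grouped records: %zu\n",
	       per_element.size (), grouped.size ());
  return 0;
}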
As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits == 1+1X as a vector
rather than a scalar.
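
The distinction matters for single-lane scalable vectors: with
nunits == 1+1X, known_ne (nunits, 1) is false (the value equals 1 when the
runtime factor is zero), while maybe_ne (nunits, 1) is true. The following
self-contained sketch models that semantics with a hand-rolled
two-coefficient type, assuming the indeterminate X is a nonnegative integer;
it is not GCC's poly-int.h implementation.

#include <cstdio>

// Stand-in for a degree-1 poly_int: value = coeff0 + coeff1 * X, where X is
// an unknown nonnegative runtime factor.  Only enough to illustrate the
// known_ne/maybe_ne distinction used in the patch.
struct poly1 { unsigned coeff0, coeff1; };

// known_ne: the value differs from B for every possible X.
static bool
known_ne (poly1 a, unsigned b)
{
  if (a.coeff1 == 0)
    return a.coeff0 != b;
  // The value coeff0 + coeff1 * X can only equal B if B is reachable from
  // coeff0 in whole steps of coeff1.
  return b < a.coeff0 || (b - a.coeff0) % a.coeff1 != 0;
}

// maybe_ne: there exists some X for which the value differs from B.
static bool
maybe_ne (poly1 a, unsigned b)
{
  return a.coeff1 != 0 || a.coeff0 != b;
}

int
main ()
{
  poly1 nunits = {1, 1};   // "1+1X" lanes, a single-lane scalable vector
  std::printf ("known_ne (1+1X, 1) = %d\n", known_ne (nunits, 1)); // 0
  std::printf ("maybe_ne (1+1X, 1) = %d\n", maybe_ne (nunits, 1)); // 1
  return 0;
}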
Two tests were adjusted due to changes in codegen. In both cases, the
old code unrolled the loop once, whereas the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
cbz w3, .L92
mov x4, 0
uxtw x3, w3
+ cntd x5
+ whilelo p7.d, xzr, x3
+ mov z29.s, w5
mov z31.s, w2
- whilelo p6.d, xzr, x3
- mov x2, x3
- index z30.s, #0, #1
- uqdecd x2
- ptrue p5.b, all
- whilelo p7.d, xzr, x2
+ index z30.d, #0, #1
+ ptrue p6.b, all
.p2align 3,,7
.L94:
- ld1d z27.d, p7/z, [x0, #1, mul vl]
- ld1d z28.d, p6/z, [x0]
- movprfx z29, z31
- mul z29.s, p5/m, z29.s, z30.s
- incw x4
- uunpklo z0.d, z29.s
- uunpkhi z29.d, z29.s
- ld1d z25.d, p6/z, [x1, z0.d, lsl 3]
- ld1d z26.d, p7/z, [x1, z29.d, lsl 3]
- add z25.d, z28.d, z25.d
+ ld1d z27.d, p7/z, [x0, x4, lsl 3]
+ movprfx z28, z31
+ mul z28.s, p6/m, z28.s, z30.s
+ ld1d z26.d, p7/z, [x1, z28.d, uxtw 3]
add z26.d, z27.d, z26.d
- st1d z26.d, p7, [x0, #1, mul vl]
- whilelo p7.d, x4, x2
- st1d z25.d, p6, [x0]
- incw z30.s
- incb x0, all, mul #2
- whilelo p6.d, x4, x3
+ st1d z26.d, p7, [x0, x4, lsl 3]
+ add z30.s, z30.s, z29.s
+ incd x4
+ whilelo p7.d, x4, x3
b.any .L94
.L92:
ret
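
For orientation, the loop being vectorized here is roughly the following, in
C-level form inferred from the assembly above and the function name; it is
not the literal testsuite source, which generates its functions from macros
over several element and stride types.

#include <cstdint>

// Rough equivalent of f_int64_t_32 in strided_load_2.c (inferred, not
// copied): gather src at a runtime 32-bit stride and accumulate into a
// contiguous dest.
void
f_int64_t_32 (int64_t *__restrict dest, int64_t *__restrict src,
	      int32_t stride, int32_t n)
{
  for (int32_t i = 0; i < n; ++i)
    dest[i] += src[i * stride];
}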
Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
cbz w3, .L84
- addvl x5, x1, #1
mov x4, 0
uxtw x3, w3
- mov z31.s, w2
+ cntd x5
whilelo p7.d, xzr, x3
- mov x2, x3
- index z30.s, #0, #1
- uqdecd x2
- ptrue p5.b, all
- whilelo p6.d, xzr, x2
+ mov z29.s, w5
+ mov z31.s, w2
+ index z30.d, #0, #1
+ ptrue p6.b, all
.p2align 3,,7
.L86:
- ld1d z28.d, p7/z, [x1, x4, lsl 3]
- ld1d z27.d, p6/z, [x5, x4, lsl 3]
- movprfx z29, z30
- mul z29.s, p5/m, z29.s, z31.s
- add z28.d, z28.d, #1
- uunpklo z26.d, z29.s
- st1d z28.d, p7, [x0, z26.d, lsl 3]
- incw x4
- uunpkhi z29.d, z29.s
+ ld1d z27.d, p7/z, [x1, x4, lsl 3]
+ movprfx z28, z30
+ mul z28.s, p6/m, z28.s, z31.s
add z27.d, z27.d, #1
- whilelo p6.d, x4, x2
- st1d z27.d, p7, [x0, z29.d, lsl 3]
- incw z30.s
+ st1d z27.d, p7, [x0, z28.d, uxtw 3]
+ incd x4
+ add z30.s, z30.s, z29.s
whilelo p7.d, x4, x3
b.any .L86
.L84:
ret
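
As above, a rough C-level form of this loop, inferred from the assembly and
the function name rather than copied from the testsuite source:

#include <cstdint>

// Rough equivalent of f_int64_t_32 in strided_store_2.c (inferred, not
// copied): load src contiguously and scatter to dest at a runtime 32-bit
// stride.
void
f_int64_t_32 (int64_t *__restrict dest, int64_t *__restrict src,
	      int32_t stride, int32_t n)
{
  for (int32_t i = 0; i < n; ++i)
    dest[i * stride] = src[i] + 1;
}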
The patch was bootstrapped and tested on aarch64-linux-gnu with no
regressions.
OK for mainline?
Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
gcc/
* tree-vect-stmts.cc (vectorizable_store): Extend the use of
n_adjacent_stores to also cover vec_to_scalar operations.
* config/aarch64/aarch64-tuning-flags.def: Remove
use_new_vector_costs as tuning option.
* config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
Remove.
(aarch64_vector_costs::add_stmt_cost): Remove use of
aarch64_use_new_vector_costs_p.
(aarch64_vector_costs::finish_cost): Remove use of
aarch64_use_new_vector_costs_p.
* config/aarch64/tuning_models/cortexx925.h: Remove
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
* config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
* config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
* config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
* config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
* config/aarch64/tuning_models/neoversen2.h: Likewise.
* config/aarch64/tuning_models/neoversen3.h: Likewise.
* config/aarch64/tuning_models/neoversev1.h: Likewise.
* config/aarch64/tuning_models/neoversev2.h: Likewise.
* config/aarch64/tuning_models/neoversev3.h: Likewise.
* config/aarch64/tuning_models/neoversev3ae.h: Likewise.
gcc/testsuite/
* gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
* gcc.target/aarch64/sve/strided_store_2.c: Likewise.
Diffstat (limited to 'gcc')
 gcc/config/aarch64/aarch64-tuning-flags.def            |  2
 gcc/config/aarch64/aarch64.cc                          | 20
 gcc/config/aarch64/tuning_models/cortexx925.h          |  1
 gcc/config/aarch64/tuning_models/fujitsu_monaka.h      |  1
 gcc/config/aarch64/tuning_models/generic_armv8_a.h     |  1
 gcc/config/aarch64/tuning_models/generic_armv9_a.h     |  1
 gcc/config/aarch64/tuning_models/neoverse512tvb.h      |  1
 gcc/config/aarch64/tuning_models/neoversen2.h          |  1
 gcc/config/aarch64/tuning_models/neoversen3.h          |  1
 gcc/config/aarch64/tuning_models/neoversev1.h          |  1
 gcc/config/aarch64/tuning_models/neoversev2.h          |  1
 gcc/config/aarch64/tuning_models/neoversev3.h          |  1
 gcc/config/aarch64/tuning_models/neoversev3ae.h        |  1
 gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c  |  2
 gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c |  2
 gcc/tree-vect-stmts.cc                                 | 40
 16 files changed, 27 insertions(+), 50 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 9bfa1f2..7ebf390 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
 
-AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
-
 AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
 
 AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e69bc7..d6a8e4c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
   return new aarch64_vector_costs (vinfo, costing_for_scalar);
 }
 
-/* Return true if the current CPU should use the new costs defined
-   in GCC 11.  This should be removed for GCC 12 and above, with the
-   costs applying to all CPUs instead.  */
-static bool
-aarch64_use_new_vector_costs_p ()
-{
-  return (aarch64_tune_params.extra_tuning_flags
-	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
-}
-
 /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
 static const simd_vec_cost *
 aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-  if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+  if (!m_analyzed_vinfo)
     {
       if (loop_vinfo)
 	analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Try to get a more accurate cost by looking at STMT_INFO instead
      of just looking at KIND.  */
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* If we scalarize a strided store, the vectorizer costs one
	 vec_to_scalar for each element.  However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   else
     m_num_last_promote_demote = 0;
 
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* Account for any extra "embedded" costs that apply additively
	 to the base cost calculated above.  */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
   auto *scalar_costs
     = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-  if (loop_vinfo
-      && m_vec_flags
-      && aarch64_use_new_vector_costs_p ())
+  if (loop_vinfo && m_vec_flags)
     {
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
					     m_costs[vect_body]);
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 2823fd8..7d0162e 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
index 3850ed7..5dc4024 100644
--- a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
+++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
   0,	/* max_case_values. */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags. */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index e092a34..35de3f0 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags. */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 32b5d0e..f76a250 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
   0,	/* max_case_values. */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 2530217..50eb058 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
   0,	/* max_case_values. */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index dbfebc5..9fbc059 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index 5ca0e13..78177e7 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,	/* ldp_policy_model. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 8719297..f1ec7dc 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index 219016f..4fabe4d 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
    | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags. */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index 092de7b..ad3cd22 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index f395799..a0adef0 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model. */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags. */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
index 762805f..c334b7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
@@ -15,4 +15,4 @@ so we vectorize the offset calculation.
    This means that the 64-bit version needs two copies. */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
index f0ea58e..94cc630 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
@@ -15,4 +15,4 @@ so we vectorize the offset calculation.
    This means that the 64-bit version needs two copies. */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2765dcd..c0e38d0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
	    {
	      if (costing_p)
		{
-		  /* Only need vector extracting when there are more
-		     than one stores.  */
-		  if (nstores > 1)
-		    inside_cost
-		      += record_stmt_cost (cost_vec, 1, vec_to_scalar,
-					   stmt_info, slp_node,
-					   0, vect_body);
-		  /* Take a single lane vector type store as scalar
-		     store to avoid ICE like 110776.  */
-		  if (VECTOR_TYPE_P (ltype)
-		      && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-		    n_adjacent_stores++;
-		  else
-		    inside_cost
-		      += record_stmt_cost (cost_vec, 1, scalar_store,
-					   stmt_info, 0, vect_body);
+		  n_adjacent_stores++;
		  continue;
		}
	      tree newref, newoff;
@@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p)
     {
       if (n_adjacent_stores > 0)
-	vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
-			     alignment_support_scheme, misalignment,
-			     &inside_cost, cost_vec);
+	{
+	  /* Take a single lane vector type store as scalar
+	     store to avoid ICE like 110776.  */
+	  if (VECTOR_TYPE_P (ltype)
+	      && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+	    vect_get_store_cost (vinfo, stmt_info, slp_node,
+				 n_adjacent_stores, alignment_support_scheme,
+				 misalignment, &inside_cost, cost_vec);
+	  else
+	    inside_cost
+	      += record_stmt_cost (cost_vec, n_adjacent_stores,
+				   scalar_store, stmt_info, 0, vect_body);
+	  /* Only need vector extracting when there are more
+	     than one stores.  */
+	  if (nstores > 1)
+	    inside_cost
+	      += record_stmt_cost (cost_vec, n_adjacent_stores,
+				   vec_to_scalar, stmt_info, slp_node,
+				   0, vect_body);
+	}
       if (dump_enabled_p ())
	 dump_printf_loc (MSG_NOTE, vect_location,
			  "vect_model_store_cost: inside_cost = %d, "