author    Jennifer Schmitz <jschmitz@nvidia.com>    2024-11-26 00:43:48 -0800
committer Jennifer Schmitz <jschmitz@nvidia.com>    2025-01-07 09:02:29 +0100
commit    70035b6c13852435d7ae396c0762ee26897d4d45
tree      7215b77c12b388ff326e3d5d7cbd4608409a7b21
parent    e53277d849a13a8a36f488f6725700311c74080e
AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
the use_new_vector_costs entry in aarch64-tuning-flags.def, and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the
default. To that end, the function aarch64_use_new_vector_costs_p and its uses
are removed. To prevent vec_to_scalar operations from being costed as 0, as
described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
vectorizable_store is adjusted such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way vec_to_scalar operations
are not costed individually, but as a group.
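The shape of that vectorizable_store change can be pictured with the
following standalone sketch (the types and the record_cost helper are
hypothetical stand-ins, not the actual record_stmt_cost machinery in
tree-vect-stmts.cc): elements needing an extraction are tallied into
n_adjacent_stores inside the emission loop, and a single grouped
vec_to_scalar cost record is emitted after the loop instead of one record
per element.

#include <cstdio>

// Hypothetical stand-ins for the vectorizer's cost bookkeeping.  The real
// code in gcc/tree-vect-stmts.cc records costs via record_stmt_cost () on a
// stmt_vector_for_cost; this model only mirrors the grouping idea.
enum cost_kind { scalar_store, vec_to_scalar };

static unsigned
record_cost (unsigned count, cost_kind kind)
{
  // Charge one unit per operation regardless of kind (simplification).
  (void) kind;
  return count;
}

int
main ()
{
  const unsigned ncopies = 2;     // vector copies emitted for the store group
  const unsigned group_size = 4;  // scalar elements extracted per copy

  unsigned inside_cost = 0;
  unsigned n_adjacent_stores = 0;

  for (unsigned copy = 0; copy < ncopies; copy++)
    for (unsigned elt = 0; elt < group_size; elt++)
      // Old shape: one vec_to_scalar cost record per element here.
      // New shape: only count the element; the extraction is costed as part
      // of the group below, next to the matching scalar stores.
      n_adjacent_stores++;

  inside_cost += record_cost (n_adjacent_stores, scalar_store);
  inside_cost += record_cost (n_adjacent_stores, vec_to_scalar);

  std::printf ("inside_cost = %u\n", inside_cost);
  return 0;
}

The real change keeps the exact accounting of tree-vect-stmts.cc; the sketch
only shows the move from per-element to grouped cost records.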
As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits==1+1X as a vector
rather than a scalar.
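For readers unfamiliar with GCC's poly_int predicates, here is a small
standalone model of why the predicate choice matters for nunits == 1+1X
(the poly_len struct and helpers below are made up for illustration and are
not GCC's poly-int.h):

#include <cstdio>

// Standalone model of a length of the form c0 + c1 * X, where X is an
// unknown nonnegative runtime quantity.
struct poly_len
{
  unsigned c0, c1;
};

// Can "p == b" hold for some nonnegative X?
static bool
can_equal (poly_len p, unsigned b)
{
  if (p.c1 == 0)
    return p.c0 == b;
  return b >= p.c0 && (b - p.c0) % p.c1 == 0;
}

// known_ne: p differs from b for every possible X.
static bool
known_ne (poly_len p, unsigned b)
{
  return !can_equal (p, b);
}

// maybe_ne: p differs from b for at least one possible X.
static bool
maybe_ne (poly_len p, unsigned b)
{
  return p.c1 != 0 || p.c0 != b;
}

int
main ()
{
  poly_len nunits = {1, 1};  // the 1 + 1X case from the commit message
  // known_ne (1 + 1X, 1) is false: at X == 0 the value is exactly 1.
  // maybe_ne (1 + 1X, 1) is true: for X > 0 the value exceeds 1, so the
  // type is now treated as a multi-lane vector rather than a scalar.
  std::printf ("known_ne: %d, maybe_ne: %d\n",
               known_ne (nunits, 1), maybe_ne (nunits, 1));
  return 0;
}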
Two tests were adjusted due to changes in codegen. In both cases, the
old code unrolled the loop once, whereas the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
cbz w3, .L92
mov x4, 0
uxtw x3, w3
+ cntd x5
+ whilelo p7.d, xzr, x3
+ mov z29.s, w5
mov z31.s, w2
- whilelo p6.d, xzr, x3
- mov x2, x3
- index z30.s, #0, #1
- uqdecd x2
- ptrue p5.b, all
- whilelo p7.d, xzr, x2
+ index z30.d, #0, #1
+ ptrue p6.b, all
.p2align 3,,7
.L94:
- ld1d z27.d, p7/z, [x0, #1, mul vl]
- ld1d z28.d, p6/z, [x0]
- movprfx z29, z31
- mul z29.s, p5/m, z29.s, z30.s
- incw x4
- uunpklo z0.d, z29.s
- uunpkhi z29.d, z29.s
- ld1d z25.d, p6/z, [x1, z0.d, lsl 3]
- ld1d z26.d, p7/z, [x1, z29.d, lsl 3]
- add z25.d, z28.d, z25.d
+ ld1d z27.d, p7/z, [x0, x4, lsl 3]
+ movprfx z28, z31
+ mul z28.s, p6/m, z28.s, z30.s
+ ld1d z26.d, p7/z, [x1, z28.d, uxtw 3]
add z26.d, z27.d, z26.d
- st1d z26.d, p7, [x0, #1, mul vl]
- whilelo p7.d, x4, x2
- st1d z25.d, p6, [x0]
- incw z30.s
- incb x0, all, mul #2
- whilelo p6.d, x4, x3
+ st1d z26.d, p7, [x0, x4, lsl 3]
+ add z30.s, z30.s, z29.s
+ incd x4
+ whilelo p7.d, x4, x3
b.any .L94
.L92:
ret
Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
cbz w3, .L84
- addvl x5, x1, #1
mov x4, 0
uxtw x3, w3
- mov z31.s, w2
+ cntd x5
whilelo p7.d, xzr, x3
- mov x2, x3
- index z30.s, #0, #1
- uqdecd x2
- ptrue p5.b, all
- whilelo p6.d, xzr, x2
+ mov z29.s, w5
+ mov z31.s, w2
+ index z30.d, #0, #1
+ ptrue p6.b, all
.p2align 3,,7
.L86:
- ld1d z28.d, p7/z, [x1, x4, lsl 3]
- ld1d z27.d, p6/z, [x5, x4, lsl 3]
- movprfx z29, z30
- mul z29.s, p5/m, z29.s, z31.s
- add z28.d, z28.d, #1
- uunpklo z26.d, z29.s
- st1d z28.d, p7, [x0, z26.d, lsl 3]
- incw x4
- uunpkhi z29.d, z29.s
+ ld1d z27.d, p7/z, [x1, x4, lsl 3]
+ movprfx z28, z30
+ mul z28.s, p6/m, z28.s, z31.s
add z27.d, z27.d, #1
- whilelo p6.d, x4, x2
- st1d z27.d, p7, [x0, z29.d, lsl 3]
- incw z30.s
+ st1d z27.d, p7, [x0, z28.d, uxtw 3]
+ incd x4
+ add z30.s, z30.s, z29.s
whilelo p7.d, x4, x3
b.any .L86
.L84:
ret
The patch was bootstrapped and tested on aarch64-linux-gnu with no
regression.
OK for mainline?
Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
gcc/
* tree-vect-stmts.cc (vectorizable_store): Extend the use of
n_adjacent_stores to also cover vec_to_scalar operations.
* config/aarch64/aarch64-tuning-flags.def: Remove
use_new_vector_costs as tuning option.
* config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
Remove.
(aarch64_vector_costs::add_stmt_cost): Remove use of
aarch64_use_new_vector_costs_p.
(aarch64_vector_costs::finish_cost): Remove use of
aarch64_use_new_vector_costs_p.
* config/aarch64/tuning_models/cortexx925.h: Remove
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
* config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
* config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
* config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
* config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
* config/aarch64/tuning_models/neoversen2.h: Likewise.
* config/aarch64/tuning_models/neoversen3.h: Likewise.
* config/aarch64/tuning_models/neoversev1.h: Likewise.
* config/aarch64/tuning_models/neoversev2.h: Likewise.
* config/aarch64/tuning_models/neoversev3.h: Likewise.
* config/aarch64/tuning_models/neoversev3ae.h: Likewise.
gcc/testsuite/
* gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
* gcc.target/aarch64/sve/strided_store_2.c: Likewise.