author:    Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2021-02-22 21:24:41 +0000
committer: Kyrylo Tkachov <kyrylo.tkachov@arm.com>  2021-02-22 21:24:41 +0000
commit:    a65b9ad863c5fc0aea12db58557f4d286a1974d7
tree:      388caa27fcb281d2a4b2c2f4de839b5eb322c713 /gcc
parent:    692ba083d9a22aaa08c8a3700d0237db8c922dc4
aarch64: Add internal tune flag to minimise VL-based scalar ops
This patch introduces an internal tune flag to break up VL-based scalar ops
(such as incw) into a GP-reg scalar op, with the read of the vector length
kept separate. This can be preferable on some CPUs.

I went for a tune param rather than extending the rtx costs because our RTX
cost tables aren't set up to track this intricacy.
I've confirmed that on the simple loop:
void vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}
we now split the incw into a cntw outside the loop and a plain add inside it:
+ cntw x5
...
loop:
- incw x4
+ add x4, x4, x5
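For readers less familiar with SVE, the split can also be written by hand at
the source level with the ACLE intrinsics from arm_sve.h. The sketch below is
illustrative only and is not code from the patch: the function name
vadd_intrinsics is made up, it assumes count is non-negative, and it must be
compiled with SVE enabled (e.g. -march=armv8.2-a+sve). svcntw () is the VL
read that cntw performs; keeping its result in a variable outside the loop
mirrors the hoisting the tune flag enables.

#include <arm_sve.h>
#include <stdint.h>

/* Hand-written analogue of the incw -> cntw + add split.  */
void
vadd_intrinsics (int *dst, int *op1, int *op2, int count)
{
  /* The VL read (the cntw) happens once, outside the loop.  */
  uint64_t step = svcntw ();
  /* i += step is a plain GP-register add, replacing the incw.  */
  for (uint64_t i = 0; i < (uint64_t) count; i += step)
    {
      /* Predicate off any lanes past count in the final iteration.  */
      svbool_t pg = svwhilelt_b32_u64 (i, (uint64_t) count);
      svint32_t a = svld1_s32 (pg, op1 + i);
      svint32_t b = svld1_s32 (pg, op2 + i);
      svst1_s32 (pg, dst + i, svadd_s32_m (pg, a, b));
    }
}

With the flag enabled the compiler performs the same hoisting on its own
vectorised code, so the loop body carries a cheap add instead of an incw.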
gcc/ChangeLog:
* config/aarch64/aarch64-tuning-flags.def (cse_sve_vl_constants):
Define.
* config/aarch64/aarch64.md (add<mode>3): Force CONST_POLY_INT immediates
into a register when the above tuning flag is enabled.
* config/aarch64/aarch64.c (neoversev1_tunings): Use
AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.
(aarch64_rtx_costs): Use AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/cse_sve_vl_constants_1.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64-tuning-flags.def                   |  2
-rw-r--r--  gcc/config/aarch64/aarch64.c                                  | 16
-rw-r--r--  gcc/config/aarch64/aarch64.md                                 |  8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c | 12
4 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index aae9952..588edf4 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,6 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
+AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6fda6bc..6997669 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1492,7 +1492,7 @@ static const struct tune_params neoversev1_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -12589,8 +12589,18 @@ cost_plus:
 	  *cost += rtx_cost (op0, mode, PLUS, 0, speed);
 
 	  if (speed)
-	    /* ADD (immediate).  */
-	    *cost += extra_cost->alu.arith;
+	    {
+	      /* ADD (immediate).  */
+	      *cost += extra_cost->alu.arith;
+
+	      /* Some tunings prefer to not use the VL-based scalar ops.
+		 Increase the cost of the poly immediate to prevent their
+		 formation.  */
+	      if (GET_CODE (op1) == CONST_POLY_INT
+		  && (aarch64_tune_params.extra_tuning_flags
+		      & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+		*cost += COSTS_N_INSNS (1);
+	    }
 	  return true;
 	}
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a482419..65d00c4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1933,6 +1933,14 @@
       && (!REG_P (op1)
	  || !REGNO_PTR_FRAME_P (REGNO (op1))))
     operands[2] = force_reg (<MODE>mode, operands[2]);
+  /* Some tunings prefer to avoid VL-based operations.
+     Split off the poly immediate here.  The rtx costs hook will reject attempts
+     to combine them back.  */
+  else if (GET_CODE (operands[2]) == CONST_POLY_INT
+	   && can_create_pseudo_p ()
+	   && (aarch64_tune_params.extra_tuning_flags
+	       & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
   /* Expand polynomial additions now if the destination is the stack
      pointer, since we don't want to use that as a temporary.  */
   else if (operands[0] == stack_pointer_rtx
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
new file mode 100644
index 0000000..dd04b66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -moverride=tune=cse_sve_vl_constants" } */
+
+void __attribute__((noinline, noclone))
+vadd (int *dst, int *op1, int *op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not {\tincw\tx[0-9]+} } } */
+
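As a companion sanity check (not part of the patch), one could also verify
that the default tuning keeps the VL-based form. The sketch below is an
assumption-laden illustration, not a test from the commit: it assumes the
sve testsuite harness supplies SVE target options and that the vectoriser
chooses an incw-style induction update for this loop.

/* { dg-do compile } */
/* { dg-options "-O3" } */

void __attribute__((noinline, noclone))
vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

/* Without the tuning override, the VL-based increment should remain;
   whether incw actually appears depends on target options and
   vectoriser decisions, so treat this as illustrative only.  */
/* { dg-final { scan-assembler {\tincw\tx[0-9]+} } } */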