author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-02-22 21:24:41 +0000
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-02-22 21:24:41 +0000
commit     a65b9ad863c5fc0aea12db58557f4d286a1974d7 (patch)
tree       388caa27fcb281d2a4b2c2f4de839b5eb322c713 /gcc
parent     692ba083d9a22aaa08c8a3700d0237db8c922dc4 (diff)
aarch64: Add internal tune flag to minimise VL-based scalar ops
This patch introduces an internal tune flag to break up VL-based scalar ops
into a GP-reg scalar op with the VL read kept separate.  This can be
preferable on some CPUs.

I went for a tune param rather than extending the RTX cost tables, as they
aren't set up to track this intricacy.

I've confirmed that on the simple loop:

void vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

we now split the incw into a cntw outside the loop and the add inside:

+       cntw    x5
        ...
loop:
-       incw    x4
+       add     x4, x4, x5

gcc/ChangeLog:

	* config/aarch64/aarch64-tuning-flags.def (cse_sve_vl_constants):
	Define.
	* config/aarch64/aarch64.md (add<mode>3): Force CONST_POLY_INT
	immediates into a register when the above tuning is enabled.
	* config/aarch64/aarch64.c (neoversev1_tunings): Use
	AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.
	(aarch64_rtx_costs): Use AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/cse_sve_vl_constants_1.c: New test.
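For reference, the behaviour can be reproduced outside the testsuite with the
same -moverride mechanism the new test uses.  A minimal sketch; the
cross-compiler name and -march string are assumptions, any SVE-enabled
aarch64 target should do:

/* vadd.c -- the loop from the commit message.  Compile with, e.g.:
     aarch64-linux-gnu-gcc -O3 -march=armv8.2-a+sve \
         -moverride=tune=cse_sve_vl_constants -S vadd.c
   then check that the loop increment uses cntw + add rather than incw.  */
void
vadd (int *dst, int *op1, int *op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}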
Diffstat (limited to 'gcc')
-rw-r--r--   gcc/config/aarch64/aarch64-tuning-flags.def                     2
-rw-r--r--   gcc/config/aarch64/aarch64.c                                   16
-rw-r--r--   gcc/config/aarch64/aarch64.md                                   8
-rw-r--r--   gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c  12
4 files changed, 35 insertions, 3 deletions
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index aae9952..588edf4 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,6 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
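The .def file relies on GCC's usual X-macro pattern: it is included more than
once with different definitions of AARCH64_EXTRA_TUNING_OPTION, generating
both an index enum and the AARCH64_EXTRA_TUNE_* bitmasks.  A self-contained
sketch of the pattern (simplified names, not the exact GCC definitions):

#include <stdio.h>

/* Stand-in for aarch64-tuning-flags.def: one entry per tuning flag.  */
#define TUNING_OPTIONS(X)                          \
  X ("rename_load_regs", RENAME_LOAD_REGS)         \
  X ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)

/* First expansion: an enum assigning each flag an index.  */
enum tune_flag_index
{
#define DEF_INDEX(STR, NAME) TUNE_##NAME##_INDEX,
  TUNING_OPTIONS (DEF_INDEX)
#undef DEF_INDEX
  TUNE_INDEX_END
};

/* Second expansion: one bit per flag, derived from its index.  */
#define DEF_MASK(STR, NAME) \
  static const unsigned TUNE_##NAME = 1u << TUNE_##NAME##_INDEX;
TUNING_OPTIONS (DEF_MASK)
#undef DEF_MASK

int
main (void)
{
  unsigned extra_tuning_flags = TUNE_CSE_SVE_VL_CONSTANTS;
  if (extra_tuning_flags & TUNE_CSE_SVE_VL_CONSTANTS)
    printf ("cse_sve_vl_constants enabled\n");
  return 0;
}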
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6fda6bc..6997669 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1492,7 +1492,7 @@ static const struct tune_params neoversev1_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
&generic_prefetch_tune
};
@@ -12589,8 +12589,18 @@ cost_plus:
*cost += rtx_cost (op0, mode, PLUS, 0, speed);
if (speed)
- /* ADD (immediate). */
- *cost += extra_cost->alu.arith;
+ {
+ /* ADD (immediate). */
+ *cost += extra_cost->alu.arith;
+
+ /* Some tunings prefer to not use the VL-based scalar ops.
+ Increase the cost of the poly immediate to prevent their
+ formation. */
+ if (GET_CODE (op1) == CONST_POLY_INT
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+ *cost += COSTS_N_INSNS (1);
+ }
return true;
}
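The hook change matters when combine (or another RTL pass) asks for the cost
of fusing the separate VL read back into the add: the poly immediate is then
reported as one instruction more expensive, so the fused form never looks
profitable.  A toy model of that decision, using GCC's COSTS_N_INSNS scaling
but otherwise simplified (not the real rtx_cost API):

#include <stdbool.h>
#include <stdio.h>

#define COSTS_N_INSNS(n) ((n) * 4)

/* Simplified stand-in for the cost_plus logic in the patch.  */
static int
add_cost (bool op1_is_const_poly_int, bool cse_sve_vl_constants)
{
  int cost = COSTS_N_INSNS (1);		/* ADD (immediate).  */
  /* Penalise VL-based poly immediates when the tuning asks for it,
     so the cntw stays in its own instruction.  */
  if (op1_is_const_poly_int && cse_sve_vl_constants)
    cost += COSTS_N_INSNS (1);
  return cost;
}

int
main (void)
{
  printf ("default: %d, with tune flag: %d\n",
	  add_cost (true, false), add_cost (true, true));
  return 0;
}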
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a482419..65d00c4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1933,6 +1933,14 @@
&& (!REG_P (op1)
|| !REGNO_PTR_FRAME_P (REGNO (op1))))
operands[2] = force_reg (<MODE>mode, operands[2]);
+ /* Some tunings prefer to avoid VL-based operations.
+ Split off the poly immediate here. The rtx costs hook will reject attempts
+ to combine them back. */
+ else if (GET_CODE (operands[2]) == CONST_POLY_INT
+ && can_create_pseudo_p ()
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
+ operands[2] = force_reg (<MODE>mode, operands[2]);
/* Expand polynomial additions now if the destination is the stack
pointer, since we don't want to use that as a temporary. */
else if (operands[0] == stack_pointer_rtx
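At expand time the poly immediate is simply forced into a fresh pseudo, so
the VL read becomes its own loop-invariant instruction that loop-invariant
motion can hoist, exactly as in the assembly diff in the commit message.  A
hand-written C analogue of what the split enables (read_vl is a hypothetical
stand-in for cntw, not a real intrinsic):

extern long read_vl (void);	/* hypothetical: reads the SVE vector length  */

long
count_elements (long x, int iters)
{
  /* After the split, the VL read is one instruction outside the loop...  */
  long vl = read_vl ();
  for (int i = 0; i < iters; ++i)
    x += vl;			/* ...and the loop body keeps a plain add.  */
  return x;
}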
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
new file mode 100644
index 0000000..dd04b66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cse_sve_vl_constants_1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -moverride=tune=cse_sve_vl_constants" } */
+
+void __attribute__((noinline, noclone))
+vadd (int *dst, int *op1, int *op2, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not {\tincw\tx[0-9]+} } } */
+
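Assuming the usual GCC testsuite workflow, the new test should be runnable on
its own through the SVE DejaGnu driver, e.g.
make check-gcc RUNTESTFLAGS="aarch64-sve.exp=cse_sve_vl_constants_1.c"
from the build directory; the scan-assembler-not directive then fails if any
incw survives in the generated loop.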