aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
authorWilco Dijkstra <wilco.dijkstra@arm.com>2020-11-19 15:57:52 +0000
committerWilco Dijkstra <wdijkstr@arm.com>2020-11-19 16:05:33 +0000
commit1d77928fc49b4f2487fd78db26bbebd00f881414 (patch)
tree1121ac93d078a72f50bbb46975f8a6815fef1eb5 /gcc/config
parent2729378d0905a04e476a8bdcaaf0288f417810ec (diff)
downloadgcc-1d77928fc49b4f2487fd78db26bbebd00f881414.zip
gcc-1d77928fc49b4f2487fd78db26bbebd00f881414.tar.gz
gcc-1d77928fc49b4f2487fd78db26bbebd00f881414.tar.bz2
AArch64: Improve inline memcpy expansion
Improve the inline memcpy expansion. Use integer load/store for copies <= 24 bytes instead of SIMD. Set the maximum copy to expand to 256 by default, except that -Os or no Neon expands up to 128 bytes. When using LDP/STP of Q-registers, also use Q-register accesses for the unaligned tail, saving 2 instructions (eg. all sizes up to 48 bytes emit exactly 4 instructions). Cleanup code and comments. The codesize gain vs the GCC10 expansion is 0.05% on SPECINT2017. 2020-11-03 Wilco Dijkstra <wdijkstr@arm.com> gcc/ * config/aarch64/aarch64.c (aarch64_expand_cpymem): Cleanup code and comments, tweak expansion decisions and improve tail expansion.
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/aarch64/aarch64.c73
1 files changed, 37 insertions, 36 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6de51b5..1cf4900 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -21200,35 +21200,39 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
bool
aarch64_expand_cpymem (rtx *operands)
{
- /* These need to be signed as we need to perform arithmetic on n as
- signed operations. */
- int n, mode_bits;
+ int mode_bits;
rtx dst = operands[0];
rtx src = operands[1];
rtx base;
- machine_mode cur_mode = BLKmode, next_mode;
- bool speed_p = !optimize_function_for_size_p (cfun);
+ machine_mode cur_mode = BLKmode;
- /* When optimizing for size, give a better estimate of the length of a
- memcpy call, but use the default otherwise. Moves larger than 8 bytes
- will always require an even number of instructions to do now. And each
- operation requires both a load+store, so divide the max number by 2. */
- unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
-
- /* We can't do anything smart if the amount to copy is not constant. */
+ /* Only expand fixed-size copies. */
if (!CONST_INT_P (operands[2]))
return false;
- unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
+ unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
- /* Try to keep the number of instructions low. For all cases we will do at
- most two moves for the residual amount, since we'll always overlap the
- remainder. */
- if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
- return false;
+ /* Inline up to 256 bytes when optimizing for speed. */
+ unsigned HOST_WIDE_INT max_copy_size = 256;
- /* At this point tmp is known to have to fit inside an int. */
- n = tmp;
+ if (optimize_function_for_size_p (cfun))
+ max_copy_size = 128;
+
+ int copy_bits = 256;
+
+ /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
+ support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
+ if (size <= 24
+ || !TARGET_SIMD
+ || (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ {
+ copy_bits = 128;
+ max_copy_size = max_copy_size / 2;
+ }
+
+ if (size > max_copy_size)
+ return false;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -21236,15 +21240,8 @@ aarch64_expand_cpymem (rtx *operands)
base = copy_to_mode_reg (Pmode, XEXP (src, 0));
src = adjust_automodify_address (src, VOIDmode, base, 0);
- /* Convert n to bits to make the rest of the code simpler. */
- n = n * BITS_PER_UNIT;
-
- /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
- AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */
- const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
- || !TARGET_SIMD)
- ? GET_MODE_BITSIZE (TImode) : 256;
+ /* Convert size to bits to make the rest of the code simpler. */
+ int n = size * BITS_PER_UNIT;
while (n > 0)
{
@@ -21252,24 +21249,28 @@ aarch64_expand_cpymem (rtx *operands)
or writing. */
opt_scalar_int_mode mode_iter;
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
- if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+ if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
cur_mode = mode_iter.require ();
gcc_assert (cur_mode != BLKmode);
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+
+ /* Prefer Q-register accesses for the last bytes. */
+ if (mode_bits == 128 && copy_bits == 256)
+ cur_mode = V4SImode;
+
aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
n -= mode_bits;
- /* Do certain trailing copies as overlapping if it's going to be
- cheaper. i.e. less instructions to do so. For instance doing a 15
- byte copy it's more efficient to do two overlapping 8 byte copies than
- 8 + 6 + 1. */
- if (n > 0 && n <= 8 * BITS_PER_UNIT)
+ /* Emit trailing copies using overlapping unaligned accesses - this is
+ smaller and faster. */
+ if (n > 0 && n < copy_bits / 2)
{
- next_mode = smallest_mode_for_size (n, MODE_INT);
+ machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+ gcc_assert (n_bits <= mode_bits);
src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
n = n_bits;