diff options
author | Tamar Christina <tamar.christina@arm.com> | 2018-07-05 10:31:04 +0000 |
---|---|---|
committer | Tamar Christina <tnfchris@gcc.gnu.org> | 2018-07-05 10:31:04 +0000 |
commit | 89c52e5e2ca6e63b1dc0868893d05434d61a0c33 (patch) | |
tree | 6d840f071e72215c17fb63692d89956f1bba850b /gcc/config | |
parent | bdfc619ed80b29b35aff74731f84915e033a5e84 (diff) | |
download | gcc-89c52e5e2ca6e63b1dc0868893d05434d61a0c33.zip gcc-89c52e5e2ca6e63b1dc0868893d05434d61a0c33.tar.gz gcc-89c52e5e2ca6e63b1dc0868893d05434d61a0c33.tar.bz2 |
Simplify movmem code by always doing overlapping copies when larger than 8 bytes on AArch64.
This changes the movmem code in AArch64 that copies data of between 4 and 7
bytes so that it uses the smallest possible mode capable of copying the remaining bytes in one
go, overlapping the reads if needed.
This means that if we're copying 5 bytes we would issue an SImode and QImode
load instead of two SImode loads.
This performs smaller memory accesses but also gives the mid-end a chance to realise
that it can CSE the loads in certain circumstances, e.g. when you have something
like
return foo;
where foo is a struct. This would be transformed by the mid-end into SSA form as
D.XXXX = foo;
return D.XXXX;
This movmem routine will handle the first copy, but that copy is usually not needed:
the mid-end would do SImode and QImode stores into X0 for the 5-byte example,
but without the first copies being in the same mode, it doesn't know that it doesn't
need the stores at all.
From-SVN: r262434
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 112 |
1 files changed, 35 insertions, 77 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 143f9d0..01f35f8 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -16137,26 +16137,29 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, bool aarch64_expand_movmem (rtx *operands) { - unsigned int n; + int n, mode_bits; rtx dst = operands[0]; rtx src = operands[1]; rtx base; + machine_mode cur_mode = BLKmode, next_mode; bool speed_p = !optimize_function_for_size_p (cfun); /* When optimizing for size, give a better estimate of the length of a - memcpy call, but use the default otherwise. */ - unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2; + memcpy call, but use the default otherwise. Moves larger than 8 bytes + will always require an even number of instructions to do now. And each + operation requires both a load+store, so devide the max number by 2. */ + int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2; /* We can't do anything smart if the amount to copy is not constant. */ if (!CONST_INT_P (operands[2])) return false; - n = UINTVAL (operands[2]); + n = INTVAL (operands[2]); - /* Try to keep the number of instructions low. For cases below 16 bytes we - need to make at most two moves. For cases above 16 bytes it will be one - move for each 16 byte chunk, then at most two additional moves. */ - if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions) + /* Try to keep the number of instructions low. For all cases we will do at + most two moves for the residual amount, since we'll always overlap the + remainder. */ + if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves) return false; base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); @@ -16165,81 +16168,36 @@ aarch64_expand_movmem (rtx *operands) base = copy_to_mode_reg (Pmode, XEXP (src, 0)); src = adjust_automodify_address (src, VOIDmode, base, 0); - /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a - 1-byte chunk. 
*/ - if (n < 4) - { - if (n >= 2) - { - aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); - n -= 2; - } - - if (n == 1) - aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); - - return true; - } + /* Convert n to bits to make the rest of the code simpler. */ + n = n * BITS_PER_UNIT; - /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second - 4-byte chunk, partially overlapping with the previously copied chunk. */ - if (n < 8) + while (n > 0) { - aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); - n -= 4; - if (n > 0) - { - int move = n - 4; + /* Find the largest mode in which to do the copy in without over reading + or writing. */ + opt_scalar_int_mode mode_iter; + FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) + if (GET_MODE_BITSIZE (mode_iter.require ()) <= n) + cur_mode = mode_iter.require (); - src = aarch64_move_pointer (src, move); - dst = aarch64_move_pointer (dst, move); - aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); - } - return true; - } + gcc_assert (cur_mode != BLKmode); - /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of - them, then (if applicable) an 8-byte chunk. */ - while (n >= 8) - { - if (n / 16) - { - aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode); - n -= 16; - } - else - { - aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); - n -= 8; - } - } + mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); + aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); - /* Finish the final bytes of the copy. We can always do this in one - instruction. We either copy the exact amount we need, or partially - overlap with the previous chunk we copied and copy 8-bytes. 
*/ - if (n == 0) - return true; - else if (n == 1) - aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); - else if (n == 2) - aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); - else if (n == 4) - aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); - else - { - if (n == 3) - { - src = aarch64_move_pointer (src, -1); - dst = aarch64_move_pointer (dst, -1); - aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); - } - else - { - int move = n - 8; + n -= mode_bits; - src = aarch64_move_pointer (src, move); - dst = aarch64_move_pointer (dst, move); - aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); + /* Do certain trailing copies as overlapping if it's going to be + cheaper. i.e. less instructions to do so. For instance doing a 15 + byte copy it's more efficient to do two overlapping 8 byte copies than + 8 + 6 + 1. */ + next_mode = smallest_mode_for_size (n, MODE_INT); + int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); + if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT) + { + src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT); + dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); + n = n_bits; } } |