author     Sudakshina Das <sudi.das@arm.com>	2020-08-04 12:01:21 +0100
committer  Sudakshina Das <sudi.das@arm.com>	2020-08-04 12:01:53 +0100
commit     7cda9e0878da44dcaf025d3d146534dfaf0b9986 (patch)
tree       55d3496d7ffbd9f70eef4063b9fafbdd6c980423 /gcc
parent     d2b86e14c14020f3e119ab8f462e2a91bd7d46e5 (diff)
aarch64: Use Q-reg loads/stores in movmem expansion
This is my attempt at reviving the old patch
https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html

I have followed Kyrill's comment upstream on the link above and I am
using the recommended option iii that he mentioned:
"1) Adjust the copy_limit to 256 bits after checking
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
2) Adjust aarch64_copy_one_block_and_progress_pointers to handle
256-bit moves, by iii:
iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs of
ldp/stps.  This wouldn't need any adjustments to MD patterns, but
would make aarch64_copy_one_block_and_progress_pointers more complex
as it would now have two paths, where one handles two adjacent memory
addresses in one call."

gcc/ChangeLog:

	* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case
	for E_V4SImode.
	(aarch64_gen_load_pair): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Handle 256-bit
	copies.
	(aarch64_expand_cpymem): Expand copy_limit to 256 bits where
	appropriate.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/cpymem-q-reg_1.c: New test.
	* gcc.target/aarch64/large_struct_copy_2.c: Update to check for
	ldp/stp of Q registers.
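As a rough illustration of the intended expansion (not part of the commit;
register numbers are assumptions), a 32-byte block copy like the sketch below
should now be emitted as a single Q-register load/store pair at -O2 on targets
where AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS is not set and SIMD is available:

/* Sketch only: a 32-byte (256-bit) copy that the raised copy_limit
   lets aarch64_expand_cpymem emit as one ldp/stp of Q registers.  */
#include <string.h>

struct block { unsigned long long w[4]; };  /* 4 x 8 bytes = 32 bytes.  */

void
copy_block (struct block *dst, const struct block *src)
{
  /* Expected expansion (illustrative register numbers):
       ldp q0, q1, [x1]
       stp q0, q1, [x0]
     Previously this would have used two X-register ldp/stp pairs.  */
  memcpy (dst, src, sizeof *dst);
}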
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64.c                            | 36
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c       | 16
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c  |  7
3 files changed, 52 insertions(+), 7 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7c3ab3e..7b7faba 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6957,6 +6957,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
case E_TFmode:
return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+ case E_V4SImode:
+ return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
@@ -6980,6 +6983,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
case E_TFmode:
return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+ case E_V4SImode:
+ return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
default:
gcc_unreachable ();
}
@@ -21144,6 +21150,27 @@ static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
machine_mode mode)
{
+ /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
+ address copies using V4SImode so that we can use Q registers. */
+ if (known_eq (GET_MODE_BITSIZE (mode), 256))
+ {
+ mode = V4SImode;
+ rtx reg1 = gen_reg_rtx (mode);
+ rtx reg2 = gen_reg_rtx (mode);
+ /* "Cast" the pointers to the correct mode. */
+ *src = adjust_address (*src, mode, 0);
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memcpy. */
+ emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+ aarch64_progress_pointer (*src)));
+ emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+ aarch64_progress_pointer (*dst), reg2));
+ /* Move the pointers forward. */
+ *src = aarch64_move_pointer (*src, 32);
+ *dst = aarch64_move_pointer (*dst, 32);
+ return;
+ }
+
rtx reg = gen_reg_rtx (mode);
/* "Cast" the pointers to the correct mode. */
@@ -21197,9 +21224,12 @@ aarch64_expand_cpymem (rtx *operands)
/* Convert n to bits to make the rest of the code simpler. */
n = n * BITS_PER_UNIT;
- /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
- larger than TImode, but we should not use them for loads/stores here. */
- const int copy_limit = GET_MODE_BITSIZE (TImode);
+ /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
+ AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD. */
+ const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+ || !TARGET_SIMD)
+ ? GET_MODE_BITSIZE (TImode) : 256;
while (n > 0)
{
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000..df5f67e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 5654342..8ee0a9f 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
struct S0 f3;
};
-void fn1 () {
- struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
a = b.f3;
}
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-not {ld[1-3]} } } */