author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-09-29 11:21:45 +0100
committer  Kyrylo Tkachov <kyrylo.tkachov@arm.com>    2021-09-29 11:21:45 +0100
commit     a459ee44c0a74b0df0485ed7a56683816c02aae9 (patch)
tree       28120613b6c3f40a1f797cab6d68499117140a49 /gcc
parent     8f95e3c04d659d541ca4937b3df2f1175a1c5f05 (diff)
aarch64: Improve size heuristic for cpymem expansion
Similar to my previous patch for setmem, this one does the same for the cpymem
expansion. We count the number of ops emitted and compare it against the
alternative of just calling the library function when optimising for size.
For the code:
void
cpy_127 (char *out, char *in)
{
  __builtin_memcpy (out, in, 127);
}

void
cpy_128 (char *out, char *in)
{
  __builtin_memcpy (out, in, 128);
}
we now emit a call to memcpy (with an extra MOV-immediate instruction for the size) instead of:
cpy_127(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldr     q0, [x1, 96]
        str     q0, [x0, 96]
        ldr     q0, [x1, 111]
        str     q0, [x0, 111]
        ret

cpy_128(char*, char*):
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ldp     q0, q1, [x1, 32]
        stp     q0, q1, [x0, 32]
        ldp     q0, q1, [x1, 64]
        stp     q0, q1, [x0, 64]
        ldp     q0, q1, [x1, 96]
        stp     q0, q1, [x0, 96]
        ret
which is a clear code size win. Speed optimisation heuristics remain unchanged.
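To make the cost comparison concrete, here is a minimal standalone sketch of the op counting for the 256-bit LDP/STP case (count_copy_ops is a hypothetical name and the block-size logic is simplified; the patch below counts the ops while building the actual RTL sequence, and copies smaller than 16 bytes, which use smaller scalar modes, are not modelled here):

/* Hypothetical sketch of the -Os op count, not the actual GCC code.
   Mirrors the patch's "1 load + 1 store per block" accounting.  */
static unsigned
count_copy_ops (unsigned size)
{
  unsigned nops = 0;

  /* Each 32-byte block is an LDP/STP of two q-registers:
     1 load + 1 store, i.e. nops += 2 as in the patch.  */
  while (size >= 32)
    {
      nops += 2;
      size -= 32;
    }

  /* The tail uses 16-byte LDR/STR accesses; the final access may
     overlap the previous copy instead of shrinking, but it still
     costs 1 load + 1 store.  */
  while (size > 0)
    {
      nops += 2;
      size = size > 16 ? size - 16 : 0;
    }

  return nops;
}

For cpy_127 this counts 3 * 2 + 2 + 2 = 10 ops and for cpy_128 it counts 4 * 2 = 8 ops, matching the listings above. Both exceed the patch's assumed worst-case libcall cost of 4 (3 instructions to prepare the arguments + 1 for the call), so at -Os both functions now fall back to memcpy.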
gcc/ChangeLog:

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_cpymem): Count number of
	emitted operations and adjust heuristic for code size.

gcc/testsuite/ChangeLog:

2021-09-29  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/aarch64/cpymem-size.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64.c                    | 36
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/cpymem-size.c  | 29
2 files changed, 54 insertions(+), 11 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ac17c1c..a9a1800 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 }
 
 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false.  */
+   we succeed, otherwise return false, indicating that a libcall to
+   memcpy should be emitted.  */
 
 bool
 aarch64_expand_cpymem (rtx *operands)
@@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands)
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Inline up to 256 bytes when optimizing for speed.  */
+  /* Try to inline up to 256 bytes.  */
   unsigned HOST_WIDE_INT max_copy_size = 256;
 
-  if (optimize_function_for_size_p (cfun))
-    max_copy_size = 128;
+  bool size_p = optimize_function_for_size_p (cfun);
+
+  if (size > max_copy_size)
+    return false;
 
   int copy_bits = 256;
 
@@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands)
       || !TARGET_SIMD
       || (aarch64_tune_params.extra_tuning_flags
	   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    {
-      copy_bits = 128;
-      max_copy_size = max_copy_size / 2;
-    }
+    copy_bits = 128;
 
-  if (size > max_copy_size)
-    return false;
+  /* Emit an inline load+store sequence and count the number of operations
+     involved.  We use a simple count of just the loads and stores emitted
+     rather than rtx_insn count as all the pointer adjustments and reg copying
+     in this function will get optimized away later in the pipeline.  */
+  start_sequence ();
+  unsigned nops = 0;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands)
	cur_mode = V4SImode;
 
       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-
+      /* A single block copy is 1 load + 1 store.  */
+      nops += 2;
       n -= mode_bits;
 
       /* Emit trailing copies using overlapping unaligned accesses - this is
@@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands)
	  n = n_bits;
	}
     }
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  */
+  unsigned libcall_cost = 4;
+  if (size_p && libcall_cost < nops)
+    return false;
 
+  emit_insn (seq);
   return true;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-size.c b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
new file mode 100644
index 0000000..4d488b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-size.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+#include <stdlib.h>
+
+/*
+** cpy_127:
+** mov	x2, 127
+** b	memcpy
+*/
+void
+cpy_127 (char *out, char *in)
+{
+  __builtin_memcpy (out, in, 127);
+}
+
+/*
+** cpy_128:
+** mov	x2, 128
+** b	memcpy
+*/
+void
+cpy_128 (char *out, char *in)
+{
+  __builtin_memcpy (out, in, 128);
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
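As a usage note, check-function-bodies compares each function's generated assembly against the ** regex lines in the test, so the test fails if either function is expanded inline instead of tail-calling memcpy. One way to run just this test from a GCC build tree (standard DejaGnu invocation; adjust paths and flags for your setup):

make check-gcc RUNTESTFLAGS="aarch64.exp=cpymem-size.c"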