author     Xi Ruoyao <xry111@xry111.site>  2023-04-12 11:45:48 +0000
committer  Xi Ruoyao <xry111@xry111.site>  2023-04-19 18:13:50 +0800
commit     6d7e0bcfa49e4ddc84dabe520bba8a023bc52692 (patch)
tree       434e067faa3ff644a6bf44fb9fbdd9f902fc5299 /gcc/config/loongarch/loongarch.h
parent     81c6501445fcddad653363f815cd04ca6fdb488e (diff)
LoongArch: Improve cpymemsi expansion [PR109465]
We'd been generating really bad block move sequences, which was
recently complained about by kernel developers who tried
__builtin_memcpy.  To improve it:

1. Take advantage of -mno-strict-align.  When it is set, set the mode
   size to UNITS_PER_WORD regardless of the alignment.
2. Halve the mode size when (block size) % (mode size) != 0, instead
   of falling back to ld.bu/st.b at once.
3. Limit the length of the block move sequence by the number of
   instructions, not the size of the block.  When -mstrict-align is
   set and the block is not aligned, the old size limit for the
   straight-line implementation (64 bytes) was definitely too large
   (we don't have 64 registers anyway).

Change since v1: add a comment about the calculation of num_reg.

gcc/ChangeLog:

	PR target/109465
	* config/loongarch/loongarch-protos.h
	(loongarch_expand_block_move): Add a parameter as alignment RTX.
	* config/loongarch/loongarch.h:
	(LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER): Remove.
	(LARCH_MAX_MOVE_BYTES_STRAIGHT): Remove.
	(LARCH_MAX_MOVE_OPS_PER_LOOP_ITER): Define.
	(LARCH_MAX_MOVE_OPS_STRAIGHT): Define.
	(MOVE_RATIO): Use LARCH_MAX_MOVE_OPS_PER_LOOP_ITER instead of
	LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER.
	* config/loongarch/loongarch.cc (loongarch_expand_block_move):
	Take the alignment from the parameter, but set it to
	UNITS_PER_WORD if !TARGET_STRICT_ALIGN.  Limit the length of the
	straight-line implementation with LARCH_MAX_MOVE_OPS_STRAIGHT
	instead of LARCH_MAX_MOVE_BYTES_STRAIGHT.
	(loongarch_block_move_straight): When there are left-over bytes,
	halve the mode size instead of falling back to byte mode at
	once.
	(loongarch_block_move_loop): Limit the length of the loop body
	with LARCH_MAX_MOVE_OPS_PER_LOOP_ITER instead of
	LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER.
	* config/loongarch/loongarch.md (cpymemsi): Pass the alignment
	to loongarch_expand_block_move.

gcc/testsuite/ChangeLog:

	PR target/109465
	* gcc.target/loongarch/pr109465-1.c: New test.
	* gcc.target/loongarch/pr109465-2.c: New test.
	* gcc.target/loongarch/pr109465-3.c: New test.
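To illustrate item 2 above, here is a minimal sketch in C of the
halving strategy.  It models the idea only: the names
block_move_straight_sketch and WORD_SIZE are hypothetical stand-ins,
not the actual loongarch_block_move_straight code, which emits RTL
load/store pairs rather than calling memcpy.

#include <stddef.h>
#include <string.h>

#define WORD_SIZE 8  /* stand-in for UNITS_PER_WORD on LA64 */

/* Copy LENGTH bytes with the widest chunks possible, halving the
   chunk size for the left-over tail instead of dropping straight to
   single-byte copies.  E.g. 15 bytes copy as 8 + 4 + 2 + 1 (4
   load/store pairs) rather than 8 + 7 * 1 (8 pairs).  */
static void
block_move_straight_sketch (char *dest, const char *src, size_t length)
{
  size_t mode_size = WORD_SIZE;

  while (length > 0)
    {
      /* Halve the access size until it fits the remaining bytes.  */
      while (mode_size > length)
        mode_size /= 2;

      memcpy (dest, src, mode_size);  /* models one load/store pair */
      dest += mode_size;
      src += mode_size;
      length -= mode_size;
    }
}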
Diffstat (limited to 'gcc/config/loongarch/loongarch.h')
-rw-r--r--  gcc/config/loongarch/loongarch.h | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 277facb..a9eff6a 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1062,13 +1062,13 @@ typedef struct {
 
 /* The maximum number of bytes that can be copied by one iteration of
    a cpymemsi loop; see loongarch_block_move_loop.  */
-#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
+#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4
 
 /* The maximum number of bytes that can be copied by a straight-line
    implementation of cpymemsi; see loongarch_block_move_straight.  We want
    to make sure that any loop-based implementation will iterate at
    least twice.  */
-#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2)
+#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
 
 /* The base cost of a memcpy call, for MOVE_RATIO and friends.  These
    values were determined experimentally by benchmarking with CSiBE.
@@ -1076,7 +1076,7 @@ typedef struct {
 #define LARCH_CALL_RATIO 8
 
 /* Any loop-based implementation of cpymemsi will have at least
-   LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory
+   LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory
    moves, so allow individual copies of fewer elements.
 
    When cpymemsi is not available, use a value approximating
@@ -1087,9 +1087,7 @@ typedef struct {
    value of LARCH_CALL_RATIO to take that into account.  */
 
 #define MOVE_RATIO(speed) \
-  (HAVE_cpymemsi \
-   ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \
-   : CLEAR_RATIO (speed) / 2)
+  (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2)
 
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
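A quick arithmetic check on the MOVE_RATIO hunk above (a sketch
assuming UNITS_PER_WORD is 8, as on LA64, though the identity holds
for any word size): the old byte-derived value and the new operation
count agree, so the patch changes how the limit is expressed, not its
value, and the straight-line cap still guarantees that any loop-based
copy iterates at least twice.

/* Standalone check; macro values taken from the hunks above,
   UNITS_PER_WORD pinned to the assumed LA64 value.  */
#define UNITS_PER_WORD 8
#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)  /* old: 32 bytes */
#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4                       /* new: 4 load/store pairs */
#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)

/* (UNITS_PER_WORD * 4) / UNITS_PER_WORD == 4 for any word size.  */
_Static_assert (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD
                == LARCH_MAX_MOVE_OPS_PER_LOOP_ITER,
                "MOVE_RATIO value is numerically unchanged");

/* A straight-line copy may use up to twice the per-iteration ops, so
   any block large enough to need a loop iterates at least twice.  */
_Static_assert (LARCH_MAX_MOVE_OPS_STRAIGHT
                == 2 * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER,
                "loop-based copies iterate at least twice");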