diff options
author | Stefan Schulze Frielinghaus <stefansf@linux.ibm.com> | 2023-05-16 08:34:28 +0200 |
---|---|---|
committer | Stefan Schulze Frielinghaus <stefansf@linux.ibm.com> | 2023-05-16 08:34:28 +0200 |
commit | c4dbea65b2777735551727d56f59a26f8ce2de0a (patch) | |
tree | 4787df305a714b97b1f6a8e6434e1a22801ad198 | |
parent | 5154171e1f3aa4aa120cee3ecf0347cd72a427fb (diff) | |
download | gcc-c4dbea65b2777735551727d56f59a26f8ce2de0a.zip gcc-c4dbea65b2777735551727d56f59a26f8ce2de0a.tar.gz gcc-c4dbea65b2777735551727d56f59a26f8ce2de0a.tar.bz2 |
s390: Refactor block operation setmem
Vectorize memset with a constant length of less than or equal to 64
bytes.
Do not emit a libc call to memset if the size is not a compile-time
constant but is bounded and the upper bound is less than or equal to
256 bytes.
gcc/ChangeLog:
* config/s390/s390-protos.h (s390_expand_setmem): Change
function signature.
* config/s390/s390.cc (s390_expand_setmem): For memsets of less
than or equal to 256 bytes do not perform a libc call.
* config/s390/s390.md: Change expander into a version which
takes 8 operands.
gcc/testsuite/ChangeLog:
* gcc.target/s390/memset-1.c: Test case memset1 now makes use of
vst.
-rw-r--r-- | gcc/config/s390/s390-protos.h | 2 | ||||
-rw-r--r-- | gcc/config/s390/s390.cc | 129 | ||||
-rw-r--r-- | gcc/config/s390/s390.md | 14 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/s390/memset-1.c | 7 |
4 files changed, 132 insertions, 20 deletions
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 65e4f97..4a5263f 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -109,7 +109,7 @@ extern void emit_symbolic_move (rtx *); extern void s390_load_address (rtx, rtx); extern bool s390_expand_cpymem (rtx, rtx, rtx, rtx, rtx); extern bool s390_expand_movmem (rtx, rtx, rtx, rtx, rtx); -extern void s390_expand_setmem (rtx, rtx, rtx); +extern void s390_expand_setmem (rtx, rtx, rtx, rtx, rtx); extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx); extern void s390_expand_vec_strlen (rtx, rtx, rtx); extern void s390_expand_vec_movstr (rtx, rtx, rtx); diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 553273f..b1cb546 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -5910,20 +5910,62 @@ s390_expand_movmem (rtx dst, rtx src, rtx len, rtx min_len_rtx, rtx max_len_rtx) Make use of clrmem if VAL is zero. */ void -s390_expand_setmem (rtx dst, rtx len, rtx val) +s390_expand_setmem (rtx dst, rtx len, rtx val, rtx min_len_rtx, rtx max_len_rtx) { - if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0) + /* Exit early in case nothing has to be done. */ + if (CONST_INT_P (len) && UINTVAL (len) == 0) return; gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode); + unsigned HOST_WIDE_INT min_len = UINTVAL (min_len_rtx); + unsigned HOST_WIDE_INT max_len + = max_len_rtx ? UINTVAL (max_len_rtx) : HOST_WIDE_INT_M1U; + + /* Vectorize memset with a constant length + - if 0 < LEN < 16, then emit a vstl based solution; + - if 16 <= LEN <= 64, then emit a vst based solution + where the last two vector stores may overlap in case LEN%16!=0. Paying + the price for an overlap is negligible compared to an extra GPR which is + required for vstl. 
*/ + if (CONST_INT_P (len) && UINTVAL (len) <= 64 && val != const0_rtx + && TARGET_VX) + { + rtx val_vec = gen_reg_rtx (V16QImode); + emit_move_insn (val_vec, gen_rtx_VEC_DUPLICATE (V16QImode, val)); + + if (UINTVAL (len) < 16) + { + rtx len_reg = gen_reg_rtx (SImode); + emit_move_insn (len_reg, GEN_INT (UINTVAL (len) - 1)); + emit_insn (gen_vstlv16qi (val_vec, len_reg, dst)); + } + else + { + unsigned HOST_WIDE_INT l = UINTVAL (len) / 16; + unsigned HOST_WIDE_INT r = UINTVAL (len) % 16; + unsigned HOST_WIDE_INT o = 0; + for (unsigned HOST_WIDE_INT i = 0; i < l; ++i) + { + rtx newdst = adjust_address (dst, V16QImode, o); + emit_move_insn (newdst, val_vec); + o += 16; + } + if (r != 0) + { + rtx newdst = adjust_address (dst, V16QImode, (o - 16) + r); + emit_move_insn (newdst, val_vec); + } + } + } + /* Expand setmem/clrmem for a constant length operand without a loop if it will be shorter that way. clrmem loop (with PFD) is 30 bytes -> 5 * xc clrmem loop (without PFD) is 24 bytes -> 4 * xc setmem loop (with PFD) is 38 bytes -> ~4 * (mvi/stc + mvc) setmem loop (without PFD) is 32 bytes -> ~4 * (mvi/stc + mvc) */ - if (GET_CODE (len) == CONST_INT + else if (GET_CODE (len) == CONST_INT && ((val == const0_rtx && (INTVAL (len) <= 256 * 4 || (INTVAL (len) <= 256 * 5 && TARGET_SETMEM_PFD(val,len)))) @@ -5968,6 +6010,70 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) val)); } + /* Non-constant length and no loop required. */ + else if (!CONST_INT_P (len) && max_len <= 256) + { + rtx_code_label *end_label; + + if (min_len == 0) + { + end_label = gen_label_rtx (); + emit_cmp_and_jump_insns (len, const0_rtx, EQ, NULL_RTX, + GET_MODE (len), 1, end_label, + profile_probability::very_unlikely ()); + } + + rtx lenm1 = expand_binop (GET_MODE (len), add_optab, len, constm1_rtx, + NULL_RTX, 1, OPTAB_DIRECT); + + /* Prefer a vectorized implementation over one which makes use of an + execute instruction since it is faster (although it increases register + pressure). 
*/ + if (max_len <= 16 && TARGET_VX) + { + rtx val_vec = gen_reg_rtx (V16QImode); + if (val == const0_rtx) + emit_move_insn (val_vec, CONST0_RTX (V16QImode)); + else + emit_move_insn (val_vec, gen_rtx_VEC_DUPLICATE (V16QImode, val)); + + lenm1 = convert_to_mode (SImode, lenm1, 1); + emit_insn (gen_vstlv16qi (val_vec, lenm1, dst)); + } + else + { + if (val == const0_rtx) + emit_insn ( + gen_clrmem_short (dst, convert_to_mode (Pmode, lenm1, 1))); + else + { + emit_move_insn (adjust_address (dst, QImode, 0), val); + + rtx_code_label *onebyte_end_label; + if (min_len <= 1) + { + onebyte_end_label = gen_label_rtx (); + emit_cmp_and_jump_insns ( + len, const1_rtx, EQ, NULL_RTX, GET_MODE (len), 1, + onebyte_end_label, profile_probability::very_unlikely ()); + } + + rtx dstp1 = adjust_address (dst, VOIDmode, 1); + rtx lenm2 + = expand_binop (GET_MODE (len), add_optab, len, GEN_INT (-2), + NULL_RTX, 1, OPTAB_DIRECT); + lenm2 = convert_to_mode (Pmode, lenm2, 1); + emit_insn (gen_cpymem_short (dstp1, dst, lenm2)); + + if (min_len <= 1) + emit_label (onebyte_end_label); + } + } + + if (min_len == 0) + emit_label (end_label); + } + else { rtx dst_addr, count, blocks, temp, dstp1 = NULL_RTX; @@ -5986,9 +6092,10 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) blocks = gen_reg_rtx (mode); convert_move (count, len, 1); - emit_cmp_and_jump_insns (count, const0_rtx, - EQ, NULL_RTX, mode, 1, zerobyte_end_label, - profile_probability::very_unlikely ()); + if (min_len == 0) + emit_cmp_and_jump_insns (count, const0_rtx, EQ, NULL_RTX, mode, 1, + zerobyte_end_label, + profile_probability::very_unlikely ()); /* We need to make a copy of the target address since memset is supposed to return it unmodified. We have to make it here @@ -6003,10 +6110,10 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) the mvc reading this value). 
*/ set_mem_size (dst, 1); dstp1 = adjust_address (dst, VOIDmode, 1); - emit_cmp_and_jump_insns (count, - const1_rtx, EQ, NULL_RTX, mode, 1, - onebyte_end_label, - profile_probability::very_unlikely ()); + if (min_len <= 1) + emit_cmp_and_jump_insns (count, const1_rtx, EQ, NULL_RTX, mode, 1, + onebyte_end_label, + profile_probability::very_unlikely ()); } /* There is one unconditional (mvi+mvc)/xc after the loop @@ -6029,7 +6136,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) emit_jump (loop_start_label); - if (val != const0_rtx) + if (val != const0_rtx && min_len <= 1) { /* The 1 byte != 0 special case. Not handled efficiently since we require two jumps for that. However, this diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index abe3bbc..9631b2a 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -3595,12 +3595,16 @@ ; (define_expand "setmem<mode>" - [(set (match_operand:BLK 0 "memory_operand" "") - (match_operand:QI 2 "general_operand" "")) - (use (match_operand:GPR 1 "general_operand" "")) - (match_operand 3 "" "")] + [(set (match_operand:BLK 0 "memory_operand" "") ; destination + (match_operand:QI 2 "general_operand" "")) ; value + (use (match_operand:GPR 1 "general_operand" "")) ; size + (match_operand 3 "") ; align + (match_operand 4 "") ; expected align + (match_operand 5 "") ; expected size + (match_operand 6 "") ; minimal size + (match_operand 7 "")] ; maximal size "" - "s390_expand_setmem (operands[0], operands[1], operands[2]); DONE;") + "s390_expand_setmem (operands[0], operands[1], operands[2], operands[6], operands[7]); DONE;") ; Clear a block that is up to 256 bytes in length. ; The block length is taken as (operands[1] % 256) + 1. 
diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c index 9463a77..5eb9611 100644 --- a/gcc/testsuite/gcc.target/s390/memset-1.c +++ b/gcc/testsuite/gcc.target/s390/memset-1.c @@ -11,7 +11,7 @@ void return __builtin_memset (s, c, 1); } -/* 1 stc 1 mvc */ +/* 3 vst */ void *memset1(void *s, int c) { @@ -170,8 +170,9 @@ void } /* { dg-final { scan-assembler-times "mvi\\s" 1 } } */ -/* { dg-final { scan-assembler-times "mvc\\s" 20 } } */ +/* { dg-final { scan-assembler-times "mvc\\s" 19 } } */ /* { dg-final { scan-assembler-times "xc\\s" 28 } } */ -/* { dg-final { scan-assembler-times "stc\\s" 22 } } */ +/* { dg-final { scan-assembler-times "stc\\s" 21 } } */ /* { dg-final { scan-assembler-times "stcy\\s" 0 } } */ /* { dg-final { scan-assembler-times "pfd\\s" 2 } } */ +/* { dg-final { scan-assembler-times "vst\\s" 3 } } */ |