aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Krebbel <krebbel@linux.vnet.ibm.com> 2017-01-05 09:59:32 +0000
committer Andreas Krebbel <krebbel@gcc.gnu.org> 2017-01-05 09:59:32 +0000
commit 587790e60d22605b9b3aa73e7313cc55a6417c30 (patch)
tree ffa2fd7e3106dd480f6f7c2a22f9439a2e9b2ab0
parent 6ff92497174e84271f71d1a1ede8a0ec513ee6a7 (diff)
downloadgcc-587790e60d22605b9b3aa73e7313cc55a6417c30.zip
gcc-587790e60d22605b9b3aa73e7313cc55a6417c30.tar.gz
gcc-587790e60d22605b9b3aa73e7313cc55a6417c30.tar.bz2
S/390: memset: Avoid overlapping MVC operands between iterations.
A memset with a value != 0 is currently implemented using the mvc instruction, propagating the first byte through 256-byte blocks. While for the first mvc the byte is written with a separate instruction, subsequent MVCs used the last byte of the previous 256-byte block. Starting with z13, this causes a major performance degradation. With this patch we always set the first byte with an mvi or stc in order to avoid the overlapping of the MVC operands between loop iterations. On older machines this basically makes no measurable difference, so the patch enables the new behavior for all machine levels in order to make sure that code built for older machine levels runs well when moved to a z13. Bootstrapped and regression tested on s390 and s390x using z900 and z13 as default -march level. No regressions. gcc/ChangeLog: 2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com> * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes between loop iterations. From-SVN: r244096
-rw-r--r-- gcc/ChangeLog 5
-rw-r--r-- gcc/config/s390/s390.c 95
2 files changed, 69 insertions, 31 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b9d6cb4..6c47cb8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
+
+ * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes
+ between loop iterations.
+
2017-01-05 Martin Liska <mliska@suse.cz>
PR sanitizer/78815
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 2082cb5..257bce7 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -5346,6 +5346,8 @@ s390_expand_movmem (rtx dst, rtx src, rtx len)
void
s390_expand_setmem (rtx dst, rtx len, rtx val)
{
+ const int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
+
if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0)
return;
@@ -5391,13 +5393,14 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
{
rtx dst_addr, count, blocks, temp, dstp1 = NULL_RTX;
rtx_code_label *loop_start_label = gen_label_rtx ();
- rtx_code_label *loop_end_label = gen_label_rtx ();
- rtx_code_label *end_label = gen_label_rtx ();
+ rtx_code_label *onebyte_end_label = gen_label_rtx ();
+ rtx_code_label *zerobyte_end_label = gen_label_rtx ();
+ rtx_code_label *restbyte_end_label = gen_label_rtx ();
machine_mode mode;
mode = GET_MODE (len);
if (mode == VOIDmode)
- mode = Pmode;
+ mode = Pmode;
dst_addr = gen_reg_rtx (Pmode);
count = gen_reg_rtx (mode);
@@ -5405,39 +5408,56 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
convert_move (count, len, 1);
emit_cmp_and_jump_insns (count, const0_rtx,
- EQ, NULL_RTX, mode, 1, end_label);
+ EQ, NULL_RTX, mode, 1, zerobyte_end_label,
+ very_unlikely);
+ /* We need to make a copy of the target address since memset is
+ supposed to return it unmodified. We have to make it here
+ already since the new reg is used at onebyte_end_label. */
emit_move_insn (dst_addr, force_operand (XEXP (dst, 0), NULL_RTX));
dst = change_address (dst, VOIDmode, dst_addr);
- if (val == const0_rtx)
- temp = expand_binop (mode, add_optab, count, constm1_rtx, count, 1,
- OPTAB_DIRECT);
- else
+ if (val != const0_rtx)
{
- dstp1 = adjust_address (dst, VOIDmode, 1);
+ /* When using the overlapping mvc the original target
+ address is only accessed as single byte entity (even by
+ the mvc reading this value). */
set_mem_size (dst, 1);
-
- /* Initialize memory by storing the first byte. */
- emit_move_insn (adjust_address (dst, QImode, 0), val);
-
- /* If count is 1 we are done. */
- emit_cmp_and_jump_insns (count, const1_rtx,
- EQ, NULL_RTX, mode, 1, end_label);
-
- temp = expand_binop (mode, add_optab, count, GEN_INT (-2), count, 1,
- OPTAB_DIRECT);
- }
+ dstp1 = adjust_address (dst, VOIDmode, 1);
+ emit_cmp_and_jump_insns (count,
+ const1_rtx, EQ, NULL_RTX, mode, 1,
+ onebyte_end_label, very_unlikely);
+ }
+
+ /* There is one unconditional (mvi+mvc)/xc after the loop
+ dealing with the rest of the bytes, subtracting two (mvi+mvc)
+ or one (xc) here leaves this number of bytes to be handled by
+ it. */
+ temp = expand_binop (mode, add_optab, count,
+ val == const0_rtx ? constm1_rtx : GEN_INT (-2),
+ count, 1, OPTAB_DIRECT);
if (temp != count)
- emit_move_insn (count, temp);
+ emit_move_insn (count, temp);
temp = expand_binop (mode, lshr_optab, count, GEN_INT (8), blocks, 1,
OPTAB_DIRECT);
if (temp != blocks)
- emit_move_insn (blocks, temp);
+ emit_move_insn (blocks, temp);
emit_cmp_and_jump_insns (blocks, const0_rtx,
- EQ, NULL_RTX, mode, 1, loop_end_label);
+ EQ, NULL_RTX, mode, 1, restbyte_end_label);
+
+ emit_jump (loop_start_label);
+
+ if (val != const0_rtx)
+ {
+ /* The 1 byte != 0 special case. Not handled efficiently
+ since we require two jumps for that. However, this
+ should be very rare. */
+ emit_label (onebyte_end_label);
+ emit_move_insn (adjust_address (dst, QImode, 0), val);
+ emit_jump (zerobyte_end_label);
+ }
emit_label (loop_start_label);
@@ -5455,26 +5475,39 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
if (val == const0_rtx)
emit_insn (gen_clrmem_short (dst, GEN_INT (255)));
else
- emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (255)));
+ {
+ /* Set the first byte in the block to the value and use an
+ overlapping mvc for the block. */
+ emit_move_insn (adjust_address (dst, QImode, 0), val);
+ emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (254)));
+ }
s390_load_address (dst_addr,
gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256)));
temp = expand_binop (mode, add_optab, blocks, constm1_rtx, blocks, 1,
OPTAB_DIRECT);
if (temp != blocks)
- emit_move_insn (blocks, temp);
+ emit_move_insn (blocks, temp);
emit_cmp_and_jump_insns (blocks, const0_rtx,
- EQ, NULL_RTX, mode, 1, loop_end_label);
+ NE, NULL_RTX, mode, 1, loop_start_label);
- emit_jump (loop_start_label);
- emit_label (loop_end_label);
+ emit_label (restbyte_end_label);
if (val == const0_rtx)
- emit_insn (gen_clrmem_short (dst, convert_to_mode (Pmode, count, 1)));
+ emit_insn (gen_clrmem_short (dst, convert_to_mode (Pmode, count, 1)));
else
- emit_insn (gen_movmem_short (dstp1, dst, convert_to_mode (Pmode, count, 1)));
- emit_label (end_label);
+ {
+ /* Set the first byte in the block to the value and use an
+ overlapping mvc for the block. */
+ emit_move_insn (adjust_address (dst, QImode, 0), val);
+ /* execute only uses the lowest 8 bits of count that's
+ exactly what we need here. */
+ emit_insn (gen_movmem_short (dstp1, dst,
+ convert_to_mode (Pmode, count, 1)));
+ }
+
+ emit_label (zerobyte_end_label);
}
}