aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jakub Jelinek <jakub@redhat.com> 2004-08-06 12:17:14 +0200
committer: Jakub Jelinek <jakub@gcc.gnu.org> 2004-08-06 12:17:14 +0200
commit: 6b32b6286bcc158ec954b458576d50bcd7ed5df3 (patch)
tree: 921579deec711673c464f2e0a1bfb3050a1f4db1
parent: 6797f908eec82d7ead4ca65eb970868c7489f244 (diff)
download: gcc-6b32b6286bcc158ec954b458576d50bcd7ed5df3.zip
gcc-6b32b6286bcc158ec954b458576d50bcd7ed5df3.tar.gz
gcc-6b32b6286bcc158ec954b458576d50bcd7ed5df3.tar.bz2
i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed.
* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed. Don't use repz; stosb for -Os with sufficiently small constant sizes. For sufficiently small repz; stos{l,q} repeat counts use a sequence of stos{l,q} instructions instead.

From-SVN: r85635
-rw-r--r-- gcc/ChangeLog 8
-rw-r--r-- gcc/config/i386/i386.c 66
2 files changed, 63 insertions, 11 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d83bdbe..eacfe7c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2004-08-06 Jakub Jelinek <jakub@redhat.com>
+
+ * config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to
+ the places where it is actually needed. Don't use repz; stosb
+ for -Os with sufficiently small constant sizes.
+ For sufficiently small repz; stos{l,q} repeat counts use a sequence
+ of stos{l,q} instructions instead.
+
2004-08-06 Zdenek Dvorak <rakdver@atrey.karlin.mff.cuni.cz>
PR tree-optimization/16807
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 22de6e3..9515734 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -11508,13 +11508,20 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
if (destreg != XEXP (dst, 0))
dst = replace_equiv_address_nv (dst, destreg);
- emit_insn (gen_cld ());
/* When optimizing for size emit simple rep ; movsb instruction for
- counts not divisible by 4. */
+ counts not divisible by 4. The movl $N, %ecx; rep; stosb
+ sequence is 7 bytes long, so if optimizing for size and count is
+ small enough that some stosl, stosw and stosb instructions without
+ rep are shorter, fall back into the next if. */
- if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
+ if ((!optimize || optimize_size)
+ && (count == 0
+ || ((count & 0x03)
+ && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
{
+ emit_insn (gen_cld ());
+
countreg = ix86_zero_extend_to_Pmode (count_exp);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
@@ -11528,17 +11535,54 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
int size = TARGET_64BIT && !optimize_size ? 8 : 4;
unsigned HOST_WIDE_INT offset = 0;
+ emit_insn (gen_cld ());
+
zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
if (count & ~(size - 1))
{
- countreg = copy_to_mode_reg (counter_mode,
- GEN_INT ((count >> (size == 4 ? 2 : 3))
- & (TARGET_64BIT ? -1 : 0x3fffffff)));
- countreg = ix86_zero_extend_to_Pmode (countreg);
- destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3));
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
- emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
- offset = count & ~(size - 1);
+ unsigned HOST_WIDE_INT repcount;
+ unsigned int max_nonrep;
+
+ repcount = count >> (size == 4 ? 2 : 3);
+ if (!TARGET_64BIT)
+ repcount &= 0x3fffffff;
+
+ /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
+ movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
+ bytes. In both cases the latter seems to be faster for small
+ values of N. */
+ max_nonrep = size == 4 ? 7 : 4;
+ if (!optimize_size)
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM4:
+ case PROCESSOR_NOCONA:
+ max_nonrep = 3;
+ break;
+ default:
+ break;
+ }
+
+ if (repcount <= max_nonrep)
+ while (repcount-- > 0)
+ {
+ rtx mem = adjust_automodify_address_nv (dst,
+ GET_MODE (zeroreg),
+ destreg, offset);
+ emit_insn (gen_strset (destreg, mem, zeroreg));
+ offset += size;
+ }
+ else
+ {
+ countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
+ countreg = ix86_zero_extend_to_Pmode (countreg);
+ destexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (size == 4 ? 2 : 3));
+ destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
+ emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
+ destexp));
+ offset = count & ~(size - 1);
+ }
}
if (size == 8 && (count & 0x04))
{