author    Jan Hubicka <jh@suse.cz>    2006-11-27 17:00:26 +0100
committer Jan Hubicka <hubicka@gcc.gnu.org>    2006-11-27 16:00:26 +0000
commit    8c996513856f2769aee1730cb211050fef055fb5 (patch)
tree      0fffda932600cccb90f66739f0c0ad7403474356 /gcc/config
parent    d3d3d8986db75c95a43be00918c9888ca7569d41 (diff)
expr.c (emit_block_move_via_libcall): Export.
* expr.c (emit_block_move_via_libcall): Export.
(clear_storage_via_libcall): Rename to ...
(set_storage_via_libcall): ... this one; handle arbitrary value to set.
(clear_storage): Update to set_storage_via_libcall.
* expr.h (emit_block_move_via_libcall): Declare.
* config/i386/i386.h (stringop_alg): New enum.
(MAX_STRINGOP_ALGS): New constant.
(stringop_algs): New struct.
(processor_costs): Add memcpy/memset descriptors.
(TARGET_REP_MOVL_OPTIMAL): Remove.
* config/i386/i386.md (movsi_insv_1_rex64): New pattern.
(strmovsi, strmovdi, strsetsi, strsetdi): Update; accept all operands for memset.
* config/i386/i386.opt (minline-stringops-dynamically, mstringop-strategy): New parameters.
* config/i386/i386-protos.h (ix86_expand_movmem): Update prototype.
(ix86_expand_clrmem): Rename to ...
(ix86_expand_setmem): ... this one; update prototype.
* config/i386/i386.c (DUMMY_STRINGOP_ALGS): New constant.
(size_cost, i386_cost, i486_cost, pentium_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost, pentium4_cost, nocona_cost, core2_cost, generic_cost, generic64_cost, generic32_cost): Add memcpy/memset descriptors.
(x86_rep_movl_optimal): Remove.
(stringop_alg): New static variable.
(ix86_expand_aligntest): Handle predictions.
(override_options): Add stringop_alg handling.
(predict_jump): New function.
(scale_counter): New function.
(expand_set_or_movmem_via_loop): New function.
(expand_movmem_via_rep_mov): New function.
(expand_setmem_via_rep_stos): New function.
(emit_strmov): New function.
(expand_movmem_epilogue): New function.
(expand_setmem_epilogue_via_loop): New function.
(expand_setmem_epilogue): New function.
(expand_movmem_prologue): New function.
(expand_setmem_prologue): New function.
(decide_alg): New function.
(decide_alignment): New function.
(ix86_expand_movmem): Rewrite.
(promote_duplicated_reg): New function.
(ix86_expand_clrmem): Rename to ...
(ix86_expand_setmem): ... this one. Rewrite.
* invoke.texi (minline-stringops-dynamically): New command line option.
(mstringop-strategy): Likewise.

From-SVN: r119252
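The i386.h hunk is counted in the diffstat below but its contents are not shown in this gcc/config view, so here is a reconstructed sketch of the declarations the ChangeLog names, inferred from how decide_alg and the cost tables below use them; the enum ordering and the inner struct tag stringop_strategy are assumptions, not verified against the committed header:

    enum stringop_alg
    {
      no_stringop,        /* sentinel: no algorithm forced */
      libcall,            /* call memcpy/memset in libc */
      rep_prefix_1_byte,  /* rep movsb / rep stosb */
      rep_prefix_4_byte,  /* rep movsl / rep stosl */
      rep_prefix_8_byte,  /* rep movsq / rep stosq, 64-bit only */
      loop_1_byte,        /* byte-at-a-time copy loop */
      loop,               /* word-sized copy loop */
      unrolled_loop       /* word-sized loop, unrolled 2x-4x */
    };

    #define MAX_STRINGOP_ALGS 4

    /* Read by decide_alg: UNKNOWN_SIZE applies when the block size is
       not a compile-time constant; otherwise the first SIZE entry with
       max >= size wins, max == -1 acting as a catch-all.  */
    struct stringop_algs
    {
      const enum stringop_alg unknown_size;
      const struct stringop_strategy
      {
        const int max;
        const enum stringop_alg alg;
      } size[MAX_STRINGOP_ALGS];
    };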
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/i386/i386-protos.h |    4
-rw-r--r--  gcc/config/i386/i386.c        | 1693
-rw-r--r--  gcc/config/i386/i386.h        |   36
-rw-r--r--  gcc/config/i386/i386.md       |   38
-rw-r--r--  gcc/config/i386/i386.opt      |    8
5 files changed, 1299 insertions, 480 deletions
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index b8d20a3..184b5b2 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,8 +50,8 @@ extern bool x86_extended_QIreg_mentioned_p (rtx);
extern bool x86_extended_reg_mentioned_p (rtx);
extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
-extern int ix86_expand_movmem (rtx, rtx, rtx, rtx);
-extern int ix86_expand_clrmem (rtx, rtx, rtx);
+extern int ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx);
+extern int ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx);
extern int ix86_expand_strlen (rtx, rtx, rtx, rtx);
extern bool legitimate_constant_p (rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9e11787..067c192 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -68,6 +68,8 @@ Boston, MA 02110-1301, USA. */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
#define COSTS_N_BYTES(N) ((N) * 2)
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
+
static const
struct processor_costs size_cost = { /* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of an add instruction */
@@ -119,6 +121,10 @@ struct processor_costs size_cost = { /* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
};
/* Processor costs (relative to an add) */
@@ -173,6 +179,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ DUMMY_STRINGOP_ALGS},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ DUMMY_STRINGOP_ALGS},
};
static const
@@ -226,6 +236,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
+ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+ DUMMY_STRINGOP_ALGS},
+ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+ DUMMY_STRINGOP_ALGS}
};
static const
@@ -279,6 +293,10 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
+ {{libcall, {{-1, rep_prefix_4_byte}}},
+ DUMMY_STRINGOP_ALGS}
};
static const
@@ -332,6 +350,17 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure
+ the alignment). For small blocks the inline loop is still a noticeable win; for bigger
+ blocks either rep movsl or rep movsb is the way to go. Rep movsb apparently has a
+ more expensive startup time in the CPU, but after 4K the difference is down in the noise.
+ */
+ {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
+ {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
+ DUMMY_STRINGOP_ALGS},
+ {{rep_prefix_4_byte, {{1024, unrolled_loop},
+ {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS}
};
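Reading one of these descriptors: each cost table now carries a {memcpy, memset} pair of stringop_algs, indexed by 32-bit/64-bit mode. An annotated copy of the PentiumPro memcpy entry above (the comments are editorial, not part of the patch):

    {rep_prefix_4_byte,            /* size unknown at compile time: rep movsl */
     {{128,  loop},                /* constant sizes <= 128: simple loop */
      {1024, unrolled_loop},       /* <= 1024: unrolled loop */
      {8192, rep_prefix_4_byte},   /* <= 8192: rep movsl */
      {-1,   rep_prefix_1_byte}}}  /* larger: rep movsb */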
static const
@@ -386,6 +415,10 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS}
};
static const
@@ -439,6 +472,10 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS}
};
static const
@@ -492,6 +529,13 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ /* For some reason, Athlon deals better with REP prefix (relative to loops)
+ compared to K8. Alignment becomes important after 8 bytes for memcpy and
+ 128 bytes for memset. */
+ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
+ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS}
};
static const
@@ -550,6 +594,14 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ /* K8 has optimized REP instruction for medium sized blocks, but for very small
+ blocks it is better to use loop. For large blocks, libcall can do
+ nontemporal accesses and beat inline considerably. */
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
};
static const
@@ -603,6 +655,10 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}}
};
static const
@@ -656,6 +712,12 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
+ {100000, unrolled_loop}, {-1, libcall}}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{24, loop}, {64, unrolled_loop},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}}
};
static const
@@ -708,6 +770,13 @@ struct processor_costs core2_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
+ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{32, loop}, {64, rep_prefix_4_byte},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {15, unrolled_loop},
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{24, loop}, {32, unrolled_loop},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}}
};
/* Generic64 should produce code tuned for Nocona and K8. */
@@ -768,6 +837,10 @@ struct processor_costs generic64_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ {DUMMY_STRINGOP_ALGS,
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {DUMMY_STRINGOP_ALGS,
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
};
/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
@@ -822,6 +895,10 @@ struct processor_costs generic32_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
+ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ DUMMY_STRINGOP_ALGS},
};
const struct processor_costs *ix86_cost = &pentium_cost;
@@ -923,7 +1000,6 @@ const int x86_sse_split_regs = m_ATHLON_K8;
const int x86_sse_typeless_stores = m_ATHLON_K8;
const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
const int x86_use_ffreep = m_ATHLON_K8;
-const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6_GEODE | m_CORE2;
const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
/* ??? Allowing interunit moves makes it all too easy for the compiler to put
@@ -948,6 +1024,8 @@ const int x86_xadd = ~m_386;
const int x86_bswap = ~m_386;
const int x86_pad_returns = m_ATHLON_K8 | m_CORE2 | m_GENERIC;
+static enum stringop_alg stringop_alg = no_stringop;
+
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
epilogue code. */
@@ -1212,7 +1290,6 @@ static void ix86_emit_restore_regs_using_mov (rtx, HOST_WIDE_INT, int);
static void ix86_output_function_epilogue (FILE *, HOST_WIDE_INT);
static HOST_WIDE_INT ix86_GOT_alias_set (void);
static void ix86_adjust_counter (rtx, HOST_WIDE_INT);
-static rtx ix86_expand_aligntest (rtx, int);
static void ix86_expand_strlensi_unroll_1 (rtx, rtx, rtx);
static int ix86_issue_rate (void);
static int ix86_adjust_cost (rtx, rtx, rtx, int);
@@ -1732,6 +1809,25 @@ override_options (void)
ix86_tune_string = "generic32";
}
}
+ if (ix86_stringop_string)
+ {
+ if (!strcmp (ix86_stringop_string, "rep_byte"))
+ stringop_alg = rep_prefix_1_byte;
+ else if (!strcmp (ix86_stringop_string, "libcall"))
+ stringop_alg = libcall;
+ else if (!strcmp (ix86_stringop_string, "rep_4byte"))
+ stringop_alg = rep_prefix_4_byte;
+ else if (!strcmp (ix86_stringop_string, "rep_8byte"))
+ stringop_alg = rep_prefix_8_byte;
+ else if (!strcmp (ix86_stringop_string, "byte_loop"))
+ stringop_alg = loop_1_byte;
+ else if (!strcmp (ix86_stringop_string, "loop"))
+ stringop_alg = loop;
+ else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
+ stringop_alg = unrolled_loop;
+ else
+ error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
+ }
if (!strcmp (ix86_tune_string, "x86-64"))
warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
"-mtune=generic instead as appropriate.");
@@ -12725,10 +12821,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
}
}
+/* Predict the just-emitted jump instruction to be taken with probability PROB. */
+static void
+predict_jump (int prob)
+{
+ rtx insn = get_last_insn ();
+ gcc_assert (GET_CODE (insn) == JUMP_INSN);
+ REG_NOTES (insn)
+ = gen_rtx_EXPR_LIST (REG_BR_PROB,
+ GEN_INT (prob),
+ REG_NOTES (insn));
+}
+
/* Helper function for the string operations below. Test VARIABLE whether
it is aligned to VALUE bytes. If true, jump to the label. */
static rtx
-ix86_expand_aligntest (rtx variable, int value)
+ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
rtx label = gen_label_rtx ();
rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
@@ -12738,6 +12846,10 @@ ix86_expand_aligntest (rtx variable, int value)
emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
1, label);
+ if (epilogue)
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
return label;
}
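The probability argument is in units of REG_BR_PROB_BASE (10000), so the two calls above weight the skip-ahead branch at 50% in epilogues, where the remainder's alignment is a coin flip, and at 90% in prologues, where the block is usually already aligned and the move is expected to be skipped. Purely illustrative:

    /* REG_BR_PROB_BASE is 10000; these mark the jump as taken
       5000/10000 and 9000/10000 of the time respectively.  */
    predict_jump (REG_BR_PROB_BASE * 50 / 100);   /* epilogue test */
    predict_jump (REG_BR_PROB_BASE * 90 / 100);   /* prologue test */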
@@ -12765,581 +12877,1236 @@ ix86_zero_extend_to_Pmode (rtx exp)
return r;
}
-/* Expand string move (memcpy) operation. Use i386 string operations when
- profitable. expand_clrmem contains similar code. */
-int
-ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp)
+/* Divide COUNTREG by SCALE. */
+static rtx
+scale_counter (rtx countreg, int scale)
{
- rtx srcreg, destreg, countreg, srcexp, destexp;
- enum machine_mode counter_mode;
- HOST_WIDE_INT align = 0;
- unsigned HOST_WIDE_INT count = 0;
+ rtx sc;
+ rtx piece_size_mask;
- if (GET_CODE (align_exp) == CONST_INT)
- align = INTVAL (align_exp);
+ if (scale == 1)
+ return countreg;
+ if (GET_CODE (countreg) == CONST_INT)
+ return GEN_INT (INTVAL (countreg) / scale);
+ gcc_assert (REG_P (countreg));
- /* Can't use any of this if the user has appropriated esi or edi. */
- if (global_regs[4] || global_regs[5])
- return 0;
+ piece_size_mask = GEN_INT (scale - 1);
+ sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
+ GEN_INT (exact_log2 (scale)),
+ NULL, 1, OPTAB_DIRECT);
+ return sc;
+}
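A few hypothetical calls showing what scale_counter yields; the register path relies on SCALE being a power of two (hence exact_log2):

    scale_counter (GEN_INT (37), 4);  /* constant: returns GEN_INT (9) */
    scale_counter (countreg, 1);      /* scale 1: returns countreg untouched */
    scale_counter (countreg, 8);      /* register: emits a logical shift right
                                         by 3 and returns the new pseudo */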
- /* This simple hack avoids all inlining code and simplifies code below. */
- if (!TARGET_ALIGN_STRINGOPS)
- align = 64;
+/* When SRCPTR is non-NULL, output a simple loop to move memory from
+ SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
+ overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
+ equivalent loop to set memory to VALUE (assumed to be in MODE).
- if (GET_CODE (count_exp) == CONST_INT)
+ The size is rounded down to a whole number of chunks moved at once.
+ SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
+
+
+static void
+expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx value,
+ rtx count, enum machine_mode mode, int unroll,
+ int expected_size)
+{
+ rtx out_label, top_label, iter, tmp;
+ enum machine_mode iter_mode;
+ rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
+ rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
+ rtx size;
+ rtx x_addr;
+ rtx y_addr;
+ int i;
+
+ iter_mode = GET_MODE (count);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ out_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
+ NULL, 1, OPTAB_DIRECT);
+ /* Those two should combine. */
+ if (piece_size == const1_rtx)
{
- count = INTVAL (count_exp);
- if (!TARGET_INLINE_ALL_STRINGOPS && count > 64)
- return 0;
+ emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+ true, out_label);
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
}
+ emit_move_insn (iter, const0_rtx);
- /* Figure out proper mode for counter. For 32bits it is always SImode,
- for 64bits use SImode when possible, otherwise DImode.
- Set count to number of bytes copied when known at compile time. */
- if (!TARGET_64BIT
- || GET_MODE (count_exp) == SImode
- || x86_64_zext_immediate_operand (count_exp, VOIDmode))
- counter_mode = SImode;
+ emit_label (top_label);
+
+ tmp = convert_modes (Pmode, iter_mode, iter, true);
+ x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
+ destmem = change_address (destmem, mode, x_addr);
+
+ if (srcmem)
+ {
+ y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
+ srcmem = change_address (srcmem, mode, y_addr);
+
+ /* When unrolling for chips that reorder memory reads and writes,
+ we can save registers by using a single temporary.
+ Also using 4 temporaries is overkill in 32bit mode. */
+ if (!TARGET_64BIT && 0)
+ {
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ {
+ destmem =
+ adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+ srcmem =
+ adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, srcmem);
+ }
+ }
+ else
+ {
+ rtx tmpreg[4];
+ gcc_assert (unroll <= 4);
+ for (i = 0; i < unroll; i++)
+ {
+ tmpreg[i] = gen_reg_rtx (mode);
+ if (i)
+ {
+ srcmem =
+ adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmpreg[i], srcmem);
+ }
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ {
+ destmem =
+ adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmpreg[i]);
+ }
+ }
+ }
else
- counter_mode = DImode;
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ destmem =
+ adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+ emit_move_insn (destmem, value);
+ }
- gcc_assert (counter_mode == SImode || counter_mode == DImode);
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
- destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
- if (destreg != XEXP (dst, 0))
- dst = replace_equiv_address_nv (dst, destreg);
- srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
- if (srcreg != XEXP (src, 0))
- src = replace_equiv_address_nv (src, srcreg);
-
- /* When optimizing for size emit simple rep ; movsb instruction for
- counts not divisible by 4, except when (movsl;)*(movsw;)?(movsb;)?
- sequence is shorter than mov{b,l} $count, %{ecx,cl}; rep; movsb.
- Sice of (movsl;)*(movsw;)?(movsb;)? sequence is
- count / 4 + (count & 3), the other sequence is either 4 or 7 bytes,
- but we don't know whether upper 24 (resp. 56) bits of %ecx will be
- known to be zero or not. The rep; movsb sequence causes higher
- register pressure though, so take that into account. */
-
- if ((!optimize || optimize_size)
- && (count == 0
- || ((count & 0x03)
- && (!optimize_size
- || count > 5 * 4
- || (count & 3) + count / 4 > 6))))
+ emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ true, top_label);
+ if (expected_size != -1)
{
- emit_insn (gen_cld ());
- countreg = ix86_zero_extend_to_Pmode (count_exp);
- destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
- srcexp = gen_rtx_PLUS (Pmode, srcreg, countreg);
- emit_insn (gen_rep_mov (destreg, dst, srcreg, src, countreg,
- destexp, srcexp));
+ expected_size /= GET_MODE_SIZE (mode) * unroll;
+ if (expected_size == 0)
+ predict_jump (0);
+ else if (expected_size > REG_BR_PROB_BASE)
+ predict_jump (REG_BR_PROB_BASE - 1);
+ else
+ predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
+ }
+ else
+ predict_jump (REG_BR_PROB_BASE * 80 / 100);
+ iter = ix86_zero_extend_to_Pmode (iter);
+ tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != destptr)
+ emit_move_insn (destptr, tmp);
+ if (srcptr)
+ {
+ tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != srcptr)
+ emit_move_insn (srcptr, tmp);
+ }
+ emit_label (out_label);
+}
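In C terms, the RTL emitted for mode == SImode with unroll == 2 corresponds roughly to this sketch; note the body runs before the bottom-of-loop test, the zero-size guard being emitted only in the piece_size == 1 case:

    char *d = dest, *s = src;
    size_t size = count & ~(size_t) 7;   /* whole 8-byte chunks */
    size_t iter = 0;
    do
      {
        *(int *) (d + iter) = *(int *) (s + iter);
        *(int *) (d + iter + 4) = *(int *) (s + iter + 4);
        iter += 8;
      }
    while (iter < size);                 /* branch weighted via predict_jump */
    d += iter;                           /* pointers advanced for the epilogue */
    s += iter;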
+
+/* Output "rep; mov" instruction.
+ Arguments have same meaning as for previous function */
+static void
+expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr,
+ rtx count,
+ enum machine_mode mode)
+{
+ rtx destexp;
+ rtx srcexp;
+ rtx countreg;
+
+ /* If the size is known, it is shorter to use rep movs. */
+ if (mode == QImode && GET_CODE (count) == CONST_INT
+ && !(INTVAL (count) & 3))
+ mode = SImode;
+
+ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
+ destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
+ if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
+ srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
+ countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
+ if (mode != QImode)
+ {
+ destexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+ destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
+ srcexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+ srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
+ }
+ else
+ {
+ destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+ srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
}
+ emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
+ destexp, srcexp));
+}
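For mode == SImode this is the classic rep-move idiom; a sketch of the resulting code, assuming the usual operand assignment of the strmov patterns (count in %ecx, pointers in %esi/%edi):

    /* shrl $2, %ecx        scale_counter: byte count -> dword count
       rep movsl            copy %ecx dwords from (%esi) to (%edi)
       The destexp/srcexp PLUS rtxes describe the final pointer values so
       the rep_mov pattern can expose them to later RTL passes.  */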
- /* For constant aligned (or small unaligned) copies use rep movsl
- followed by code copying the rest. For PentiumPro ensure 8 byte
- alignment to allow rep movsl acceleration. */
+/* Output "rep; stos" instruction.
+ Arguments have same meaning as for previous function */
+static void
+expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
+ rtx count,
+ enum machine_mode mode)
+{
+ rtx destexp;
+ rtx countreg;
- else if (count != 0
- && (align >= 8
- || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
- || optimize_size || count < (unsigned int) 64))
+ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
+ destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
+ value = force_reg (mode, gen_lowpart (mode, value));
+ countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
+ if (mode != QImode)
{
- unsigned HOST_WIDE_INT offset = 0;
- int size = TARGET_64BIT && !optimize_size ? 8 : 4;
- rtx srcmem, dstmem;
+ destexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+ destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
+ }
+ else
+ destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+ emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
+}
- emit_insn (gen_cld ());
- if (count & ~(size - 1))
+static void
+emit_strmov (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
+{
+ rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
+ rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+}
+
+/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
+static void
+expand_movmem_epilogue (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx count, int max_size)
+{
+ rtx src, dest;
+ if (GET_CODE (count) == CONST_INT)
+ {
+ HOST_WIDE_INT countval = INTVAL (count);
+ int offset = 0;
+
+ if ((countval & 0x16) && max_size > 16)
{
- if ((TARGET_SINGLE_STRINGOP || optimize_size) && count < 5 * 4)
+ if (TARGET_64BIT)
{
- enum machine_mode movs_mode = size == 4 ? SImode : DImode;
-
- while (offset < (count & ~(size - 1)))
- {
- srcmem = adjust_automodify_address_nv (src, movs_mode,
- srcreg, offset);
- dstmem = adjust_automodify_address_nv (dst, movs_mode,
- destreg, offset);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- offset += size;
- }
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
}
else
+ gcc_unreachable ();
+ offset += 16;
+ }
+ if ((countval & 0x08) && max_size > 8)
+ {
+ if (TARGET_64BIT)
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
+ else
{
- countreg = GEN_INT ((count >> (size == 4 ? 2 : 3))
- & (TARGET_64BIT ? -1 : 0x3fffffff));
- countreg = copy_to_mode_reg (counter_mode, countreg);
- countreg = ix86_zero_extend_to_Pmode (countreg);
-
- destexp = gen_rtx_ASHIFT (Pmode, countreg,
- GEN_INT (size == 4 ? 2 : 3));
- srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg);
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
-
- emit_insn (gen_rep_mov (destreg, dst, srcreg, src,
- countreg, destexp, srcexp));
- offset = count & ~(size - 1);
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
}
+ offset += 8;
}
- if (size == 8 && (count & 0x04))
+ if ((countval & 0x04) && max_size > 4)
{
- srcmem = adjust_automodify_address_nv (src, SImode, srcreg,
- offset);
- dstmem = adjust_automodify_address_nv (dst, SImode, destreg,
- offset);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
offset += 4;
}
- if (count & 0x02)
+ if ((countval & 0x02) && max_size > 2)
{
- srcmem = adjust_automodify_address_nv (src, HImode, srcreg,
- offset);
- dstmem = adjust_automodify_address_nv (dst, HImode, destreg,
- offset);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
offset += 2;
}
- if (count & 0x01)
+ if ((countval & 0x01) && max_size > 1)
{
- srcmem = adjust_automodify_address_nv (src, QImode, srcreg,
- offset);
- dstmem = adjust_automodify_address_nv (dst, QImode, destreg,
- offset);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
+ offset += 1;
}
+ return;
}
- /* The generic code based on the glibc implementation:
- - align destination to 4 bytes (8 byte alignment is used for PentiumPro
- allowing accelerated copying there)
- - copy the data using rep movsl
- - copy the rest. */
- else
+ if (max_size > 8)
{
- rtx countreg2;
- rtx label = NULL;
- rtx srcmem, dstmem;
- int desired_alignment = (TARGET_PENTIUMPRO
- && (count == 0 || count >= (unsigned int) 260)
- ? 8 : UNITS_PER_WORD);
- /* Get rid of MEM_OFFSETs, they won't be accurate. */
- dst = change_address (dst, BLKmode, destreg);
- src = change_address (src, BLKmode, srcreg);
-
- /* In case we don't know anything about the alignment, default to
- library version, since it is usually equally fast and result in
- shorter code.
-
- Also emit call when we know that the count is large and call overhead
- will not be important. */
- if (!TARGET_INLINE_ALL_STRINGOPS
- && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL))
- return 0;
-
- if (TARGET_SINGLE_STRINGOP)
- emit_insn (gen_cld ());
-
- countreg2 = gen_reg_rtx (Pmode);
- countreg = copy_to_mode_reg (counter_mode, count_exp);
-
- /* We don't use loops to align destination and to copy parts smaller
- than 4 bytes, because gcc is able to optimize such code better (in
- the case the destination or the count really is aligned, gcc is often
- able to predict the branches) and also it is friendlier to the
- hardware branch prediction.
-
- Using loops is beneficial for generic case, because we can
- handle small counts using the loops. Many CPUs (such as Athlon)
- have large REP prefix setup costs.
-
- This is quite costly. Maybe we can revisit this decision later or
- add some customizability to this code. */
+ count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
+ count, 1, OPTAB_DIRECT);
+ expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
+ count, QImode, 1, 4);
+ return;
+ }
- if (count == 0 && align < desired_alignment)
+ /* When there are stringops, we can cheaply increase dest and src pointers.
+ Otherwise we save code size by maintaining offset (zero is readily
+ available from preceding rep operation) and using x86 addressing modes.
+ */
+ if (TARGET_SINGLE_STRINGOP)
+ {
+ if (max_size > 4)
{
- label = gen_label_rtx ();
- emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1),
- LEU, 0, counter_mode, 1, label);
+ rtx label = ix86_expand_aligntest (count, 4, true);
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
}
- if (align <= 1)
+ if (max_size > 2)
{
- rtx label = ix86_expand_aligntest (destreg, 1);
- srcmem = change_address (src, QImode, srcreg);
- dstmem = change_address (dst, QImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- ix86_adjust_counter (countreg, 1);
+ rtx label = ix86_expand_aligntest (count, 2, true);
+ src = change_address (srcmem, HImode, srcptr);
+ dest = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
emit_label (label);
LABEL_NUSES (label) = 1;
}
- if (align <= 2)
+ if (max_size > 1)
{
- rtx label = ix86_expand_aligntest (destreg, 2);
- srcmem = change_address (src, HImode, srcreg);
- dstmem = change_address (dst, HImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- ix86_adjust_counter (countreg, 2);
+ rtx label = ix86_expand_aligntest (count, 1, true);
+ src = change_address (srcmem, QImode, srcptr);
+ dest = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
emit_label (label);
LABEL_NUSES (label) = 1;
}
- if (align <= 4 && desired_alignment > 4)
+ }
+ else
+ {
+ rtx offset = force_reg (Pmode, const0_rtx);
+ rtx tmp;
+
+ if (max_size > 4)
{
- rtx label = ix86_expand_aligntest (destreg, 4);
- srcmem = change_address (src, SImode, srcreg);
- dstmem = change_address (dst, SImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- ix86_adjust_counter (countreg, 4);
+ rtx label = ix86_expand_aligntest (count, 4, true);
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
emit_label (label);
LABEL_NUSES (label) = 1;
}
-
- if (label && desired_alignment > 4 && !TARGET_64BIT)
+ if (max_size > 2)
+ {
+ rtx label = ix86_expand_aligntest (count, 2, true);
+ tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+ src = change_address (srcmem, HImode, tmp);
+ tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+ dest = change_address (destmem, HImode, tmp);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 1)
{
+ rtx label = ix86_expand_aligntest (count, 1, true);
+ tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+ src = change_address (srcmem, QImode, tmp);
+ tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+ dest = change_address (destmem, QImode, tmp);
+ emit_move_insn (dest, src);
emit_label (label);
LABEL_NUSES (label) = 1;
- label = NULL_RTX;
}
- if (!TARGET_SINGLE_STRINGOP)
- emit_insn (gen_cld ());
- if (TARGET_64BIT)
+ }
+}
+
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
+static void
+expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
+ rtx count, int max_size)
+{
+ count =
+ expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
+ count, 1, OPTAB_DIRECT);
+ expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
+ gen_lowpart (QImode, value), count, QImode,
+ 1, max_size / 2);
+}
+
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
+static void
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
+{
+ rtx dest;
+ if (GET_CODE (count) == CONST_INT)
+ {
+ HOST_WIDE_INT countval = INTVAL (count);
+ int offset = 0;
+
+ if ((countval & 0x16) && max_size > 16)
{
- emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
- GEN_INT (3)));
- destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3));
+ if (TARGET_64BIT)
+ {
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ else
+ gcc_unreachable ();
+ offset += 16;
}
- else
+ if ((countval & 0x08) && max_size > 8)
{
- emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx));
- destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx);
+ if (TARGET_64BIT)
+ {
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ else
+ {
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ offset += 8;
}
- srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg);
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
- emit_insn (gen_rep_mov (destreg, dst, srcreg, src,
- countreg2, destexp, srcexp));
-
- if (label)
+ if ((countval & 0x04) && max_size > 4)
{
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+ offset += 4;
}
- if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
+ if ((countval & 0x02) && max_size > 2)
{
- srcmem = change_address (src, SImode, srcreg);
- dstmem = change_address (dst, SImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+ offset += 2;
}
- if ((align <= 4 || count == 0) && TARGET_64BIT)
+ if ((countval & 0x01) && max_size > 1)
{
- rtx label = ix86_expand_aligntest (countreg, 4);
- srcmem = change_address (src, SImode, srcreg);
- dstmem = change_address (dst, SImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+ offset += 1;
}
- if (align > 2 && count != 0 && (count & 2))
+ return;
+ }
+ if (max_size > 32)
+ {
+ expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
+ return;
+ }
+ if (max_size > 16)
+ {
+ rtx label = ix86_expand_aligntest (count, 16, true);
+ if (TARGET_64BIT)
{
- srcmem = change_address (src, HImode, srcreg);
- dstmem = change_address (dst, HImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
- if (align <= 2 || count == 0)
+ else
{
- rtx label = ix86_expand_aligntest (countreg, 2);
- srcmem = change_address (src, HImode, srcreg);
- dstmem = change_address (dst, HImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
- if (align > 1 && count != 0 && (count & 1))
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 8)
+ {
+ rtx label = ix86_expand_aligntest (count, 8, true);
+ if (TARGET_64BIT)
{
- srcmem = change_address (src, QImode, srcreg);
- dstmem = change_address (dst, QImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
}
- if (align <= 1 || count == 0)
+ else
{
- rtx label = ix86_expand_aligntest (countreg, 1);
- srcmem = change_address (src, QImode, srcreg);
- dstmem = change_address (dst, QImode, destreg);
- emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 4)
+ {
+ rtx label = ix86_expand_aligntest (count, 4, true);
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
}
+ if (max_size > 2)
+ {
+ rtx label = ix86_expand_aligntest (count, 2, true);
+ dest = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 1)
+ {
+ rtx label = ix86_expand_aligntest (count, 1, true);
+ dest = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+}
- return 1;
+/* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
+ to DESIRED_ALIGNMENT. */
+static void
+expand_movmem_prologue (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx count,
+ int align, int desired_alignment)
+{
+ if (align <= 1 && desired_alignment > 1)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 1, false);
+ srcmem = change_address (srcmem, QImode, srcptr);
+ destmem = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+ ix86_adjust_counter (count, 1);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2 && desired_alignment > 2)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 2, false);
+ srcmem = change_address (srcmem, HImode, srcptr);
+ destmem = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+ ix86_adjust_counter (count, 2);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && desired_alignment > 4)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 4, false);
+ srcmem = change_address (srcmem, SImode, srcptr);
+ destmem = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+ ix86_adjust_counter (count, 4);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ gcc_assert (desired_alignment <= 8);
}
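For align == 1 and desired_alignment == 8 the emitted prologue behaves like the C sketch below; each test is a real branch that ix86_expand_aligntest predicts will skip the move 90% of the time:

    if ((uintptr_t) dest & 1)                              /* QImode move */
      { *dest++ = *src++; count -= 1; }
    if ((uintptr_t) dest & 2)                              /* HImode move */
      { memcpy (dest, src, 2); dest += 2; src += 2; count -= 2; }
    if ((uintptr_t) dest & 4)                              /* SImode move */
      { memcpy (dest, src, 4); dest += 4; src += 4; count -= 4; }
    /* dest is now 8-byte aligned and count was adjusted to match.  */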
-/* Expand string clear operation (bzero). Use i386 string operations when
- profitable. expand_movmem contains similar code. */
+/* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN,
+ to DESIRED_ALIGNMENT. */
+static void
+expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
+ int align, int desired_alignment)
+{
+ if (align <= 1 && desired_alignment > 1)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 1, false);
+ destmem = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
+ ix86_adjust_counter (count, 1);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2 && desired_alignment > 2)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 2, false);
+ destmem = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
+ ix86_adjust_counter (count, 2);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && desired_alignment > 4)
+ {
+ rtx label = ix86_expand_aligntest (destptr, 4, false);
+ destmem = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
+ ix86_adjust_counter (count, 4);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ gcc_assert (desired_alignment <= 8);
+}
+
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
+ int *dynamic_check)
+{
+ const struct stringop_algs * algs;
+
+ *dynamic_check = -1;
+ if (memset)
+ algs = &ix86_cost->memset[TARGET_64BIT != 0];
+ else
+ algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+ if (stringop_alg != no_stringop)
+ return stringop_alg;
+ /* rep; movq or rep; movl is the smallest variant. */
+ else if (optimize_size)
+ {
+ if (!count || (count & 3))
+ return rep_prefix_1_byte;
+ else
+ return rep_prefix_4_byte;
+ }
+ /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
+ */
+ else if (expected_size != -1 && expected_size < 4)
+ return loop_1_byte;
+ else if (expected_size != -1)
+ {
+ unsigned int i;
+ enum stringop_alg alg = libcall;
+ for (i = 0; i < NAX_STRINGOP_ALGS; i++)
+ {
+ gcc_assert (algs->size[i].max);
+ if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
+ {
+ if (algs->size[i].alg != libcall)
+ alg = algs->size[i].alg;
+ /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
+ last non-libcall inline algorithm. */
+ if (TARGET_INLINE_ALL_STRINGOPS)
+ {
+ gcc_assert (alg != libcall);
+ return alg;
+ }
+ else
+ return algs->size[i].alg;
+ }
+ }
+ gcc_unreachable ();
+ }
+ /* When asked to inline the call anyway, try to pick a meaningful choice.
+ We look for the maximal size of block that is faster to copy by hand and
+ take blocks of at most that size, guessing that the average size will
+ be roughly half of the block.
+
+ If this turns out to be bad, we might simply specify the preferred
+ choice in ix86_costs. */
+ if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+ && algs->unknown_size == libcall)
+ {
+ int max = -1;
+ enum stringop_alg alg;
+ int i;
+
+ for (i = 0; i < NAX_STRINGOP_ALGS; i++)
+ if (algs->size[i].alg != libcall && algs->size[i].alg)
+ max = algs->size[i].max;
+ if (max == -1)
+ max = 4096;
+ alg = decide_alg (count, max / 2, memset, dynamic_check);
+ gcc_assert (*dynamic_check == -1);
+ gcc_assert (alg != libcall);
+ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+ *dynamic_check = max;
+ return alg;
+ }
+ return algs->unknown_size;
+}
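A worked walk of the table, using the 32-bit K8 memcpy descriptor above, {libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}; the results are read off the code, not measured:

    /* expected_size == 3    -> loop_1_byte   (tiny blocks bypass the table)
       expected_size == 5    -> loop          (first entry with max >= 5)
       expected_size == 10   -> unrolled_loop (6 < 10 <= 14)
       expected_size == 10^6 -> rep_prefix_4_byte (the {-1, ...} catch-all)
       size unknown          -> libcall       (the unknown_size member),
       unless -minline-stringops-dynamically is given, in which case
       decide_alg recurses with expected_size = max / 2 and sets
       *dynamic_check so a runtime size test is emitted.  */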
+
+/* Decide on alignment. We know that the operand is already aligned to ALIGN
+ (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
+static int
+decide_alignment (int align,
+ enum stringop_alg alg,
+ int expected_size)
+{
+ int desired_align = 0;
+ switch (alg)
+ {
+ case no_stringop:
+ gcc_unreachable ();
+ case loop:
+ case unrolled_loop:
+ desired_align = GET_MODE_SIZE (Pmode);
+ break;
+ case rep_prefix_8_byte:
+ desired_align = 8;
+ break;
+ case rep_prefix_4_byte:
+ /* PentiumPro has special logic triggering for 8 byte aligned blocks,
+ copying a whole cache line at once. */
+ if (TARGET_PENTIUMPRO)
+ desired_align = 8;
+ else
+ desired_align = 4;
+ break;
+ case rep_prefix_1_byte:
+ /* PentiumPro has special logic triggering for 8 byte aligned blocks,
+ copying a whole cache line at once. */
+ if (TARGET_PENTIUMPRO)
+ desired_align = 8;
+ else
+ desired_align = 1;
+ break;
+ case loop_1_byte:
+ desired_align = 1;
+ break;
+ case libcall:
+ return 0;
+ }
+
+ if (optimize_size)
+ desired_align = 1;
+ if (desired_align < align)
+ desired_align = align;
+ if (expected_size != -1 && expected_size < 4)
+ desired_align = align;
+ return desired_align;
+}
+
+/* Expand string move (memcpy) operation. Use i386 string operations when
+ profitable. ix86_expand_setmem contains similar code. */
int
-ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
+ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
+ rtx expected_align_exp, rtx expected_size_exp)
{
- rtx destreg, zeroreg, countreg, destexp;
- enum machine_mode counter_mode;
- HOST_WIDE_INT align = 0;
+ rtx destreg;
+ rtx srcreg;
+ rtx label = NULL;
+ rtx tmp;
+ rtx jump_around_label = NULL;
+ HOST_WIDE_INT align = 1;
unsigned HOST_WIDE_INT count = 0;
+ HOST_WIDE_INT expected_size = -1;
+ int size_needed = 0;
+ int desired_align = 0;
+ enum stringop_alg alg;
+ int dynamic_check;
+ /* Precise placement of cld depends on whether stringops will be emitted in
+ the prologue, main copying body or epilogue. This variable keeps track of
+ whether cld has already been emitted. */
+ bool cld_done = false;
if (GET_CODE (align_exp) == CONST_INT)
align = INTVAL (align_exp);
+ /* i386 can do misaligned access at reasonably increased cost. */
+ if (GET_CODE (expected_align_exp) == CONST_INT
+ && INTVAL (expected_align_exp) > align)
+ align = INTVAL (expected_align_exp);
+ if (GET_CODE (count_exp) == CONST_INT)
+ count = expected_size = INTVAL (count_exp);
+ if (GET_CODE (expected_size_exp) == CONST_INT && count == 0)
+ {
+ expected_size = INTVAL (expected_size_exp);
+ }
- /* Can't use any of this if the user has appropriated esi. */
- if (global_regs[4])
- return 0;
+ alg = decide_alg (count, expected_size, false, &dynamic_check);
+ desired_align = decide_alignment (align, alg, expected_size);
- /* This simple hack avoids all inlining code and simplifies code below. */
if (!TARGET_ALIGN_STRINGOPS)
- align = 32;
+ align = desired_align;
- if (GET_CODE (count_exp) == CONST_INT)
+ if (alg == libcall)
+ return 0;
+ gcc_assert (alg != no_stringop);
+ if (!count)
+ count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+ destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+ srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
+ switch (alg)
{
- count = INTVAL (count_exp);
- if (!TARGET_INLINE_ALL_STRINGOPS && count > 64)
- return 0;
+ case libcall:
+ case no_stringop:
+ gcc_unreachable ();
+ case loop:
+ size_needed = GET_MODE_SIZE (Pmode);
+ break;
+ case unrolled_loop:
+ size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
+ break;
+ case rep_prefix_8_byte:
+ size_needed = 8;
+ break;
+ case rep_prefix_4_byte:
+ size_needed = 4;
+ break;
+ case rep_prefix_1_byte:
+ case loop_1_byte:
+ size_needed = 1;
+ break;
}
- /* Figure out proper mode for counter. For 32bits it is always SImode,
- for 64bits use SImode when possible, otherwise DImode.
- Set count to number of bytes copied when known at compile time. */
- if (!TARGET_64BIT
- || GET_MODE (count_exp) == SImode
- || x86_64_zext_immediate_operand (count_exp, VOIDmode))
- counter_mode = SImode;
- else
- counter_mode = DImode;
- destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
- if (destreg != XEXP (dst, 0))
- dst = replace_equiv_address_nv (dst, destreg);
+ /* Alignment code needs count to be in a register. */
+ if (GET_CODE (count_exp) == CONST_INT && desired_align > align)
+ {
+ enum machine_mode mode = SImode;
+ if (TARGET_64BIT && (count & ~0xffffffff))
+ mode = DImode;
+ count_exp = force_reg (mode, count_exp);
+ }
+ gcc_assert (desired_align >= 1 && align >= 1);
+ /* Ensure that alignment prologue won't copy past end of block. */
+ if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
+ && !count)
+ {
+ int size = MAX (size_needed - 1, desired_align - align);
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld ()), cld_done = true;
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (size),
+ LEU, 0, GET_MODE (count_exp), 1, label);
+ if (expected_size == -1 || expected_size < size)
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ }
+ /* Emit code to decide at runtime whether a library call or inline code
+ should be used. */
+ if (dynamic_check != -1)
+ {
+ rtx hot_label = gen_label_rtx ();
+ jump_around_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+ LEU, 0, GET_MODE (count_exp), 1, hot_label);
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+ emit_block_move_via_libcall (dst, src, count_exp, false);
+ emit_jump (jump_around_label);
+ emit_label (hot_label);
+ }
- /* When optimizing for size emit simple rep ; movsb instruction for
- counts not divisible by 4. The movl $N, %ecx; rep; stosb
- sequence is 7 bytes long, so if optimizing for size and count is
- small enough that some stosl, stosw and stosb instructions without
- rep are shorter, fall back into the next if. */
+ /* Alignment prologue. */
+ if (desired_align > align)
+ {
+ /* Except for the first move in epilogue, we no longer know
+ constant offset in aliasing info. It doesn't seem worth
+ the pain to maintain it for the first move, so throw away
+ the info early. */
+ src = change_address (src, BLKmode, srcreg);
+ dst = change_address (dst, BLKmode, destreg);
+ if (TARGET_SINGLE_STRINGOP && !cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
+ desired_align);
+ }
+ if (label && size_needed == 1)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ label = NULL;
+ }
- if ((!optimize || optimize_size)
- && (count == 0
- || ((count & 0x03)
- && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
+ /* Main body. */
+ switch (alg)
{
- emit_insn (gen_cld ());
+ case libcall:
+ case no_stringop:
+ gcc_unreachable ();
+ case loop_1_byte:
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+ count_exp, QImode, 1, expected_size);
+ break;
+ case loop:
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+ count_exp, Pmode, 1, expected_size);
+ break;
+ case unrolled_loop:
+ /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
+ registers for 4 temporaries anyway. */
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+ count_exp, Pmode, TARGET_64BIT ? 4 : 2,
+ expected_size);
+ break;
+ case rep_prefix_8_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+ DImode);
+ break;
+ case rep_prefix_4_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+ SImode);
+ break;
+ case rep_prefix_1_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+ QImode);
+ break;
+ }
+ /* Properly adjust the offsets of src and dest memory for aliasing. */
+ if (GET_CODE (count_exp) == CONST_INT)
+ {
+ src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+ (count / size_needed) * size_needed);
+ dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+ (count / size_needed) * size_needed);
+ }
+ else
+ {
+ src = change_address (src, BLKmode, srcreg);
+ dst = change_address (dst, BLKmode, destreg);
+ }
- countreg = ix86_zero_extend_to_Pmode (count_exp);
- zeroreg = copy_to_mode_reg (QImode, const0_rtx);
- destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
- emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
+ /* Epilogue to copy the remaining bytes. */
+ if (label)
+ {
+ if (size_needed < desired_align - align)
+ {
+ tmp =
+ expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
+ GEN_INT (size_needed - 1), count_exp, 1,
+ OPTAB_DIRECT);
+ size_needed = desired_align - align + 1;
+ if (tmp != count_exp)
+ emit_move_insn (count_exp, tmp);
+ }
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
}
- else if (count != 0
- && (align >= 8
- || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
- || optimize_size || count < (unsigned int) 64))
+ if (count_exp != const0_rtx && size_needed > 1)
{
- int size = TARGET_64BIT && !optimize_size ? 8 : 4;
- unsigned HOST_WIDE_INT offset = 0;
+ if (TARGET_SINGLE_STRINGOP && !cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+ size_needed);
+ }
+ if (jump_around_label)
+ emit_label (jump_around_label);
+ return 1;
+}
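When decide_alg returns a dynamic_check threshold (the -minline-stringops-dynamically path), the emitted structure has roughly this C shape, with the inline arm being the prologue/main/epilogue sequence built above:

    if (count <= dynamic_check - 1)   /* predicted 90% likely */
      { /* inline prologue + main body + epilogue */ }
    else
      memcpy (dst, src, count);       /* emit_block_move_via_libcall */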
- emit_insn (gen_cld ());
+/* Helper function for memset. For a QImode value 0xXY produce
+ 0xXYXYXYXY of the width specified by MODE. This is essentially
+ a multiplication by 0x01010101, but we can do slightly better than
+ synth_mult by unwinding the sequence by hand on CPUs with a
+ slow multiply. */
+static rtx
+promote_duplicated_reg (enum machine_mode mode, rtx val)
+{
+ enum machine_mode valmode = GET_MODE (val);
+ rtx tmp;
+ int nops = mode == DImode ? 3 : 2;
- zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
- if (count & ~(size - 1))
- {
- unsigned HOST_WIDE_INT repcount;
- unsigned int max_nonrep;
+ gcc_assert (mode == SImode || mode == DImode);
+ if (val == const0_rtx)
+ return copy_to_mode_reg (mode, const0_rtx);
+ if (GET_CODE (val) == CONST_INT)
+ {
+ HOST_WIDE_INT v = INTVAL (val) & 255;
- repcount = count >> (size == 4 ? 2 : 3);
- if (!TARGET_64BIT)
- repcount &= 0x3fffffff;
+ v |= v << 8;
+ v |= v << 16;
+ if (mode == DImode)
+ v |= (v << 16) << 16;
+ return copy_to_mode_reg (mode, gen_int_mode (v, mode));
+ }
- /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
- movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
- bytes. In both cases the latter seems to be faster for small
- values of N. */
- max_nonrep = size == 4 ? 7 : 4;
- if (!optimize_size)
- switch (ix86_tune)
- {
- case PROCESSOR_PENTIUM4:
- case PROCESSOR_NOCONA:
- max_nonrep = 3;
- break;
- default:
- break;
- }
+ if (valmode == VOIDmode)
+ valmode = QImode;
+ if (valmode != QImode)
+ val = gen_lowpart (QImode, val);
+ if (mode == QImode)
+ return val;
+ if (!TARGET_PARTIAL_REG_STALL)
+ nops--;
+ if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
+ + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
+ <= (ix86_cost->shift_const + ix86_cost->add) * nops
+ + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
+ {
+ rtx reg = convert_modes (mode, QImode, val, true);
+ tmp = promote_duplicated_reg (mode, const1_rtx);
+ return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
+ OPTAB_DIRECT);
+ }
+ else
+ {
+ rtx reg = convert_modes (mode, QImode, val, true);
- if (repcount <= max_nonrep)
- while (repcount-- > 0)
- {
- rtx mem = adjust_automodify_address_nv (dst,
- GET_MODE (zeroreg),
- destreg, offset);
- emit_insn (gen_strset (destreg, mem, zeroreg));
- offset += size;
- }
- else
- {
- countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
- countreg = ix86_zero_extend_to_Pmode (countreg);
- destexp = gen_rtx_ASHIFT (Pmode, countreg,
- GEN_INT (size == 4 ? 2 : 3));
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
- emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
- destexp));
- offset = count & ~(size - 1);
- }
- }
- if (size == 8 && (count & 0x04))
- {
- rtx mem = adjust_automodify_address_nv (dst, SImode, destreg,
- offset);
- emit_insn (gen_strset (destreg, mem,
- gen_rtx_SUBREG (SImode, zeroreg, 0)));
- offset += 4;
- }
- if (count & 0x02)
- {
- rtx mem = adjust_automodify_address_nv (dst, HImode, destreg,
- offset);
- emit_insn (gen_strset (destreg, mem,
- gen_rtx_SUBREG (HImode, zeroreg, 0)));
- offset += 2;
- }
- if (count & 0x01)
+ if (!TARGET_PARTIAL_REG_STALL)
+ if (mode == SImode)
+ emit_insn (gen_movsi_insv_1 (reg, reg));
+ else
+ emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
+ else
{
- rtx mem = adjust_automodify_address_nv (dst, QImode, destreg,
- offset);
- emit_insn (gen_strset (destreg, mem,
- gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
+ NULL, 1, OPTAB_DIRECT);
+ reg =
+ expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
}
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
+ NULL, 1, OPTAB_DIRECT);
+ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+ if (mode == SImode)
+ return reg;
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
+ NULL, 1, OPTAB_DIRECT);
+ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+ return reg;
}
- else
- {
- rtx countreg2;
- rtx label = NULL;
- /* Compute desired alignment of the string operation. */
- int desired_alignment = (TARGET_PENTIUMPRO
- && (count == 0 || count >= (unsigned int) 260)
- ? 8 : UNITS_PER_WORD);
-
- /* In case we don't know anything about the alignment, default to
- library version, since it is usually equally fast and result in
- shorter code.
-
- Also emit call when we know that the count is large and call overhead
- will not be important. */
- if (!TARGET_INLINE_ALL_STRINGOPS
- && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL))
- return 0;
+}
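promote_duplicated_reg chooses between a single multiply and a shift-and-or
chain to broadcast a byte across a word.  The same computation in plain C,
for a 64-bit value (a minimal sketch; the function names are illustrative
and not part of this patch):

    #include <stdint.h>

    /* Broadcast the low byte of V across all eight bytes with one
       multiply, as the mult_init/mult_bit-costed path does.  */
    static uint64_t
    broadcast_mult (uint64_t v)
    {
      return (v & 0xff) * 0x0101010101010101ULL;
    }

    /* The same result via shifts and ORs, as the fallback path emits
       when multiplication is costed as too slow.  */
    static uint64_t
    broadcast_shift (uint64_t v)
    {
      v &= 0xff;
      v |= v << 8;   /* 0x00000000000000AB -> 0x000000000000ABAB */
      v |= v << 16;  /* -> 0x00000000ABABABAB */
      v |= v << 32;  /* -> 0xABABABABABABABAB */
      return v;
    }

Both functions return 0xABABABABABABABAB for v == 0xAB, matching the value
the expanded RTL leaves in the destination register.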
- if (TARGET_SINGLE_STRINGOP)
- emit_insn (gen_cld ());
+/* Expand string set operation (memset).  Use i386 string operations when
+   profitable.  ix86_expand_movmem contains similar code.  */
+int
+ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
+ rtx expected_align_exp, rtx expected_size_exp)
+{
+ rtx destreg;
+ rtx label = NULL;
+ rtx tmp;
+ rtx jump_around_label = NULL;
+ HOST_WIDE_INT align = 1;
+ unsigned HOST_WIDE_INT count = 0;
+ HOST_WIDE_INT expected_size = -1;
+ int size_needed = 0;
+ int desired_align = 0;
+ enum stringop_alg alg;
+  /* The precise placement of cld depends on whether the string operations
+     will be emitted in the prologue, the main copying body, or the epilogue.
+     This variable keeps track of whether cld has already been emitted.  */
+ bool cld_done = false;
+ rtx promoted_val = val_exp;
+ bool force_loopy_epilogue = false;
+ int dynamic_check;
- countreg2 = gen_reg_rtx (Pmode);
- countreg = copy_to_mode_reg (counter_mode, count_exp);
- zeroreg = copy_to_mode_reg (Pmode, const0_rtx);
- /* Get rid of MEM_OFFSET, it won't be accurate. */
- dst = change_address (dst, BLKmode, destreg);
+ if (GET_CODE (align_exp) == CONST_INT)
+ align = INTVAL (align_exp);
+  /* i386 can do misaligned access at a reasonably increased cost.  */
+ if (GET_CODE (expected_align_exp) == CONST_INT
+ && INTVAL (expected_align_exp) > align)
+ align = INTVAL (expected_align_exp);
+ if (GET_CODE (count_exp) == CONST_INT)
+ count = expected_size = INTVAL (count_exp);
+ if (GET_CODE (expected_size_exp) == CONST_INT && count == 0)
+ expected_size = INTVAL (expected_size_exp);
- if (count == 0 && align < desired_alignment)
- {
- label = gen_label_rtx ();
- emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1),
- LEU, 0, counter_mode, 1, label);
- }
- if (align <= 1)
- {
- rtx label = ix86_expand_aligntest (destreg, 1);
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (QImode, zeroreg, 0)));
- ix86_adjust_counter (countreg, 1);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (align <= 2)
- {
- rtx label = ix86_expand_aligntest (destreg, 2);
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (HImode, zeroreg, 0)));
- ix86_adjust_counter (countreg, 2);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (align <= 4 && desired_alignment > 4)
- {
- rtx label = ix86_expand_aligntest (destreg, 4);
- emit_insn (gen_strset (destreg, dst,
- (TARGET_64BIT
- ? gen_rtx_SUBREG (SImode, zeroreg, 0)
- : zeroreg)));
- ix86_adjust_counter (countreg, 4);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
+ alg = decide_alg (count, expected_size, true, &dynamic_check);
+ desired_align = decide_alignment (align, alg, expected_size);
- if (label && desired_alignment > 4 && !TARGET_64BIT)
- {
- emit_label (label);
- LABEL_NUSES (label) = 1;
- label = NULL_RTX;
- }
+ if (!TARGET_ALIGN_STRINGOPS)
+ align = desired_align;
- if (!TARGET_SINGLE_STRINGOP)
- emit_insn (gen_cld ());
- if (TARGET_64BIT)
- {
- emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
- GEN_INT (3)));
- destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3));
- }
+ if (alg == libcall)
+ return 0;
+ gcc_assert (alg != no_stringop);
+ if (!count)
+ count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+ destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+ switch (alg)
+ {
+ case libcall:
+ case no_stringop:
+ gcc_unreachable ();
+ case loop:
+ size_needed = GET_MODE_SIZE (Pmode);
+ break;
+ case unrolled_loop:
+ size_needed = GET_MODE_SIZE (Pmode) * 4;
+ break;
+ case rep_prefix_8_byte:
+ size_needed = 8;
+ break;
+ case rep_prefix_4_byte:
+ size_needed = 4;
+ break;
+ case rep_prefix_1_byte:
+ case loop_1_byte:
+ size_needed = 1;
+ break;
+ }
+  /* Alignment code needs count to be in a register.  */
+ if (GET_CODE (count_exp) == CONST_INT && desired_align > align)
+ {
+ enum machine_mode mode = SImode;
+ if (TARGET_64BIT && (count & ~0xffffffff))
+ mode = DImode;
+ count_exp = force_reg (mode, count_exp);
+ }
+  /* Ensure that the alignment prologue won't copy past the end of the block.  */
+ if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
+ && !count)
+ {
+ int size = MAX (size_needed - 1, desired_align - align);
+      /* To improve performance of small blocks, we jump around the promoting
+         code, so we need to use QImode accesses in the epilogue.  */
+ if (GET_CODE (val_exp) != CONST_INT && size_needed > 1)
+ force_loopy_epilogue = true;
+ else if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld ()), cld_done = true;
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (size),
+ LEU, 0, GET_MODE (count_exp), 1, label);
+ if (expected_size == -1 || expected_size <= size)
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
else
- {
- emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx));
- destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx);
- }
- destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
- emit_insn (gen_rep_stos (destreg, countreg2, dst, zeroreg, destexp));
-
- if (label)
- {
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ }
+ if (dynamic_check != -1)
+ {
+ rtx hot_label = gen_label_rtx ();
+ jump_around_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+ LEU, 0, GET_MODE (count_exp), 1, hot_label);
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+ set_storage_via_libcall (dst, count_exp, val_exp, false);
+ emit_jump (jump_around_label);
+ emit_label (hot_label);
+ }
+ if (TARGET_64BIT
+ && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+ promoted_val = promote_duplicated_reg (DImode, val_exp);
+ else if (size_needed > 2 || (desired_align > align && desired_align > 2))
+ promoted_val = promote_duplicated_reg (SImode, val_exp);
+ else if (size_needed > 1 || (desired_align > align && desired_align > 1))
+ promoted_val = promote_duplicated_reg (HImode, val_exp);
+ else
+ promoted_val = val_exp;
+ gcc_assert (desired_align >= 1 && align >= 1);
+ if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
+ && !count && !label)
+ {
+ int size = MAX (size_needed - 1, desired_align - align);
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld ()), cld_done = true;
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (size),
+ LEU, 0, GET_MODE (count_exp), 1, label);
+ if (expected_size == -1 || expected_size <= size)
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ }
+ if (desired_align > align)
+ {
+      /* Except for the first move in the epilogue, we no longer know the
+         constant offset in the aliasing info.  It does not seem worth the
+         pain to maintain it for the first move, so throw away the info
+         early.  */
+ dst = change_address (dst, BLKmode, destreg);
+ if (TARGET_SINGLE_STRINGOP && !cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
+ desired_align);
+ }
+ if (label && size_needed == 1)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ label = NULL;
+ }
+ switch (alg)
+ {
+ case libcall:
+ case no_stringop:
+ gcc_unreachable ();
+ case loop_1_byte:
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+ count_exp, QImode, 1, expected_size);
+ break;
+ case loop:
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+ count_exp, Pmode, 1, expected_size);
+ break;
+ case unrolled_loop:
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+ count_exp, Pmode, 4, expected_size);
+ break;
+ case rep_prefix_8_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+ DImode);
+ break;
+ case rep_prefix_4_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+ SImode);
+ break;
+ case rep_prefix_1_byte:
+ if (!cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+ QImode);
+ break;
+ }
+  /* Properly adjust the offset of the destination memory for aliasing.  */
+ if (GET_CODE (count_exp) == CONST_INT)
+ dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+ (count / size_needed) * size_needed);
+ else
+ dst = change_address (dst, BLKmode, destreg);
- if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (SImode, zeroreg, 0)));
- if (TARGET_64BIT && (align <= 4 || count == 0))
+ if (label)
+ {
+ if (size_needed < desired_align - align)
{
- rtx label = ix86_expand_aligntest (countreg, 4);
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (SImode, zeroreg, 0)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ tmp =
+ expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
+ GEN_INT (size_needed - 1), count_exp, 1,
+ OPTAB_DIRECT);
+ size_needed = desired_align - align + 1;
+ if (tmp != count_exp)
+ emit_move_insn (count_exp, tmp);
}
- if (align > 2 && count != 0 && (count & 2))
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (HImode, zeroreg, 0)));
- if (align <= 2 || count == 0)
- {
- rtx label = ix86_expand_aligntest (countreg, 2);
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (HImode, zeroreg, 0)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (align > 1 && count != 0 && (count & 1))
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (QImode, zeroreg, 0)));
- if (align <= 1 || count == 0)
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (count_exp != const0_rtx && size_needed > 1)
+ {
+ if (force_loopy_epilogue)
+ expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+ size_needed);
+ else
{
- rtx label = ix86_expand_aligntest (countreg, 1);
- emit_insn (gen_strset (destreg, dst,
- gen_rtx_SUBREG (QImode, zeroreg, 0)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ if (TARGET_SINGLE_STRINGOP && !cld_done)
+ emit_insn (gen_cld ()), cld_done = true;
+ expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
+ size_needed);
}
}
+ if (jump_around_label)
+ emit_label (jump_around_label);
return 1;
}
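The control flow ix86_expand_setmem emits (optional dynamic library-call
check, alignment prologue, main body, epilogue) can be summarized in plain C.
This is a sketch under illustrative assumptions only: a fixed 4-byte word and
an arbitrary 4096-byte threshold, where the real expander chooses both via
decide_alg and decide_alignment:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    setmem_shape (unsigned char *dst, unsigned char val, size_t count)
    {
      const size_t word = 4;            /* size_needed for rep_prefix_4_byte */

      if (count > 4096)                 /* dynamic_check threshold */
        {
          memset (dst, val, count);     /* set_storage_via_libcall */
          return;                       /* jump_around_label */
        }
      /* Alignment prologue (expand_setmem_prologue); in the real expander,
         small blocks jump straight to the epilogue instead.  */
      while (count && ((uintptr_t) dst & (word - 1)))
        {
          *dst++ = val;
          count--;
        }
      /* Main body: word-wide stores of the promoted value; the real code
         picks rep stos or a (possibly unrolled) loop.  */
      uint32_t wide = val * 0x01010101u;   /* promote_duplicated_reg */
      for (; count >= word; count -= word, dst += word)
        memcpy (dst, &wide, word);
      /* Epilogue (expand_setmem_epilogue): the trailing count % word bytes.  */
      while (count--)
        *dst++ = val;
    }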
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 039278c..52118f9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -19,6 +19,38 @@ along with GCC; see the file COPYING. If not, write to
the Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA. */
+/* Algorithm to use for expanding string functions.  */
+enum stringop_alg
+{
+ no_stringop,
+ libcall,
+ rep_prefix_1_byte,
+ rep_prefix_4_byte,
+ rep_prefix_8_byte,
+ loop_1_byte,
+ loop,
+ unrolled_loop
+};
+#define MAX_STRINGOP_ALGS 4
+/* Specify what algorithm to use for stringops of known size.
+   When the size is unknown, the UNKNOWN_SIZE alg is used.  When the size
+   is known at compile time or estimated via feedback, the SIZE array
+   is walked in order until MAX is greater than the estimate (where -1
+   means infinity).  The corresponding ALG is then used.
+   For example, the initializer:
+    {{256, loop}, {-1, rep_prefix_4_byte}}
+   will use loop for blocks smaller than or equal to 256 bytes, and the
+   rep prefix will be used otherwise.
+*/
+struct stringop_algs
+{
+ const enum stringop_alg unknown_size;
+ const struct stringop_strategy {
+ const int max;
+ const enum stringop_alg alg;
+  } size [MAX_STRINGOP_ALGS];
+};
+
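As a concrete illustration of the descriptor just defined (a hypothetical
tuning entry, not one taken from this patch):

    /* Unknown sizes fall back to a simple loop; known sizes up to 256
       bytes use an unrolled loop, anything larger uses rep stosl.  */
    static const struct stringop_algs example_memset =
      {loop, {{256, unrolled_loop}, {-1, rep_prefix_4_byte}}};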
/* The purpose of this file is to define the characteristics of the i386,
independent of assembler syntax or operating system.
@@ -84,6 +116,9 @@ struct processor_costs {
const int fabs; /* cost of FABS instruction. */
const int fchs; /* cost of FCHS instruction. */
const int fsqrt; /* cost of FSQRT instruction. */
+  /* Descriptors controlling how memcpy and memset
+     are expanded (see struct stringop_algs).  */
+ struct stringop_algs memcpy[2], memset[2];
};
extern const struct processor_costs *ix86_cost;
@@ -217,7 +252,6 @@ extern int x86_prefetch_sse;
#define TARGET_PREFETCH_SSE (x86_prefetch_sse)
#define TARGET_SHIFT1 (x86_shift1 & TUNEMASK)
#define TARGET_USE_FFREEP (x86_use_ffreep & TUNEMASK)
-#define TARGET_REP_MOVL_OPTIMAL (x86_rep_movl_optimal & TUNEMASK)
#define TARGET_INTER_UNIT_MOVES (x86_inter_unit_moves & TUNEMASK)
#define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & TUNEMASK)
#define TARGET_SCHEDULE (x86_schedule & TUNEMASK)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5265d83..f237abe 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1805,6 +1805,16 @@
[(set_attr "type" "imov")
(set_attr "mode" "QI")])
+(define_insn "*movsi_insv_1_rex64"
+ [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "+Q")
+ (const_int 8)
+ (const_int 8))
+ (match_operand:SI 1 "nonmemory_operand" "Qn"))]
+ "TARGET_64BIT"
+ "mov{b}\t{%b1, %h0|%h0, %b1}"
+ [(set_attr "type" "imov")
+ (set_attr "mode" "QI")])
+
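The new pattern stores the low byte of operand 1 into bits 8..15 of
operand 0 (movb %b1, %h0 in AT&T syntax), which lets promote_duplicated_reg
double up a byte without a shift-and-or pair.  The equivalent bit
manipulation in C (illustrative only):

    uint32_t reg = 0xAB;
    reg = (reg & ~0xff00u) | ((reg & 0xffu) << 8);   /* reg == 0xABAB */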
(define_insn "movdi_insv_1_rex64"
[(set (zero_extract:DI (match_operand 0 "ext_register_operand" "+Q")
(const_int 8)
@@ -18160,9 +18170,10 @@
(use (match_operand:BLK 1 "memory_operand" ""))
(use (match_operand:SI 2 "nonmemory_operand" ""))
(use (match_operand:SI 3 "const_int_operand" ""))]
- "! optimize_size || TARGET_INLINE_ALL_STRINGOPS"
+ ""
{
- if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3]))
+ if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3],
+ operands[3], constm1_rtx))
DONE;
else
FAIL;
@@ -18175,7 +18186,8 @@
(use (match_operand:DI 3 "const_int_operand" ""))]
"TARGET_64BIT"
{
- if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3]))
+ if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3],
+ operands[3], constm1_rtx))
DONE;
else
FAIL;
@@ -18450,11 +18462,9 @@
(use (match_operand 3 "const_int_operand" ""))]
""
{
- /* If value to set is not zero, use the library routine. */
- if (operands[2] != const0_rtx)
- FAIL;
-
- if (ix86_expand_clrmem (operands[0], operands[1], operands[3]))
+ if (ix86_expand_setmem (operands[0], operands[1],
+ operands[2], operands[3],
+ operands[3], constm1_rtx))
DONE;
else
FAIL;
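The practical effect of this expander change: a call with a nonzero fill
value, which previously always fell through to the library routine, is now
eligible for inline expansion.  Illustrative user code:

    char buf[64];
    memset (buf, 0xAB, sizeof buf);   /* may now be expanded inline */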
@@ -18464,14 +18474,14 @@
[(use (match_operand:BLK 0 "memory_operand" ""))
(use (match_operand:DI 1 "nonmemory_operand" ""))
(use (match_operand 2 "const_int_operand" ""))
- (use (match_operand 3 "const_int_operand" ""))]
+ (use (match_operand 3 "const_int_operand" ""))
+ (use (match_operand 4 "const_int_operand" ""))
+ (use (match_operand 5 "const_int_operand" ""))]
"TARGET_64BIT"
{
- /* If value to set is not zero, use the library routine. */
- if (operands[2] != const0_rtx)
- FAIL;
-
- if (ix86_expand_clrmem (operands[0], operands[1], operands[3]))
+ if (ix86_expand_setmem (operands[0], operands[1],
+ operands[2], operands[3],
+ operands[3], constm1_rtx))
DONE;
else
FAIL;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 36e0944..aba2ee7 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -125,6 +125,10 @@ minline-all-stringops
Target Report Mask(INLINE_ALL_STRINGOPS)
Inline all known string operations
+minline-stringops-dynamically
+Target Report Mask(INLINE_STRINGOPS_DYNAMICALLY)
+Inline memset/memcpy string operations, but perform the inline version only for small blocks
+
mintel-syntax
Target Undocumented
;; Deprecated
@@ -221,6 +225,10 @@ mstack-arg-probe
Target Report Mask(STACK_PROBE)
Enable stack probing
+mstringop-strategy=
+Target RejectNegative Joined Var(ix86_stringop_string)
+Choose the strategy to use when generating string operations
+
mtls-dialect=
Target RejectNegative Joined Var(ix86_tls_dialect_string)
Use given thread-local storage dialect
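The two new options can be exercised together, e.g.:

    gcc -O2 -minline-stringops-dynamically foo.c
    gcc -O2 -mstringop-strategy=rep_4byte foo.c

The strategy names mirror enum stringop_alg, but the exact spellings accepted
are parsed in override_options, which lies outside the hunks shown here, so
treat rep_4byte as an assumed example.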