aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2016-03-06 06:38:21 -0800
committerH.J. Lu <hjl.tools@gmail.com>2021-07-30 10:34:19 -0700
commite5e164effa30fd2b5c5bc3e6883d63889e96d8da (patch)
tree2bc5b9da149ccd05f459cebc9086397430a4cccb /gcc
parentd68d275a00573be49f5e83eba52ce3f26d11db9e (diff)
downloadgcc-e5e164effa30fd2b5c5bc3e6883d63889e96d8da.zip
gcc-e5e164effa30fd2b5c5bc3e6883d63889e96d8da.tar.gz
gcc-e5e164effa30fd2b5c5bc3e6883d63889e96d8da.tar.bz2
Add QI vector mode support to by-pieces for memset
1. Replace scalar_int_mode with fixed_size_mode in the by-pieces infrastructure to allow non-integer mode. 2. Rename widest_int_mode_for_size to widest_fixed_size_mode_for_size to return QI vector mode for memset. 3. Add op_by_pieces_d::smallest_fixed_size_mode_for_size to return the smallest integer or QI vector mode. 4. Remove clear_by_pieces_1 and use builtin_memset_read_str in clear_by_pieces to support vector mode broadcast. 5. Add lowpart_subreg_regno, a wrapper around simplify_subreg_regno that uses subreg_lowpart_offset (mode, prev_mode) as the offset. 6. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard scratch register to avoid stack realignment when expanding memset. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. (builtin_strncpy_read_str): Likewise. (gen_memset_value_from_prev): New function. (builtin_memset_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. Use gen_memset_value_from_prev and support CONST_VECTOR. (builtin_memset_gen_str): Likewise. (try_store_by_multiple_pieces): Use by_pieces_constfn to declare constfun. * builtins.h (builtin_strncpy_read_str): Replace scalar_int_mode with fixed_size_mode. (builtin_memset_read_str): Likewise. * expr.c (widest_int_mode_for_size): Renamed to ... (widest_fixed_size_mode_for_size): Add a bool argument to indicate if QI vector mode can be used. (by_pieces_ninsns): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (pieces_addr::adjust): Change the mode argument from scalar_int_mode to fixed_size_mode. (op_by_pieces_d): Make m_len read-only. Add a bool member, m_qi_vector_mode, to indicate that QI vector mode can be used. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_qi_vector_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::get_usable_mode): Change the mode argument from scalar_int_mode to fixed_size_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::smallest_fixed_size_mode_for_size): New member function to return the smallest integer or QI vector mode. (op_by_pieces_d::run): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Call smallest_fixed_size_mode_for_size instead of smallest_int_mode_for_size. (store_by_pieces_d::store_by_pieces_d): Add a bool argument to indicate that QI vector mode can be used and pass it to op_by_pieces_d::op_by_pieces_d. (can_store_by_pieces): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Pass memsetp to widest_fixed_size_mode_for_size to support QI vector mode. Allow all CONST_VECTORs for memset if vec_duplicate is supported. (store_by_pieces): Pass memsetp to store_by_pieces_d::store_by_pieces_d. (clear_by_pieces_1): Removed. (clear_by_pieces): Replace clear_by_pieces_1 with builtin_memset_read_str and pass true to store_by_pieces_d to support vector mode broadcast. (string_cst_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. * expr.h (by_pieces_constfn): Change scalar_int_mode to fixed_size_mode. (by_pieces_prev): Likewise. * rtl.h (lowpart_subreg_regno): New. * rtlanal.c (lowpart_subreg_regno): New. A wrapper around simplify_subreg_regno. * target.def (gen_memset_scratch_rtx): New hook. * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. * doc/tm.texi: Regenerated. gcc/testsuite/ * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of vmovdqu. * gcc.target/i386/pr100865-4b.c: Likewise.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/builtins.c171
-rw-r--r--gcc/builtins.h4
-rw-r--r--gcc/doc/tm.texi7
-rw-r--r--gcc/doc/tm.texi.in2
-rw-r--r--gcc/expr.c172
-rw-r--r--gcc/expr.h4
-rw-r--r--gcc/rtl.h2
-rw-r--r--gcc/rtlanal.c11
-rw-r--r--gcc/target.def9
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-3.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr100865-4b.c2
11 files changed, 297 insertions, 89 deletions
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 845a8bb..2387b5d 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3119,13 +3119,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
static rtx
builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ fixed_size_mode mode)
{
/* The REPresentation pointed to by DATA need not be a nul-terminated
string but the caller guarantees it's large enough for MODE. */
const char *rep = (const char *) data;
- return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
+ /* The by-pieces infrastructure does not try to pick a vector mode
+ for memcpy expansion. */
+ return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
+ /*nul_terminated=*/false);
}
/* LEN specify length of the block of memcpy/memset operation.
@@ -3742,14 +3745,16 @@ expand_builtin_stpncpy (tree exp, rtx)
rtx
builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ fixed_size_mode mode)
{
const char *str = (const char *) data;
if ((unsigned HOST_WIDE_INT) offset > strlen (str))
return const0_rtx;
- return c_readstr (str + offset, mode);
+ /* The by-pieces infrastructure does not try to pick a vector mode
+ for strncpy expansion. */
+ return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
}
/* Helper to check the sizes of sequences and the destination of calls
@@ -3950,30 +3955,122 @@ expand_builtin_strncpy (tree exp, rtx target)
return NULL_RTX;
}
-/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
- bytes from constant string DATA + OFFSET and return it as target
- constant. If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
previous iteration. */
-rtx
-builtin_memset_read_str (void *data, void *prevp,
- HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
- scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
{
- by_pieces_prev *prev = (by_pieces_prev *) prevp;
+ rtx target = nullptr;
if (prev != nullptr && prev->data != nullptr)
{
/* Use the previous data in the same mode. */
if (prev->mode == mode)
return prev->data;
+
+ fixed_size_mode prev_mode = prev->mode;
+
+ /* Don't use the previous data to write QImode if it is in a
+ vector mode. */
+ if (VECTOR_MODE_P (prev_mode) && mode == QImode)
+ return target;
+
+ rtx prev_rtx = prev->data;
+
+ if (REG_P (prev_rtx)
+ && HARD_REGISTER_P (prev_rtx)
+ && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
+ {
+ /* This case occurs when PREV_MODE is a vector and when
+ MODE is too small to store using vector operations.
+ After register allocation, the code will need to move the
+ lowpart of the vector register into a non-vector register.
+
+ Also, the target has chosen to use a hard register
+ instead of going with the default choice of using a
+ pseudo register. We should respect that choice and try to
+ avoid creating a pseudo register with the same mode as the
+ current hard register.
+
+ In principle, we could just use a lowpart MODE subreg of
+ the vector register. However, the vector register mode might
+ be too wide for non-vector registers, and we already know
+ that the non-vector mode is too small for vector registers.
+ It's therefore likely that we'd need to spill to memory in
+ the vector mode and reload the non-vector value from there.
+
+ Try to avoid that by reducing the vector register to the
+ smallest size that it can hold. This should increase the
+ chances that non-vector registers can hold both the inner
+ and outer modes of the subreg that we generate later. */
+ machine_mode m;
+ fixed_size_mode candidate;
+ FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
+ if (is_a<fixed_size_mode> (m, &candidate))
+ {
+ if (GET_MODE_SIZE (candidate)
+ >= GET_MODE_SIZE (prev_mode))
+ break;
+ if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
+ && lowpart_subreg_regno (REGNO (prev_rtx),
+ prev_mode, candidate) >= 0)
+ {
+ target = lowpart_subreg (candidate, prev_rtx,
+ prev_mode);
+ prev_rtx = target;
+ prev_mode = candidate;
+ break;
+ }
+ }
+ if (target == nullptr)
+ prev_rtx = copy_to_reg (prev_rtx);
+ }
+
+ target = lowpart_subreg (mode, prev_rtx, prev_mode);
}
+ return target;
+}
+/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
+ bytes from constant string DATA + OFFSET and return it as target
+ constant. If PREV isn't nullptr, it has the RTL info from the
+ previous iteration. */
+
+rtx
+builtin_memset_read_str (void *data, void *prev,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+ fixed_size_mode mode)
+{
const char *c = (const char *) data;
- char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+ unsigned int size = GET_MODE_SIZE (mode);
+
+ rtx target = gen_memset_value_from_prev ((by_pieces_prev *) prev,
+ mode);
+ if (target != nullptr)
+ return target;
+ rtx src = gen_int_mode (*c, QImode);
- memset (p, *c, GET_MODE_SIZE (mode));
+ if (VECTOR_MODE_P (mode))
+ {
+ gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+ rtx const_vec = gen_const_vec_duplicate (mode, src);
+ if (prev == NULL)
+ /* Return CONST_VECTOR when called by a query function. */
+ return const_vec;
+
+ /* Use the move expander with CONST_VECTOR. */
+ target = targetm.gen_memset_scratch_rtx (mode);
+ emit_move_insn (target, const_vec);
+ return target;
+ }
+
+ char *p = XALLOCAVEC (char, size);
+
+ memset (p, *c, size);
- return c_readstr (p, mode);
+ /* Vector modes should be handled above. */
+ return c_readstr (p, as_a <scalar_int_mode> (mode));
}
/* Callback routine for store_by_pieces. Return the RTL of a register
@@ -3983,33 +4080,45 @@ builtin_memset_read_str (void *data, void *prevp,
nullptr, it has the RTL info from the previous iteration. */
static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
- scalar_int_mode mode)
+ fixed_size_mode mode)
{
rtx target, coeff;
size_t size;
char *p;
- by_pieces_prev *prev = (by_pieces_prev *) prevp;
- if (prev != nullptr && prev->data != nullptr)
- {
- /* Use the previous data in the same mode. */
- if (prev->mode == mode)
- return prev->data;
-
- target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
- if (target != nullptr)
- return target;
- }
-
size = GET_MODE_SIZE (mode);
if (size == 1)
return (rtx) data;
+ target = gen_memset_value_from_prev ((by_pieces_prev *) prev, mode);
+ if (target != nullptr)
+ return target;
+
+ if (VECTOR_MODE_P (mode))
+ {
+ gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+ /* vec_duplicate_optab is a precondition to pick a vector mode for
+ the memset expander. */
+ insn_code icode = optab_handler (vec_duplicate_optab, mode);
+
+ target = targetm.gen_memset_scratch_rtx (mode);
+ class expand_operand ops[2];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], (rtx) data, QImode);
+ expand_insn (icode, 2, ops);
+ if (!rtx_equal_p (target, ops[0].value))
+ emit_move_insn (target, ops[0].value);
+
+ return target;
+ }
+
p = XALLOCAVEC (char, size);
memset (p, 1, size);
- coeff = c_readstr (p, mode);
+ /* Vector modes should be handled above. */
+ coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
target = convert_to_mode (mode, (rtx) data, 1);
target = expand_mult (mode, target, coeff, NULL_RTX, 1);
@@ -4113,7 +4222,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
&valc, align, true))
return false;
- rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
+ by_pieces_constfn constfun;
void *constfundata;
if (val)
{
diff --git a/gcc/builtins.h b/gcc/builtins.h
index b580635..024ddbf 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);
extern tree mathfn_built_in (tree, combined_fn);
extern tree mathfn_built_in_type (combined_fn);
extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ fixed_size_mode);
extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ fixed_size_mode);
extern rtx expand_builtin_saveregs (void);
extern tree std_build_builtin_va_list (void);
extern tree std_fn_abi_va_list (tree);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a464d26..cb01528 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12158,6 +12158,13 @@ This function prepares to emit a conditional comparison within a sequence
@var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for a scratch register in @var{mode} to
+be used when expanding memset calls. The backend can use a hard scratch
+register to avoid stack realignment when expanding memset. The default
+is @code{gen_reg_rtx}.
+@end deftypefn
+
@deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
This target hook returns a new value for the number of times @var{loop}
should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 0b60342..4a522ae 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7986,6 +7986,8 @@ lists.
@hook TARGET_GEN_CCMP_NEXT
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
@hook TARGET_LOOP_UNROLL_ADJUST
@defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 6a43681..b65cfcf 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
return align;
}
-/* Return the widest integer mode that is narrower than SIZE bytes. */
+/* Return the widest QI vector, if QI_MODE is true, or integer mode
+ that is narrower than SIZE bytes. */
-static scalar_int_mode
-widest_int_mode_for_size (unsigned int size)
+static fixed_size_mode
+widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector)
{
- scalar_int_mode result = NARROWEST_INT_MODE;
+ fixed_size_mode result = NARROWEST_INT_MODE;
gcc_checking_assert (size > 1);
+ /* Use QI vector only if size is wider than a WORD. */
+ if (qi_vector && size > UNITS_PER_WORD)
+ {
+ machine_mode mode;
+ fixed_size_mode candidate;
+ FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+ if (is_a<fixed_size_mode> (mode, &candidate)
+ && GET_MODE_INNER (candidate) == QImode)
+ {
+ if (GET_MODE_SIZE (candidate) >= size)
+ break;
+ if (optab_handler (vec_duplicate_optab, candidate)
+ != CODE_FOR_nothing)
+ result = candidate;
+ }
+
+ if (result != NARROWEST_INT_MODE)
+ return result;
+ }
+
opt_scalar_int_mode tmode;
FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
if (GET_MODE_SIZE (tmode.require ()) < size)
@@ -815,13 +836,14 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
unsigned int max_size, by_pieces_operation op)
{
unsigned HOST_WIDE_INT n_insns = 0;
- scalar_int_mode mode;
+ fixed_size_mode mode;
if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
{
/* NB: Round up L and ALIGN to the widest integer mode for
MAX_SIZE. */
- mode = widest_int_mode_for_size (max_size);
+ mode = widest_fixed_size_mode_for_size (max_size,
+ op == SET_BY_PIECES);
if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
{
unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
@@ -835,7 +857,8 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
while (max_size > 1 && l > 0)
{
- mode = widest_int_mode_for_size (max_size);
+ mode = widest_fixed_size_mode_for_size (max_size,
+ op == SET_BY_PIECES);
enum insn_code icode;
unsigned int modesize = GET_MODE_SIZE (mode);
@@ -903,8 +926,7 @@ class pieces_addr
void *m_cfndata;
public:
pieces_addr (rtx, bool, by_pieces_constfn, void *);
- rtx adjust (scalar_int_mode, HOST_WIDE_INT,
- by_pieces_prev * = nullptr);
+ rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
void increment_address (HOST_WIDE_INT);
void maybe_predec (HOST_WIDE_INT);
void maybe_postinc (HOST_WIDE_INT);
@@ -1006,7 +1028,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
but we still modify the MEM's properties. */
rtx
-pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
+pieces_addr::adjust (fixed_size_mode mode, HOST_WIDE_INT offset,
by_pieces_prev *prev)
{
if (m_constfn)
@@ -1060,11 +1082,14 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
class op_by_pieces_d
{
private:
- scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
+ fixed_size_mode get_usable_mode (fixed_size_mode, unsigned int);
+ fixed_size_mode smallest_fixed_size_mode_for_size (unsigned int);
protected:
pieces_addr m_to, m_from;
- unsigned HOST_WIDE_INT m_len;
+ /* Make m_len read-only so that smallest_fixed_size_mode_for_size can
+ use it to check the valid mode size. */
+ const unsigned HOST_WIDE_INT m_len;
HOST_WIDE_INT m_offset;
unsigned int m_align;
unsigned int m_max_size;
@@ -1073,6 +1098,8 @@ class op_by_pieces_d
bool m_push;
/* True if targetm.overlap_op_by_pieces_p () returns true. */
bool m_overlap_op_by_pieces;
+ /* True if QI vector mode can be used. */
+ bool m_qi_vector_mode;
/* Virtual functions, overriden by derived classes for the specific
operation. */
@@ -1084,7 +1111,8 @@ class op_by_pieces_d
public:
op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
- unsigned HOST_WIDE_INT, unsigned int, bool);
+ unsigned HOST_WIDE_INT, unsigned int, bool,
+ bool = false);
void run ();
};
@@ -1099,11 +1127,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
by_pieces_constfn from_cfn,
void *from_cfn_data,
unsigned HOST_WIDE_INT len,
- unsigned int align, bool push)
+ unsigned int align, bool push,
+ bool qi_vector_mode)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
- m_push (push)
+ m_push (push), m_qi_vector_mode (qi_vector_mode)
{
int toi = m_to.get_addr_inc ();
int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1153,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
{
/* Find the mode of the largest comparison. */
- scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+ fixed_size_mode mode
+ = widest_fixed_size_mode_for_size (m_max_size,
+ m_qi_vector_mode);
m_from.decide_autoinc (mode, m_reverse, len);
m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,8 +1170,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
/* This function returns the largest usable integer mode for LEN bytes
whose size is no bigger than size of MODE. */
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+fixed_size_mode
+op_by_pieces_d::get_usable_mode (fixed_size_mode mode, unsigned int len)
{
unsigned int size;
do
@@ -1148,13 +1179,42 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
size = GET_MODE_SIZE (mode);
if (len >= size && prepare_mode (mode, m_align))
break;
- /* NB: widest_int_mode_for_size checks SIZE > 1. */
- mode = widest_int_mode_for_size (size);
+ /* widest_fixed_size_mode_for_size checks SIZE > 1. */
+ mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
}
while (1);
return mode;
}
+/* Return the smallest integer or QI vector mode that is not narrower
+ than SIZE bytes. */
+
+fixed_size_mode
+op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
+{
+ /* Use QI vector only for > size of WORD. */
+ if (m_qi_vector_mode && size > UNITS_PER_WORD)
+ {
+ machine_mode mode;
+ fixed_size_mode candidate;
+ FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+ if (is_a<fixed_size_mode> (mode, &candidate)
+ && GET_MODE_INNER (candidate) == QImode)
+ {
+ /* Don't return a mode wider than M_LEN. */
+ if (GET_MODE_SIZE (candidate) > m_len)
+ break;
+
+ if (GET_MODE_SIZE (candidate) >= size
+ && (optab_handler (vec_duplicate_optab, candidate)
+ != CODE_FOR_nothing))
+ return candidate;
+ }
+ }
+
+ return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
/* This function contains the main loop used for expanding a block
operation. First move what we can in the largest integer mode,
then go to successively smaller modes. For every access, call
@@ -1166,9 +1226,12 @@ op_by_pieces_d::run ()
if (m_len == 0)
return;
- /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */
- scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
- mode = get_usable_mode (mode, m_len);
+ unsigned HOST_WIDE_INT length = m_len;
+
+ /* widest_fixed_size_mode_for_size checks M_MAX_SIZE > 1. */
+ fixed_size_mode mode
+ = widest_fixed_size_mode_for_size (m_max_size, m_qi_vector_mode);
+ mode = get_usable_mode (mode, length);
by_pieces_prev to_prev = { nullptr, mode };
by_pieces_prev from_prev = { nullptr, mode };
@@ -1178,7 +1241,7 @@ op_by_pieces_d::run ()
unsigned int size = GET_MODE_SIZE (mode);
rtx to1 = NULL_RTX, from1;
- while (m_len >= size)
+ while (length >= size)
{
if (m_reverse)
m_offset -= size;
@@ -1201,22 +1264,22 @@ op_by_pieces_d::run ()
if (!m_reverse)
m_offset += size;
- m_len -= size;
+ length -= size;
}
finish_mode (mode);
- if (m_len == 0)
+ if (length == 0)
return;
if (!m_push && m_overlap_op_by_pieces)
{
/* NB: Generate overlapping operations if it is not a stack
push since stack push must not overlap. Get the smallest
- integer mode for M_LEN bytes. */
- mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
+ fixed size mode for M_LEN bytes. */
+ mode = smallest_fixed_size_mode_for_size (length);
mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
- int gap = GET_MODE_SIZE (mode) - m_len;
+ int gap = GET_MODE_SIZE (mode) - length;
if (gap > 0)
{
/* If size of MODE > M_LEN, generate the last operation
@@ -1226,20 +1289,21 @@ op_by_pieces_d::run ()
m_offset += gap;
else
m_offset -= gap;
- m_len += gap;
+ length += gap;
}
}
else
{
- /* NB: widest_int_mode_for_size checks SIZE > 1. */
- mode = widest_int_mode_for_size (size);
- mode = get_usable_mode (mode, m_len);
+ /* widest_fixed_size_mode_for_size checks SIZE > 1. */
+ mode = widest_fixed_size_mode_for_size (size,
+ m_qi_vector_mode);
+ mode = get_usable_mode (mode, length);
}
}
while (1);
/* The code above should have handled everything. */
- gcc_assert (!m_len);
+ gcc_assert (!length);
}
/* Derived class from op_by_pieces_d, providing support for block move
@@ -1355,9 +1419,10 @@ class store_by_pieces_d : public op_by_pieces_d
public:
store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
- unsigned HOST_WIDE_INT len, unsigned int align)
+ unsigned HOST_WIDE_INT len, unsigned int align,
+ bool qi_vector_mode)
: op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
- align, false)
+ align, false, qi_vector_mode)
{
}
rtx finish_retmode (memop_ret);
@@ -1446,7 +1511,8 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
max_size = STORE_MAX_PIECES + 1;
while (max_size > 1 && l > 0)
{
- scalar_int_mode mode = widest_int_mode_for_size (max_size);
+ fixed_size_mode mode
+ = widest_fixed_size_mode_for_size (max_size, memsetp);
icode = optab_handler (mov_optab, mode);
if (icode != CODE_FOR_nothing
@@ -1460,7 +1526,11 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
offset -= size;
cst = (*constfun) (constfundata, nullptr, offset, mode);
- if (!targetm.legitimate_constant_p (mode, cst))
+ /* All CONST_VECTORs can be loaded for memset since
+ vec_duplicate_optab is a precondition to pick a
+ vector mode for the memset expander. */
+ if (!((memsetp && VECTOR_MODE_P (mode))
+ || targetm.legitimate_constant_p (mode, cst)))
return 0;
if (!reverse)
@@ -1504,7 +1574,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
optimize_insn_for_speed_p ()));
- store_by_pieces_d data (to, constfun, constfundata, len, align);
+ store_by_pieces_d data (to, constfun, constfundata, len, align,
+ memsetp);
data.run ();
if (retmode != RETURN_BEGIN)
@@ -1513,15 +1584,6 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
return to;
}
-/* Callback routine for clear_by_pieces.
- Return const0_rtx unconditionally. */
-
-static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
-{
- return const0_rtx;
-}
-
/* Generate several move instructions to clear LEN bytes of block TO. (A MEM
rtx with BLKmode). ALIGN is maximum alignment we can assume. */
@@ -1531,7 +1593,10 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
if (len == 0)
return;
- store_by_pieces_d data (to, clear_by_pieces_1, NULL, len, align);
+ /* Use builtin_memset_read_str to support vector mode broadcast. */
+ char c = 0;
+ store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align,
+ true);
data.run ();
}
@@ -5754,7 +5819,7 @@ emit_storent_insn (rtx to, rtx from)
static rtx
string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ fixed_size_mode mode)
{
tree str = (tree) data;
@@ -5769,10 +5834,13 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
size_t l = TREE_STRING_LENGTH (str) - offset;
memcpy (p, TREE_STRING_POINTER (str) + offset, l);
memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
- return c_readstr (p, mode, false);
+ return c_readstr (p, as_a <scalar_int_mode> (mode), false);
}
- return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+ /* The by-pieces infrastructure does not try to pick a vector mode
+ for storing STRING_CST. */
+ return c_readstr (TREE_STRING_POINTER (str) + offset,
+ as_a <scalar_int_mode> (mode), false);
}
/* Generate code for computing expression EXP,
diff --git a/gcc/expr.h b/gcc/expr.h
index a4f4426..94a85b4 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -108,13 +108,13 @@ enum block_op_methods
};
typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ fixed_size_mode);
/* The second pointer passed to by_pieces_constfn. */
struct by_pieces_prev
{
rtx data;
- scalar_int_mode mode;
+ fixed_size_mode mode;
};
extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 5ffe900..5fdcdfc 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -2460,6 +2460,8 @@ extern bool subreg_offset_representable_p (unsigned int, machine_mode,
extern unsigned int subreg_regno (const_rtx);
extern int simplify_subreg_regno (unsigned int, machine_mode,
poly_uint64, machine_mode);
+extern int lowpart_subreg_regno (unsigned int, machine_mode,
+ machine_mode);
extern unsigned int subreg_nregs (const_rtx);
extern unsigned int subreg_nregs_with_regno (unsigned int, const_rtx);
extern unsigned HOST_WIDE_INT nonzero_bits (const_rtx, machine_mode);
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index ec7a062..3b8d88a 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -4325,6 +4325,17 @@ simplify_subreg_regno (unsigned int xregno, machine_mode xmode,
return (int) yregno;
}
+/* A wrapper around simplify_subreg_regno that uses subreg_lowpart_offset
+ (xmode, ymode) as the offset. */
+
+int
+lowpart_subreg_regno (unsigned int regno, machine_mode xmode,
+ machine_mode ymode)
+{
+ poly_uint64 offset = subreg_lowpart_offset (xmode, ymode);
+ return simplify_subreg_regno (regno, xmode, offset, ymode);
+}
+
/* Return the final regno that a subreg expression refers to. */
unsigned int
subreg_regno (const_rtx x)
diff --git a/gcc/target.def b/gcc/target.def
index 6b4226c..68a46aa 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,15 @@ DEFHOOK
rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
NULL)
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for a scratch register in @var{mode} to\n\
+be used when expanding memset calls. The backend can use a hard scratch\n\
+register to avoid stack realignment when expanding memset. The default\n\
+is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
/* Return a new value for loop unroll size. */
DEFHOOK
(loop_unroll_adjust,
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index b6dbcf7..007e79f 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -10,6 +10,6 @@ foo (void)
}
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index f41e614..1e50dc8 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -4,6 +4,6 @@
#include "pr100865-4a.c"
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */