author    Bin Cheng <bin.cheng@arm.com>    2014-09-05 03:45:57 +0000
committer Bin Cheng <amker@gcc.gnu.org>    2014-09-05 03:45:57 +0000
commit    ad4211596510c04caeee5404d7b066871114b2bb (patch)
tree      c2c034f7f79ce46f5c16ab4210ec8b47729aa000 /gcc/config
parent    6f22122491661053297b7fbc1a13d162d977e3bd (diff)
re PR target/55701 (Inline some instances of memset for ARM)
	PR target/55701
	* config/arm/arm.md (setmem): New pattern.
	* config/arm/arm-protos.h (struct tune_params): New fields.
	(arm_gen_setmem): New prototype.
	* config/arm/arm.c (arm_slowmul_tune): Initialize new fields.
	(arm_fastmul_tune, arm_strongarm_tune, arm_xscale_tune): Ditto.
	(arm_9e_tune, arm_v6t2_tune, arm_cortex_tune): Ditto.
	(arm_cortex_a8_tune, arm_cortex_a7_tune): Ditto.
	(arm_cortex_a15_tune, arm_cortex_a53_tune): Ditto.
	(arm_cortex_a57_tune, arm_cortex_a5_tune): Ditto.
	(arm_cortex_a9_tune, arm_cortex_a12_tune): Ditto.
	(arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune): Ditto.
	(arm_const_inline_cost): New function.
	(arm_block_set_max_insns): New function.
	(arm_block_set_non_vect_profit_p): New function.
	(arm_block_set_vect_profit_p): New function.
	(arm_block_set_unaligned_vect): New function.
	(arm_block_set_aligned_vect): New function.
	(arm_block_set_unaligned_non_vect): New function.
	(arm_block_set_aligned_non_vect): New function.
	(arm_block_set_vect, arm_gen_setmem): New functions.

	testsuite
	* gcc.target/arm/memset-inline-1.c: New test.
	* gcc.target/arm/memset-inline-2.c: New test.
	* gcc.target/arm/memset-inline-3.c: New test.
	* gcc.target/arm/memset-inline-4.c: New test.
	* gcc.target/arm/memset-inline-5.c: New test.
	* gcc.target/arm/memset-inline-6.c: New test.
	* gcc.target/arm/memset-inline-7.c: New test.
	* gcc.target/arm/memset-inline-8.c: New test.
	* gcc.target/arm/memset-inline-9.c: New test.
	* gcc.target/arm/memset-inline-10.c: New test.

	From-SVN: r214937
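For illustration, here is the kind of call the new setmemsi expander targets, in the spirit of the new gcc.target/arm/memset-inline-*.c tests. This sketch is not taken from the testsuite; the buffer and function names are made up, and whether Neon stores are actually emitted depends on -mcpu/-mfpu and on the string_ops_prefer_neon tuning flag.

/* Illustrative sketch only: small constant-length memset calls that the
   new setmem expansion may now inline instead of emitting a libcall.  */
#include <string.h>

char buf[64];

void
clear_words (void)
{
  memset (buf, 0, 8);   /* small case: plain str/strd stores */
}

void
clear_block (void)
{
  memset (buf, -1, 32); /* Neon-preferring tunings: vst1.8 stores */
}

Calls with a non-constant length or with more than 64 bytes still fall back to the memset libcall, since arm_gen_setmem rejects them before any inline expansion is attempted.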
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/arm/arm-protos.h |   5
-rw-r--r--  gcc/config/arm/arm.c        | 593
-rw-r--r--  gcc/config/arm/arm.md       |  14
3 files changed, 594 insertions(+), 18 deletions(-)
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index d3540c7..3bd7d1c 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -278,6 +278,10 @@ struct tune_params
/* Prefer 32-bit encoding instead of 16-bit encoding where subset of flags
would be set. */
bool disparage_partial_flag_setting_t16_encodings;
+ /* Prefer to inline string operations like memset by using Neon. */
+ bool string_ops_prefer_neon;
+ /* Maximum number of instructions to inline calls to memset. */
+ int max_insns_inline_memset;
};
extern const struct tune_params *current_tune;
@@ -290,6 +294,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
#endif /* RTX_CODE */
+extern bool arm_gen_setmem (rtx *);
extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 15c6341..88f91e6 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1698,7 +1698,9 @@ const struct tune_params arm_slowmul_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_fastmul_tune =
@@ -1715,7 +1717,9 @@ const struct tune_params arm_fastmul_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
/* StrongARM has early execution of branches, so a sequence that is worth
@@ -1735,7 +1739,9 @@ const struct tune_params arm_strongarm_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_xscale_tune =
@@ -1752,7 +1758,9 @@ const struct tune_params arm_xscale_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_9e_tune =
@@ -1769,7 +1777,9 @@ const struct tune_params arm_9e_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_v6t2_tune =
@@ -1786,7 +1796,9 @@ const struct tune_params arm_v6t2_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
@@ -1804,7 +1816,9 @@ const struct tune_params arm_cortex_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a8_tune =
@@ -1821,7 +1835,9 @@ const struct tune_params arm_cortex_a8_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ true, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a7_tune =
@@ -1838,7 +1854,9 @@ const struct tune_params arm_cortex_a7_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ true, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a15_tune =
@@ -1855,7 +1873,9 @@ const struct tune_params arm_cortex_a15_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- true, true /* Prefer 32-bit encodings. */
+ true, true, /* Prefer 32-bit encodings. */
+ true, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a53_tune =
@@ -1872,7 +1892,9 @@ const struct tune_params arm_cortex_a53_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a57_tune =
@@ -1889,7 +1911,9 @@ const struct tune_params arm_cortex_a57_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- true, true /* Prefer 32-bit encodings. */
+ true, true, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
@@ -1909,7 +1933,9 @@ const struct tune_params arm_cortex_a5_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ true, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a9_tune =
@@ -1926,7 +1952,9 @@ const struct tune_params arm_cortex_a9_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a12_tune =
@@ -1943,7 +1971,9 @@ const struct tune_params arm_cortex_a12_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ true, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
@@ -1967,7 +1997,9 @@ const struct tune_params arm_v7m_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -1986,7 +2018,9 @@ const struct tune_params arm_v6m_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_fa726te_tune =
@@ -2003,7 +2037,9 @@ const struct tune_params arm_fa726te_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
- false, false /* Prefer 32-bit encodings. */
+ false, false, /* Prefer 32-bit encodings. */
+ false, /* Prefer Neon for stringops. */
+ 8 /* Maximum insns to inline memset. */
};
@@ -16903,6 +16939,14 @@ arm_const_double_inline_cost (rtx val)
NULL_RTX, NULL_RTX, 0, 0));
}
+/* Cost of loading a SImode constant. */
+static inline int
+arm_const_inline_cost (enum rtx_code code, rtx val)
+{
+ return arm_gen_constant (code, SImode, NULL_RTX, INTVAL (val),
+ NULL_RTX, NULL_RTX, 1, 0);
+}
+
/* Return true if it is worthwhile to split a 64-bit constant into two
32-bit operations. This is the case if optimizing for size, or
if we have load delay slots, or if one 32-bit part can be done with
@@ -31587,6 +31631,519 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
}
+/* Maximum number of instructions to set block of memory. */
+static int
+arm_block_set_max_insns (void)
+{
+ if (optimize_function_for_size_p (cfun))
+ return 4;
+ else
+ return current_tune->max_insns_inline_memset;
+}
+
+/* Return TRUE if it's profitable to set block of memory for
+ non-vectorized case. VAL is the value to set the memory
+ with. LENGTH is the number of bytes to set. ALIGN is the
+ alignment of the destination memory in bytes. UNALIGNED_P
+ is TRUE if we can only set the memory with instructions
+ meeting alignment requirements. USE_STRD_P is TRUE if we
+ can use strd to set the memory. */
+static bool
+arm_block_set_non_vect_profit_p (rtx val,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT align,
+ bool unaligned_p, bool use_strd_p)
+{
+ int num = 0;
+ /* For leftovers in bytes of 0-7, we can set the memory block using
+ strb/strh/str with minimum instruction number. */
+ const int leftover[8] = {0, 1, 1, 2, 1, 2, 2, 3};
+
+ if (unaligned_p)
+ {
+ num = arm_const_inline_cost (SET, val);
+ num += length / align + length % align;
+ }
+ else if (use_strd_p)
+ {
+ num = arm_const_double_inline_cost (val);
+ num += (length >> 3) + leftover[length & 7];
+ }
+ else
+ {
+ num = arm_const_inline_cost (SET, val);
+ num += (length >> 2) + leftover[length & 3];
+ }
+
+ /* We may be able to combine last pair STRH/STRB into a single STR
+ by shifting one byte back. */
+ if (unaligned_access && length > 3 && (length & 3) == 3)
+ num--;
+
+ return (num <= arm_block_set_max_insns ());
+}
+
+/* Return TRUE if it's profitable to set block of memory for
+ vectorized case. LENGTH is the number of bytes to set.
+ ALIGN is the alignment of destination memory in bytes.
+ MODE is the vector mode used to set the memory. */
+static bool
+arm_block_set_vect_profit_p (unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT align,
+ enum machine_mode mode)
+{
+ int num;
+ bool unaligned_p = ((align & 3) != 0);
+ unsigned int nelt = GET_MODE_NUNITS (mode);
+
+ /* Instruction loading constant value. */
+ num = 1;
+ /* Instructions storing the memory. */
+ num += (length + nelt - 1) / nelt;
+ /* Instructions adjusting the address expression. Only need to
+ adjust address expression if it's 4 bytes aligned and bytes
+ leftover can only be stored by mis-aligned store instruction. */
+ if (!unaligned_p && (length & 3) != 0)
+ num++;
+
+ /* Store the first 16 bytes using vst1:v16qi for the aligned case. */
+ if (!unaligned_p && mode == V16QImode)
+ num--;
+
+ return (num <= arm_block_set_max_insns ());
+}
+
+/* Set a block of memory using vectorization instructions for the
+ unaligned case. We fill the first LENGTH bytes of the memory
+ area starting from DSTBASE with byte constant VALUE. ALIGN is
+ the alignment requirement of memory. Return TRUE if succeeded. */
+static bool
+arm_block_set_unaligned_vect (rtx dstbase,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT value,
+ unsigned HOST_WIDE_INT align)
+{
+ unsigned int i, j, nelt_v16, nelt_v8, nelt_mode;
+ rtx dst, mem;
+ rtx val_elt, val_vec, reg;
+ rtx rval[MAX_VECT_LEN];
+ rtx (*gen_func) (rtx, rtx);
+ enum machine_mode mode;
+ unsigned HOST_WIDE_INT v = value;
+
+ gcc_assert ((align & 0x3) != 0);
+ nelt_v8 = GET_MODE_NUNITS (V8QImode);
+ nelt_v16 = GET_MODE_NUNITS (V16QImode);
+ if (length >= nelt_v16)
+ {
+ mode = V16QImode;
+ gen_func = gen_movmisalignv16qi;
+ }
+ else
+ {
+ mode = V8QImode;
+ gen_func = gen_movmisalignv8qi;
+ }
+ nelt_mode = GET_MODE_NUNITS (mode);
+ gcc_assert (length >= nelt_mode);
+ /* Skip if it isn't profitable. */
+ if (!arm_block_set_vect_profit_p (length, align, mode))
+ return false;
+
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+ mem = adjust_automodify_address (dstbase, mode, dst, 0);
+
+ v = sext_hwi (v, BITS_PER_WORD);
+ val_elt = GEN_INT (v);
+ for (j = 0; j < nelt_mode; j++)
+ rval[j] = val_elt;
+
+ reg = gen_reg_rtx (mode);
+ val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval));
+ /* Emit instruction loading the constant value. */
+ emit_move_insn (reg, val_vec);
+
+ /* Handle nelt_mode bytes in a vector. */
+ for (i = 0; (i + nelt_mode <= length); i += nelt_mode)
+ {
+ emit_insn ((*gen_func) (mem, reg));
+ if (i + 2 * nelt_mode <= length)
+ emit_insn (gen_add2_insn (dst, GEN_INT (nelt_mode)));
+ }
+
+ /* If there are not less than nelt_v8 bytes leftover, we must be in
+ V16QI mode. */
+ gcc_assert ((i + nelt_v8) > length || mode == V16QImode);
+
+ /* Handle (8, 16) bytes leftover. */
+ if (i + nelt_v8 < length)
+ {
+ emit_insn (gen_add2_insn (dst, GEN_INT (length - i)));
+ /* We are shifting bytes back, set the alignment accordingly. */
+ if ((length & 1) != 0 && align >= 2)
+ set_mem_align (mem, BITS_PER_UNIT);
+
+ emit_insn (gen_movmisalignv16qi (mem, reg));
+ }
+ /* Handle (0, 8] bytes leftover. */
+ else if (i < length && i + nelt_v8 >= length)
+ {
+ if (mode == V16QImode)
+ {
+ reg = gen_lowpart (V8QImode, reg);
+ mem = adjust_automodify_address (dstbase, V8QImode, dst, 0);
+ }
+ emit_insn (gen_add2_insn (dst, GEN_INT ((length - i)
+ + (nelt_mode - nelt_v8))));
+ /* We are shifting bytes back, set the alignment accordingly. */
+ if ((length & 1) != 0 && align >= 2)
+ set_mem_align (mem, BITS_PER_UNIT);
+
+ emit_insn (gen_movmisalignv8qi (mem, reg));
+ }
+
+ return true;
+}
+
+/* Set a block of memory using vectorization instructions for the
+ aligned case. We fill the first LENGTH bytes of the memory area
+ starting from DSTBASE with byte constant VALUE. ALIGN is the
+ alignment requirement of memory. Return TRUE if succeeded. */
+static bool
+arm_block_set_aligned_vect (rtx dstbase,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT value,
+ unsigned HOST_WIDE_INT align)
+{
+ unsigned int i, j, nelt_v8, nelt_v16, nelt_mode;
+ rtx dst, addr, mem;
+ rtx val_elt, val_vec, reg;
+ rtx rval[MAX_VECT_LEN];
+ enum machine_mode mode;
+ unsigned HOST_WIDE_INT v = value;
+
+ gcc_assert ((align & 0x3) == 0);
+ nelt_v8 = GET_MODE_NUNITS (V8QImode);
+ nelt_v16 = GET_MODE_NUNITS (V16QImode);
+ if (length >= nelt_v16 && unaligned_access && !BYTES_BIG_ENDIAN)
+ mode = V16QImode;
+ else
+ mode = V8QImode;
+
+ nelt_mode = GET_MODE_NUNITS (mode);
+ gcc_assert (length >= nelt_mode);
+ /* Skip if it isn't profitable. */
+ if (!arm_block_set_vect_profit_p (length, align, mode))
+ return false;
+
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+
+ v = sext_hwi (v, BITS_PER_WORD);
+ val_elt = GEN_INT (v);
+ for (j = 0; j < nelt_mode; j++)
+ rval[j] = val_elt;
+
+ reg = gen_reg_rtx (mode);
+ val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval));
+ /* Emit instruction loading the constant value. */
+ emit_move_insn (reg, val_vec);
+
+ i = 0;
+ /* Handle first 16 bytes specially using vst1:v16qi instruction. */
+ if (mode == V16QImode)
+ {
+ mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ emit_insn (gen_movmisalignv16qi (mem, reg));
+ i += nelt_mode;
+ /* Handle (8, 16) bytes leftover using vst1:v16qi again. */
+ if (i + nelt_v8 < length && i + nelt_v16 > length)
+ {
+ emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
+ mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ /* We are shifting bytes back, set the alignment accordingly. */
+ if ((length & 0x3) == 0)
+ set_mem_align (mem, BITS_PER_UNIT * 4);
+ else if ((length & 0x1) == 0)
+ set_mem_align (mem, BITS_PER_UNIT * 2);
+ else
+ set_mem_align (mem, BITS_PER_UNIT);
+
+ emit_insn (gen_movmisalignv16qi (mem, reg));
+ return true;
+ }
+ /* Fall through for bytes leftover. */
+ mode = V8QImode;
+ nelt_mode = GET_MODE_NUNITS (mode);
+ reg = gen_lowpart (V8QImode, reg);
+ }
+
+ /* Handle 8 bytes in a vector. */
+ for (; (i + nelt_mode <= length); i += nelt_mode)
+ {
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, mode, addr, i);
+ emit_move_insn (mem, reg);
+ }
+
+ /* Handle single word leftover by shifting 4 bytes back. We can
+ use aligned access for this case. */
+ if (i + UNITS_PER_WORD == length)
+ {
+ addr = plus_constant (Pmode, dst, i - UNITS_PER_WORD);
+ mem = adjust_automodify_address (dstbase, mode,
+ addr, i - UNITS_PER_WORD);
+ /* We are shifting 4 bytes back, set the alignment accordingly. */
+ if (align > UNITS_PER_WORD)
+ set_mem_align (mem, BITS_PER_UNIT * UNITS_PER_WORD);
+
+ emit_move_insn (mem, reg);
+ }
+ /* Handle (0, 4), (4, 8) bytes leftover by shifting bytes back.
+ We have to use unaligned access for this case. */
+ else if (i < length)
+ {
+ emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
+ mem = adjust_automodify_address (dstbase, mode, dst, 0);
+ /* We are shifting bytes back, set the alignment accordingly. */
+ if ((length & 1) == 0)
+ set_mem_align (mem, BITS_PER_UNIT * 2);
+ else
+ set_mem_align (mem, BITS_PER_UNIT);
+
+ emit_insn (gen_movmisalignv8qi (mem, reg));
+ }
+
+ return true;
+}
+
+/* Set a block of memory using plain strh/strb instructions, only
+ using instructions allowed by ALIGN on processor. We fill the
+ first LENGTH bytes of the memory area starting from DSTBASE
+ with byte constant VALUE. ALIGN is the alignment requirement
+ of memory. */
+static bool
+arm_block_set_unaligned_non_vect (rtx dstbase,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT value,
+ unsigned HOST_WIDE_INT align)
+{
+ unsigned int i;
+ rtx dst, addr, mem;
+ rtx val_exp, val_reg, reg;
+ enum machine_mode mode;
+ HOST_WIDE_INT v = value;
+
+ gcc_assert (align == 1 || align == 2);
+
+ if (align == 2)
+ v |= (value << BITS_PER_UNIT);
+
+ v = sext_hwi (v, BITS_PER_WORD);
+ val_exp = GEN_INT (v);
+ /* Skip if it isn't profitable. */
+ if (!arm_block_set_non_vect_profit_p (val_exp, length,
+ align, true, false))
+ return false;
+
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+ mode = (align == 2 ? HImode : QImode);
+ val_reg = force_reg (SImode, val_exp);
+ reg = gen_lowpart (mode, val_reg);
+
+ for (i = 0; (i + GET_MODE_SIZE (mode) <= length); i += GET_MODE_SIZE (mode))
+ {
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, mode, addr, i);
+ emit_move_insn (mem, reg);
+ }
+
+ /* Handle single byte leftover. */
+ if (i + 1 == length)
+ {
+ reg = gen_lowpart (QImode, val_reg);
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, QImode, addr, i);
+ emit_move_insn (mem, reg);
+ i++;
+ }
+
+ gcc_assert (i == length);
+ return true;
+}
+
+/* Set a block of memory using plain strd/str/strh/strb instructions,
+ to permit unaligned copies on processors which support unaligned
+ semantics for those instructions. We fill the first LENGTH bytes
+ of the memory area starting from DSTBASE with byte constant VALUE.
+ ALIGN is the alignment requirement of memory. */
+static bool
+arm_block_set_aligned_non_vect (rtx dstbase,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT value,
+ unsigned HOST_WIDE_INT align)
+{
+ unsigned int i;
+ rtx dst, addr, mem;
+ rtx val_exp, val_reg, reg;
+ unsigned HOST_WIDE_INT v;
+ bool use_strd_p;
+
+ use_strd_p = (length >= 2 * UNITS_PER_WORD && (align & 3) == 0
+ && TARGET_LDRD && current_tune->prefer_ldrd_strd);
+
+ v = (value | (value << 8) | (value << 16) | (value << 24));
+ if (length < UNITS_PER_WORD)
+ v &= (0xFFFFFFFF >> (UNITS_PER_WORD - length) * BITS_PER_UNIT);
+
+ if (use_strd_p)
+ v |= (v << BITS_PER_WORD);
+ else
+ v = sext_hwi (v, BITS_PER_WORD);
+
+ val_exp = GEN_INT (v);
+ /* Skip if it isn't profitable. */
+ if (!arm_block_set_non_vect_profit_p (val_exp, length,
+ align, false, use_strd_p))
+ {
+ if (!use_strd_p)
+ return false;
+
+ /* Try without strd. */
+ v = (v >> BITS_PER_WORD);
+ v = sext_hwi (v, BITS_PER_WORD);
+ val_exp = GEN_INT (v);
+ use_strd_p = false;
+ if (!arm_block_set_non_vect_profit_p (val_exp, length,
+ align, false, use_strd_p))
+ return false;
+ }
+
+ i = 0;
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+ /* Handle double words using strd if possible. */
+ if (use_strd_p)
+ {
+ val_reg = force_reg (DImode, val_exp);
+ reg = val_reg;
+ for (; (i + 8 <= length); i += 8)
+ {
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, DImode, addr, i);
+ emit_move_insn (mem, reg);
+ }
+ }
+ else
+ val_reg = force_reg (SImode, val_exp);
+
+ /* Handle words. */
+ reg = (use_strd_p ? gen_lowpart (SImode, val_reg) : val_reg);
+ for (; (i + 4 <= length); i += 4)
+ {
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, SImode, addr, i);
+ if ((align & 3) == 0)
+ emit_move_insn (mem, reg);
+ else
+ emit_insn (gen_unaligned_storesi (mem, reg));
+ }
+
+ /* Merge last pair of STRH and STRB into a STR if possible. */
+ if (unaligned_access && i > 0 && (i + 3) == length)
+ {
+ addr = plus_constant (Pmode, dst, i - 1);
+ mem = adjust_automodify_address (dstbase, SImode, addr, i - 1);
+ /* We are shifting one byte back, set the alignment accordingly. */
+ if ((align & 1) == 0)
+ set_mem_align (mem, BITS_PER_UNIT);
+
+ /* Most likely this is an unaligned access, and we can't tell at
+ compilation time. */
+ emit_insn (gen_unaligned_storesi (mem, reg));
+ return true;
+ }
+
+ /* Handle half word leftover. */
+ if (i + 2 <= length)
+ {
+ reg = gen_lowpart (HImode, val_reg);
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, HImode, addr, i);
+ if ((align & 1) == 0)
+ emit_move_insn (mem, reg);
+ else
+ emit_insn (gen_unaligned_storehi (mem, reg));
+
+ i += 2;
+ }
+
+ /* Handle single byte leftover. */
+ if (i + 1 == length)
+ {
+ reg = gen_lowpart (QImode, val_reg);
+ addr = plus_constant (Pmode, dst, i);
+ mem = adjust_automodify_address (dstbase, QImode, addr, i);
+ emit_move_insn (mem, reg);
+ }
+
+ return true;
+}
+
+/* Set a block of memory using vectorization instructions for both
+ aligned and unaligned cases. We fill the first LENGTH bytes of
+ the memory area starting from DSTBASE with byte constant VALUE.
+ ALIGN is the alignment requirement of memory. */
+static bool
+arm_block_set_vect (rtx dstbase,
+ unsigned HOST_WIDE_INT length,
+ unsigned HOST_WIDE_INT value,
+ unsigned HOST_WIDE_INT align)
+{
+ /* Check whether we need to use unaligned store instruction. */
+ if (((align & 3) != 0 || (length & 3) != 0)
+ /* Check whether unaligned store instruction is available. */
+ && (!unaligned_access || BYTES_BIG_ENDIAN))
+ return false;
+
+ if ((align & 3) == 0)
+ return arm_block_set_aligned_vect (dstbase, length, value, align);
+ else
+ return arm_block_set_unaligned_vect (dstbase, length, value, align);
+}
+
+/* Expand string store operation. Firstly we try to do that by using
+ vectorization instructions, then try with ARM unaligned access and
+ double-word store if profitable. OPERANDS[0] is the destination,
+ OPERANDS[1] is the number of bytes, operands[2] is the value to
+ initialize the memory, OPERANDS[3] is the known alignment of the
+ destination. */
+bool
+arm_gen_setmem (rtx *operands)
+{
+ rtx dstbase = operands[0];
+ unsigned HOST_WIDE_INT length;
+ unsigned HOST_WIDE_INT value;
+ unsigned HOST_WIDE_INT align;
+
+ if (!CONST_INT_P (operands[2]) || !CONST_INT_P (operands[1]))
+ return false;
+
+ length = UINTVAL (operands[1]);
+ if (length > 64)
+ return false;
+
+ value = (UINTVAL (operands[2]) & 0xFF);
+ align = UINTVAL (operands[3]);
+ if (TARGET_NEON && length >= 8
+ && current_tune->string_ops_prefer_neon
+ && arm_block_set_vect (dstbase, length, value, align))
+ return true;
+
+ if (!unaligned_access && (align & 3) != 0)
+ return arm_block_set_unaligned_non_vect (dstbase, length, value, align);
+
+ return arm_block_set_aligned_non_vect (dstbase, length, value, align);
+}
+
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
static unsigned HOST_WIDE_INT
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index f394855..0e43dd2 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -6716,6 +6716,20 @@
})
+(define_expand "setmemsi"
+ [(match_operand:BLK 0 "general_operand" "")
+ (match_operand:SI 1 "const_int_operand" "")
+ (match_operand:SI 2 "const_int_operand" "")
+ (match_operand:SI 3 "const_int_operand" "")]
+ "TARGET_32BIT"
+{
+ if (arm_gen_setmem (operands))
+ DONE;
+
+ FAIL;
+})
+
+
;; Move a block of memory if it is word aligned and MORE than 2 words long.
;; We could let this apply for blocks of less than this, but it clobbers so
;; many registers that there is then probably a better way.