path: root/gcc
author    Jan Hubicka <jh@suse.cz>  2000-02-03 15:10:02 +0100
committer Jan Hubicka <hubicka@gcc.gnu.org>  2000-02-03 14:10:02 +0000
commit    79f05c19ca1ee164031f0d3926bc5074c499da10 (patch)
tree      e836a7b6ad85ed399075edd002c6b38840018694 /gcc
parent    31a72d3f3e81140f66f4c140114147de5b913398 (diff)
i386.md (movstrsi, clrstrsi): Support variable sized copies, align destination when needed.
* i386.md (movstrsi, clrstrsi): Support variable sized copies, align
destination when needed.
(strmovsi, strsetsi): New expanders.
(strmovsi_1, strsetsi_1): New patterns.
* i386.h (MASK_NO_ALIGN_STROPS, MASK_INLINE_ALL_STROPS,
TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros.
(TARGET_SWITCHES): Add align-stringops and inline-all-stringops.
* invoke.texi (align-stringops, inline-all-stringops): Document.

From-SVN: r31773
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog11
-rw-r--r--gcc/config/i386/i386.h13
-rw-r--r--gcc/config/i386/i386.md360
-rw-r--r--gcc/invoke.texi15
4 files changed, 384 insertions, 15 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5d0f642..8bb5dd6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+Thu Feb 3 15:08:13 MET 2000 Jan Hubicka <jh@suse.cz>
+
+ * i386.md (movstrsi, clrstrsi): Support variable sized copies, align
+ destination when needed.
+	(strmovsi, strsetsi): New expanders.
+	(strmovsi_1, strsetsi_1): New patterns.
+	* i386.h (MASK_NO_ALIGN_STROPS, MASK_INLINE_ALL_STROPS,
+	TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros.
+	(TARGET_SWITCHES): Add align-stringops and inline-all-stringops.
+ * invoke.texi (align-stringops, inline-all-stringops): Document.
+
Wed Feb 2 23:04:47 2000 Krister Walfridsson <cato@df.lth.se>
* i386/netbsd.h (INT_ASM_OP): Define.
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 1302b65..8c33c66 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -101,6 +101,8 @@ extern int target_flags;
#define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */
#define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */
#define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */
+#define MASK_NO_ALIGN_STROPS 0x00001000 /* Disable aligning of string ops. */
+#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
/* Temporary codegen switches */
#define MASK_INTEL_SYNTAX 0x00000200
@@ -190,6 +192,9 @@ extern const int x86_promote_QImode, x86_single_stringop;
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
+#define TARGET_ALIGN_STRINGOPS (!(target_flags & MASK_NO_ALIGN_STROPS))
+#define TARGET_INLINE_ALL_STRINGOPS (target_flags & MASK_INLINE_ALL_STROPS)
+
#define ASSEMBLER_DIALECT ((target_flags & MASK_INTEL_SYNTAX) != 0)
#define TARGET_SWITCHES \
@@ -238,6 +243,14 @@ extern const int x86_promote_QImode, x86_single_stringop;
{ "intel-syntax", MASK_INTEL_SYNTAX, \
"Emit Intel syntax assembler opcodes" }, \
{ "no-intel-syntax", -MASK_INTEL_SYNTAX, "" }, \
+ { "align-stringops", -MASK_NO_ALIGN_STROPS, \
+ "Align destination of the string operations" }, \
+ { "no-align-stringops", MASK_NO_ALIGN_STROPS, \
+ "Do not align destination of the string operations" }, \
+ { "inline-all-strinops", MASK_INLINE_ALL_STROPS, \
+ "Inline all known string operations" }, \
+ { "no-inline-all-stringops", -MASK_INLINE_ALL_STROPS, \
+ "Do not inline all known string operations" }, \
SUBTARGET_SWITCHES \
{ "", TARGET_DEFAULT, 0 }}
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ce2ac95..c5454d7 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7838,49 +7838,208 @@
(define_expand "movstrsi"
[(use (match_operand:BLK 0 "memory_operand" ""))
(use (match_operand:BLK 1 "memory_operand" ""))
- (use (match_operand:SI 2 "const_int_operand" ""))
+ (use (match_operand:SI 2 "nonmemory_operand" ""))
(use (match_operand:SI 3 "const_int_operand" ""))]
""
"
{
rtx srcreg, destreg, countreg;
+ int align = 0;
+ int count = -1;
- if (GET_CODE (operands[2]) != CONST_INT)
- FAIL;
+ if (GET_CODE (operands[3]) == CONST_INT)
+ align = INTVAL (operands[3]);
+
+  /* This simple hack avoids all inlining code and simplifies the code below. */
+ if (!TARGET_ALIGN_STRINGOPS)
+ align = 32;
+
+ if (GET_CODE (operands[2]) == CONST_INT)
+ count = INTVAL (operands[2]);
destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
+
/* When optimizing for size emit simple rep ; movsb instruction for
counts not divisible by 4. */
- if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
+
+ if ((!optimize || optimize_size)
+ && (count < 0 || (count & 0x03)))
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
destreg, srcreg, countreg));
}
- else
+
+  /* For constant-sized aligned (or small unaligned) copies use rep movsl
+     followed by code copying the rest.  For PentiumPro ensure 8 byte
+     alignment to allow rep movsl acceleration.  */
+
+ else if (count >= 0
+ && (align >= 8
+ || (!TARGET_PENTIUMPRO && align >= 4)
+ || optimize_size || count < 64))
{
- if (INTVAL (operands[2]) & ~0x03)
+ if (count & ~0x03)
{
countreg = copy_to_mode_reg (SImode,
- GEN_INT ((INTVAL (operands[2]) >> 2)
+ GEN_INT ((count >> 2)
& 0x3fffffff));
emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
destreg, srcreg, countreg));
}
- if (INTVAL (operands[2]) & 0x02)
+ if (count & 0x02)
emit_insn (gen_strmovhi (destreg, srcreg));
- if (INTVAL (operands[2]) & 0x01)
+ if (count & 0x01)
emit_insn (gen_strmovqi (destreg, srcreg));
}
+  /* The generic code, based on the glibc implementation:
+     - align destination to 4 bytes (8 byte alignment is used for PentiumPro,
+       allowing accelerated copying there)
+     - copy the data using rep movsl
+     - copy the rest.  */
+ else
+ {
+ rtx countreg2;
+ rtx label = NULL;
+
+      /* In case we don't know anything about the alignment, default to
+	 the library version, since it is usually equally fast and results
+	 in shorter code.  */
+ if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ FAIL;
+
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+
+ countreg2 = gen_reg_rtx (SImode);
+ countreg = copy_to_mode_reg (SImode, operands[2]);
+
+      /* We don't use loops to align the destination or to copy parts
+	 smaller than 4 bytes, because gcc is able to optimize such code
+	 better (when the destination or the count really is aligned, gcc
+	 is often able to predict the branches) and it is friendlier to
+	 hardware branch prediction.
+
+	 Using loops would be beneficial for the generic case, because we
+	 could handle small counts with the loops; many CPUs (such as the
+	 Athlon) have large REP prefix setup costs.
+
+	 Not doing so is quite costly on such CPUs.  Maybe we can revisit
+	 this decision later or add some customizability to this code.  */
+
+ if (count < 0
+ && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ LEU, 0, SImode, 1, 0, label);
+ }
+ if (align <= 1)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovqi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovsi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+
+ if (!TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+ emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+ emit_insn (gen_rep_movsi (destreg, srcreg, countreg2,
+ destreg, srcreg, countreg2));
+
+ if (label)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 2 && count > 0 && (count & 2))
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ if (align <= 2 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 1 && count > 0 && (count & 1))
+	emit_insn (gen_strmovqi (destreg, srcreg));
+ if (align <= 1 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovqi (destreg, srcreg));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
DONE;
}")
;; Most CPUs don't like single string operations
;; Handle this case here to simplify previous expander.
+(define_expand "strmovsi"
+ [(set (match_dup 2)
+ (mem:SI (match_operand:SI 1 "register_operand" "")))
+ (set (mem:SI (match_operand:SI 0 "register_operand" ""))
+ (match_dup 2))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ (clobber (reg:CC 17))])
+ (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0],
+ operands[1]));
+ DONE;
+ }
+ else
+ operands[2] = gen_reg_rtx (SImode);
+}")
+
(define_expand "strmovhi"
[(set (match_dup 2)
(mem:HI (match_operand:SI 1 "register_operand" "")))
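
Read as C, the generic path of the movstrsi expander above emits roughly the following sequence for an unknown count: skip the prologue for counts of at most 3, align the destination to 4 bytes with at most one byte and one halfword move, copy the bulk with rep movsl, then finish the tail. This is only an illustrative sketch under those assumptions (the helper name is invented, and the PentiumPro 8-byte alignment step is omitted):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void
inline_copy_sketch (char *dst, const char *src, size_t count)
{
  size_t words;

  if (count > 3)                    /* the LEU 3 guard before the prologue */
    {
      if ((uintptr_t) dst & 1)      /* strmovqi: align destination to 2 */
        { *dst++ = *src++; count--; }
      if ((uintptr_t) dst & 2)      /* strmovhi: align destination to 4 */
        { memcpy (dst, src, 2); dst += 2; src += 2; count -= 2; }
      for (words = count >> 2; words; words--)   /* stands in for rep movsl */
        { memcpy (dst, src, 4); dst += 4; src += 4; }
    }
  if (count & 2)                    /* 2-byte tail (strmovhi) */
    { memcpy (dst, src, 2); dst += 2; src += 2; }
  if (count & 1)                    /* 1-byte tail (strmovqi) */
    *dst = *src;
}

When the count is a known constant, the expander skips the runtime tests and emits only the moves that the low bits of the count require.
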
@@ -7925,6 +8084,21 @@
operands[2] = gen_reg_rtx (QImode);
}")
+(define_insn "strmovsi_1"
+ [(set (mem:SI (match_operand:SI 2 "register_operand" "0"))
+ (mem:SI (match_operand:SI 3 "register_operand" "1")))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 4)))
+ (set (match_operand:SI 1 "register_operand" "=S")
+ (plus:SI (match_dup 1)
+ (const_int 4)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "movsl"
+ [(set_attr "type" "str")
+ (set_attr "memory" "both")])
+
(define_insn "strmovhi_1"
[(set (mem:HI (match_operand:SI 2 "register_operand" "0"))
(mem:HI (match_operand:SI 3 "register_operand" "1")))
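
The strmovsi_1 pattern above encodes the three effects of a single movsl with the direction flag clear: store the word at the source pointer to the destination pointer, then advance both pointers by 4 (register 19 in the (use ...) clause is this port's direction-flag register, which the cld above clears). A small C illustration of one such step (function name invented):

#include <stdio.h>
#include <string.h>

/* One "movsl" step, direction flag clear: 4 bytes move from *esi to
   *edi and both pointers advance.  */
static void
movsl_step (unsigned char **edi, const unsigned char **esi)
{
  memcpy (*edi, *esi, 4);
  *edi += 4;
  *esi += 4;
}

int
main (void)
{
  unsigned char src[8] = "abcdefg", dst[8] = { 0 };
  unsigned char *d = dst;
  const unsigned char *s = src;

  movsl_step (&d, &s);   /* copies "abcd" */
  movsl_step (&d, &s);   /* copies "efg" plus the terminating NUL */
  printf ("%s\n", (char *) dst);
  return 0;
}
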
@@ -7996,15 +8170,26 @@
(define_expand "clrstrsi"
[(use (match_operand:BLK 0 "memory_operand" ""))
- (use (match_operand:SI 1 "const_int_operand" ""))
+ (use (match_operand:SI 1 "nonmemory_operand" ""))
(use (match_operand:SI 2 "const_int_operand" ""))]
""
"
{
+  /* See comments in the movstrsi expander above.  The code is mostly identical.  */
+
rtx destreg, zeroreg, countreg;
+ int align = 0;
+ int count = -1;
- if (GET_CODE (operands[1]) != CONST_INT)
- FAIL;
+ if (GET_CODE (operands[2]) == CONST_INT)
+ align = INTVAL (operands[2]);
+
+  /* This simple hack avoids all inlining code and simplifies the code below. */
+ if (!TARGET_ALIGN_STRINGOPS)
+ align = 32;
+
+ if (GET_CODE (operands[1]) == CONST_INT)
+ count = INTVAL (operands[1]);
destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
@@ -8012,14 +8197,19 @@
 /* When optimizing for size emit simple rep ; stosb instruction for
 counts not divisible by 4. */
- if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
+
+ if ((!optimize || optimize_size)
+ && (count < 0 || (count & 0x03)))
{
countreg = copy_to_mode_reg (SImode, operands[1]);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
destreg, countreg));
}
- else
+ else if (count >= 0
+ && (align >= 8
+ || (!TARGET_PENTIUMPRO && align >= 4)
+ || optimize_size || count < 64))
{
zeroreg = copy_to_mode_reg (SImode, const0_rtx);
if (INTVAL (operands[1]) & ~0x03)
@@ -8037,12 +8227,133 @@
emit_insn (gen_strsetqi (destreg,
gen_rtx_SUBREG (QImode, zeroreg, 0)));
}
+ else
+ {
+ rtx countreg2;
+ rtx label = NULL;
+
+      /* In case we don't know anything about the alignment, default to
+	 the library version, since it is usually equally fast and results
+	 in shorter code.  */
+ if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ FAIL;
+
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+
+ countreg2 = gen_reg_rtx (SImode);
+ countreg = copy_to_mode_reg (SImode, operands[1]);
+ zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+
+ if (count < 0
+ && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ LEU, 0, SImode, 1, 0, label);
+ }
+ if (align <= 1)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+	emit_insn (gen_strsetsi (destreg, zeroreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+
+ if (!TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+ emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+ emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
+ destreg, countreg2));
+
+ if (label)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 2 && count > 0 && (count & 2))
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ if (align <= 2 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 1 && count > 0 && (count & 1))
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ if (align <= 1 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
DONE;
}")
;; Most CPUs don't like single string operations
;; Handle this case here to simplify previous expander.
+(define_expand "strsetsi"
+ [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
+ (match_operand:SI 1 "register_operand" ""))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1]));
+ DONE;
+ }
+}")
+
(define_expand "strsethi"
[(set (mem:HI (match_operand:SI 0 "register_operand" ""))
(match_operand:HI 1 "register_operand" ""))
@@ -8073,6 +8384,18 @@
}
}")
+(define_insn "strsetsi_1"
+ [(set (mem:SI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:SI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 4)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosl"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")])
+
(define_insn "strsethi_1"
[(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
(match_operand:HI 2 "register_operand" "a"))
@@ -8252,6 +8575,14 @@
{
rtx out, addr, eoschar, align, scratch1, scratch2, scratch3;
+  /* The generic case of the strlen expander is long.  Avoid its
+     expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
+
+  if (TARGET_UNROLL_STRLEN && operands[2] == const0_rtx && optimize > 1
+      && !TARGET_INLINE_ALL_STRINGOPS && !optimize_size
+      && (GET_CODE (operands[3]) != CONST_INT || INTVAL (operands[3]) < 4))
+    FAIL;
+
out = operands[0];
addr = force_reg (Pmode, XEXP (operands[1], 0));
eoschar = operands[2];
@@ -8271,6 +8602,7 @@
if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)
emit_move_insn (scratch1, addr);
+
emit_move_insn (out, addr);
ix86_expand_strlensi_unroll_1 (out, align, scratch1);
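
The clrstrsi generic path mirrors the copy sketch shown after the movstrsi hunk: align the destination with at most one byte and one halfword store, clear the bulk with rep stosl, then handle the 0-3 byte tail. A condensed, purely illustrative C version (helper name invented; the PentiumPro 8-byte step is again omitted):

#include <stddef.h>
#include <stdint.h>

void
inline_clear_sketch (char *dst, size_t count)
{
  size_t words;

  if (count > 3)
    {
      if ((uintptr_t) dst & 1)                   /* strsetqi */
        { *dst++ = 0; count--; }
      if ((uintptr_t) dst & 2)                   /* strsethi */
        { dst[0] = dst[1] = 0; dst += 2; count -= 2; }
      for (words = count >> 2; words; words--)   /* stands in for rep stosl */
        { dst[0] = dst[1] = dst[2] = dst[3] = 0; dst += 4; }
    }
  if (count & 2) { dst[0] = dst[1] = 0; dst += 2; }
  if (count & 1) *dst = 0;
}
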
diff --git a/gcc/invoke.texi b/gcc/invoke.texi
index 549ece1..f09ef55 100644
--- a/gcc/invoke.texi
+++ b/gcc/invoke.texi
@@ -360,7 +360,7 @@ in the following sections.
-mreg-alloc=@var{list} -mregparm=@var{num}
-malign-jumps=@var{num} -malign-loops=@var{num}
-malign-functions=@var{num} -mpreferred-stack-boundary=@var{num}
--mthreads
+-mthreads -mno-align-stringops -minline-all-stringops
@emph{HPPA Options}
-march=@var{architecture type}
@@ -5954,6 +5954,19 @@ on thread-safe exception handling must compile and link all code with the
@samp{-mthreads} option. When compiling, @samp{-mthreads} defines
@samp{-D_MT}; when linking, it links in a special thread helper library
@samp{-lmingwthrd} which cleans up per thread exception handling data.
+
+@item -mno-align-stringops
+@kindex -mno-align-stringops
+Do not align the destination of inlined string operations.  This switch
+reduces code size and improves performance when the destination is already
+aligned but GCC cannot prove it.
+
+@item -minline-all-stringops
+@kindex -minline-all-stringops
+By default GCC inlines string operations only when the destination is known
+to be aligned to at least a 4 byte boundary.  This option enables more
+inlining and increases code size, but may improve the performance of code
+that depends on fast memcpy, strlen and memset for short lengths.
@end table
@node HPPA Options
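
To observe the new switches, one can compare the assembly GCC emits for fixed- and variable-size operations. A possible test file (flag spellings are those added by this patch; the exact instruction sequences depend on compiler version and tuning):

/* stroptest.c -- exercise the inline string-op expansion.  Try e.g.
     gcc -O2 -S stroptest.c
     gcc -O2 -minline-all-stringops -S stroptest.c
     gcc -O2 -mno-align-stringops -S stroptest.c
   and compare the rep movsl / rep stosl / movsb sequences in the
   generated stroptest.s.  */
#include <string.h>

void
copy_var (char *dst, const char *src, unsigned n)
{
  memcpy (dst, src, n);    /* variable count: the new movstrsi path */
}

void
clear_var (char *dst, unsigned n)
{
  memset (dst, 0, n);      /* variable count: the new clrstrsi path */
}

void
copy_13 (char *dst, const char *src)
{
  memcpy (dst, src, 13);   /* constant count: rep movsl plus tail moves */
}
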